|
- // This is necessary to prevent Unity from deciding that our default config logic is actually an include guard declaration
- #ifndef STP_UNITY_INCLUDE_GUARD
- #define STP_UNITY_INCLUDE_GUARD
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- //
- //
- // SPATIAL TEMPORAL POST [STP] v1.0
- //
- //
- //==============================================================================================================================
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // C/C++/GLSL/HLSL PORTABILITY BASED ON AMD's 'ffx_a.h'.
- // INCLUDING ASSOCIATED LICENSE BELOW
- //------------------------------------------------------------------------------------------------------------------------------
- // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
- // Permission is hereby granted, free of charge, to any person obtaining a copy
- // of this software and associated documentation files(the "Software"), to deal
- // in the Software without restriction, including without limitation the rights
- // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
- // copies of the Software, and to permit persons to whom the Software is
- // furnished to do so, subject to the following conditions :
- // The above copyright notice and this permission notice shall be included in
- // all copies or substantial portions of the Software.
- // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
- // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- // THE SOFTWARE.
- //==============================================================================================================================
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // NOTES
- //------------------------------------------------------------------------------------------------------------------------------
- // PLATFORM SPECIFIC WORKAROUNDS
- // =============================
- // - These all default to not enabled {0}, define to {1} to enable.
- // - define STP_BUG_ALIAS16 1 .... Define to enable workaround for asuint16()/asfloat16().
- // - define STP_BUG_PRX 1 ........ Define to disable approximate transcendentals.
- // - define STP_BUG_SAT_INF 1 .... Define to workaround platforms with broken 16-bit saturate +/- INF.
- // - define STP_BUG_SAT 1 ........ Define to workaround compiler incorrectly factoring out inner saturate in 16-bit code.
- //------------------------------------------------------------------------------------------------------------------------------
- // CONFIGURATIONS
- // ==============
- // - INDEPENDENT OPTIONS
- // - define STP_32BIT {0 := disable, 1 := compile the 32-bit version or implicit precision version}
- // - define STP_MEDIUM {0 := disable, 1 := enable the implicit medium precision version for 32-bit}
- // - define STP_16BIT {0 := disable, 1 := compile the explicit 16-bit version}
- // - NOTE: the portability section below tests STP_MEDIUM and STP_16BIT with defined(), so there defining one to {0} still enables it.
- // -----
- // - define STP_GPU {to include shader code}
- // - define STP_GLSL {to include the GLSL version of the code}
- // - define STP_HLSL {to include the HLSL version of the code}
- // -----
- // - define STP_DIL {to include the StpDil<H,F>() entry points}
- // - define STP_PAT {to include the StpPat<H,F>() entry points}
- // - define STP_SAA {to include the StpSaa<H,F>() entry points}
- // - define STP_TAA {to include the StpTaa<H,F>() entry points}
- // -----
- // - define STP_POSTMAP {running STP, 0 := before, 1 := after, application tonemapping}
- //------------------------------------------------------------------------------------------------------------------------------
- // IMPORTANT
- // =========
- // - All callbacks should explicitly sample from MIP level 0.
- // - Meaning if used in a pixel shader do not allow implicit LOD calculation.
- // - The algorithm is tuned for pre-tonemap operation, post-tonemap wasn't tested yet.
- //==============================================================================================================================
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // EXTERNAL OPTIONS
- //==============================================================================================================================
- // Enable {1} or default disable any debug functionality {0}.
- // (As for every option in this section: a value defined before this point is kept, otherwise the default below is used.)
- #ifndef STP_BUG
- #define STP_BUG 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to test a pass-through dummy shader that fetches all resources but does no logic.
- // Defaults to {0}.
- #ifndef STP_BUG_BW_SOL
- #define STP_BUG_BW_SOL 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to {1} to use the max/min sampling permutation for color values.
- // Defaults to {0}.
- #ifndef STP_MAX_MIN_10BIT
- #define STP_MAX_MIN_10BIT 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to {1} to use the max/min sampling permutation for UINT32 values.
- // Defaults to {0}.
- #ifndef STP_MAX_MIN_UINT
- #define STP_MAX_MIN_UINT 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to {1} to use sampling with offsets.
- // Defaults to {0}.
- #ifndef STP_OFFSETS
- #define STP_OFFSETS 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // STP is currently only tested to run pre-tonemap, as that is what Unity is using.
- // Run 0 := pre-tonemap, 1 := post-tonemap.
- // Defaults to {0} (pre-tonemap).
- #ifndef STP_POSTMAP
- #define STP_POSTMAP 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // STP TAA quality level {0 to 1}
- // Defaults to {1}.
- #ifndef STP_TAA_Q
- #define STP_TAA_Q 1
- #endif
- //==============================================================================================================================
- // PLATFORM SPECIFIC BUG WORKAROUNDS
- // =================================
- // Define to {1} to disable usage of transcendental approximations using float/int aliasing.
- // Defaults to {0} unless already defined by the including file.
- #ifndef STP_BUG_PRX
- #define STP_BUG_PRX 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to {1} for workaround if platform cannot use saturate of +/- INF correctly.
- // Defaults to {0} unless already defined by the including file.
- #ifndef STP_BUG_SAT_INF
- #define STP_BUG_SAT_INF 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to {1} for workaround for compiler incorrectly factoring out inner saturate in 16-bit code.
- // Defaults to {0} unless already defined by the including file.
- #ifndef STP_BUG_SAT
- #define STP_BUG_SAT 0
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Define to {1} for workarounds for broken asuint16()/asfloat16().
- // Defaults to {0} unless already defined by the including file.
- #ifndef STP_BUG_ALIAS16
- #define STP_BUG_ALIAS16 0
- #endif
- // Force the STP_BUG_PRX workaround on only when the aliasing workaround is actually enabled.
- // Doing this in the "not defined" path would switch STP_BUG_PRX to {1} by default and discard a user supplied value,
- // which contradicts the documented default of {0} for all platform workarounds.
- #if STP_BUG_ALIAS16
- #undef STP_BUG_PRX
- #define STP_BUG_PRX 1
- #endif
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // C/C++/GLSL/HLSL PORTABILITY
- //==============================================================================================================================
- #if defined(STP_CPU)
- // No-alias pointer qualifier used by the CPU pointer-parameter macros in this section.
- // Defaults to '__restrict'; define STP_RESTRICT before this point to substitute another spelling.
- #ifndef STP_RESTRICT
- #define STP_RESTRICT __restrict
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Specifier placed in front of the CPU helper functions in this section.
- // Defaults to 'static'; define STP_STATIC before this point to substitute another specifier.
- #ifndef STP_STATIC
- #define STP_STATIC static
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // CPU scalar types: B1 = unsigned char, W1 = unsigned short, F1 = float, U1 = 32-bit unsigned integer.
- // NOTE(review): 'uint32_t' needs <stdint.h>; no #include is visible in this section, presumably the includer provides it.
- typedef unsigned char StpB1;
- typedef unsigned short StpW1;
- typedef float StpF1;
- typedef uint32_t StpU1;
- // Value casts to the CPU scalar types (same spelling as the GPU 'Stp*_()' helpers).
- #define StpF1_(a) ((StpF1)(a))
- #define StpU1_(a) ((StpU1)(a))
- // Return the bit pattern of float 'a' as an unsigned integer (reinterpretation through a union, no value conversion).
- STP_STATIC StpU1 StpU1_F1(StpF1 a) {
-     union { StpF1 asFloat; StpU1 asBits; } pun;
-     pun.asFloat = a;
-     return pun.asBits; }
- // Parameter type for a writable, non-aliased pointer to floats (name suggests a 2-float output).
- #define StpOutF2 StpF1 *STP_RESTRICT
- // Base-2 exponential; maps to exp2f().
- // NOTE(review): exp2f() needs <math.h>; no #include is visible in this section, presumably the includer provides it.
- #define StpExp2F1(x) exp2f(x)
- // Return 'a' when it compares greater than 'b', otherwise 'b'.
- STP_STATIC StpF1 StpMaxF1(StpF1 a, StpF1 b) {
-     if (a > b) { return a; }
-     return b; }
- //------------------------------------------------------------------------------------------------------------------------------
- // Convert float to half (in lower 16-bits of output).
- // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
- // Supports denormals.
- // Conversion rules are to make computations possibly "safer" on the GPU,
- // -INF & -NaN -> -65504
- // +INF & +NaN -> +65504
- // Both tables are indexed by the top 9 bits of the float (sign bit + 8 exponent bits):
- //  base[i] .... Half bit pattern contributed by that sign/exponent (0x7bff/0xfbff = +/-65504 once out of half range).
- //  shift[i] ... Right shift applied to the 23-bit float mantissa before it is added to 'base[i]'.
- // Mantissa bits shifted out are dropped (truncation).
- // The tables are only ever read, so they are declared 'static const'.
- STP_STATIC StpU1 StpU1_H1_F1(StpF1 f) {
- static const StpW1 base[512] = {
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
- 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
- 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
- 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
- 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
- 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
- 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff };
- static const StpB1 shift[512] = {
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
- 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
- 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
- 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
- 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
- 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18 };
- // Reinterpret the float as its 32-bit pattern.
- union { StpF1 f; StpU1 u; } bits;
- bits.f = f;
- StpU1 u = bits.u;
- // Sign + exponent (top 9 bits) select the table entry, range {0 to 511}.
- StpU1 i = u >> 23;
- return (StpU1)(base[i]) + ((u & 0x7fffff) >> shift[i]); }
- //------------------------------------------------------------------------------------------------------------------------------
- STP_STATIC StpU1 StpU1_H2_F2(StpInF2 a) { return StpU1_H1_F1(a[0]) + (StpU1_H1_F1(a[1]) << 16); }
- #endif // defined(STP_CPU)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_GLSL)
- // GLSL spellings of the portable STP types. Suffix digit = component count.
- // P = boolean (predicate).
- #define StpP1 bool
- #define StpP2 bvec2
- //------------------------------------------------------------------------------------------------------------------------------
- // F = 32-bit float.
- #define StpF1 float
- #define StpF2 vec2
- #define StpF3 vec3
- #define StpF4 vec4
- //------------------------------------------------------------------------------------------------------------------------------
- // I = signed integer.
- #define StpI2 ivec2
- //------------------------------------------------------------------------------------------------------------------------------
- // U = unsigned integer.
- #define StpU1 uint
- #define StpU2 uvec2
- #define StpU3 uvec3
- #define StpU4 uvec4
- //------------------------------------------------------------------------------------------------------------------------------
- // Bit-pattern reinterpretation, named 'Stp<result>_<source>': the argument is first converted to the source type.
- #define StpF1_U1(x) uintBitsToFloat(StpU1(x))
- #define StpF2_U2(x) uintBitsToFloat(StpU2(x))
- #define StpF3_U3(x) uintBitsToFloat(StpU3(x))
- #define StpF4_U4(x) uintBitsToFloat(StpU4(x))
- #define StpU1_F1(x) floatBitsToUint(StpF1(x))
- #define StpU2_F2(x) floatBitsToUint(StpF2(x))
- #define StpU3_F3(x) floatBitsToUint(StpF3(x))
- #define StpU4_F4(x) floatBitsToUint(StpF4(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Pack two floats as halfs into one uint, and the reverse, via the GLSL built-ins.
- #define StpU1_H2_F2 packHalf2x16
- #define StpF2_H2_U1 unpackHalf2x16
- //------------------------------------------------------------------------------------------------------------------------------
- // Bit-field extract: forwards to bitfieldExtract() with 'off' as the offset and 'bits' as the field width.
- StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) {
-     int firstBit = int(off);
-     int bitCount = int(bits);
-     return bitfieldExtract(src, firstBit, bitCount); }
- // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
- // Forwards to bitfieldInsert() with 'src' as base, 'ins' as the inserted value, offset 0 and width 'bits'.
- StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) {
-     int bitCount = int(bits);
-     return bitfieldInsert(src, ins, 0, bitCount); }
- #endif // defined(STP_GPU) && defined(STP_GLSL)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
- // GLSL explicit 16-bit types. H = 16-bit float.
- // NOTE(review): these types and the conversion built-ins below are not core GLSL; the shader presumably enables the
- // required extensions before including this file - confirm.
- #define StpH1 float16_t
- #define StpH2 f16vec2
- #define StpH3 f16vec3
- #define StpH4 f16vec4
- //------------------------------------------------------------------------------------------------------------------------------
- // W = 16-bit unsigned integer.
- #define StpW1 uint16_t
- #define StpW2 u16vec2
- #define StpW3 u16vec3
- #define StpW4 u16vec4
- //------------------------------------------------------------------------------------------------------------------------------
- // Unpack one 32-bit uint into two 16-bit values (as integers, or as halfs).
- #define StpW2_U1(x) unpackUint2x16(StpU1(x))
- #define StpH2_U1(x) unpackFloat2x16(StpU1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Reinterpret half as 16-bit unsigned integer.
- #define StpW1_H1(x) halfBitsToUint16(StpH1(x))
- #define StpW2_H2(x) halfBitsToUint16(StpH2(x))
- #define StpW3_H3(x) halfBitsToUint16(StpH3(x))
- #define StpW4_H4(x) halfBitsToUint16(StpH4(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Reinterpret 16-bit unsigned integer as half.
- #define StpH1_W1(x) uint16BitsToHalf(StpW1(x))
- #define StpH2_W2(x) uint16BitsToHalf(StpW2(x))
- #define StpH3_W3(x) uint16BitsToHalf(StpW3(x))
- #define StpH4_W4(x) uint16BitsToHalf(StpW4(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Pack two halfs into one 32-bit uint.
- #define StpU1_H2(x) packFloat2x16(StpH2(x))
- #endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_HLSL)
- // HLSL spellings of the portable STP types. Suffix digit = component count.
- // P = boolean (predicate).
- #define StpP1 bool
- #define StpP2 bool2
- //------------------------------------------------------------------------------------------------------------------------------
- // F = 32-bit float.
- #define StpF1 float
- #define StpF2 float2
- #define StpF3 float3
- #define StpF4 float4
- //------------------------------------------------------------------------------------------------------------------------------
- // I = signed integer.
- #define StpI2 int2
- //------------------------------------------------------------------------------------------------------------------------------
- // U = unsigned integer.
- #define StpU1 uint
- #define StpU2 uint2
- #define StpU3 uint3
- #define StpU4 uint4
- //------------------------------------------------------------------------------------------------------------------------------
- // Bit-pattern reinterpretation, named 'Stp<result>_<source>': the argument is first converted to the source type.
- #define StpF1_U1(x) asfloat(StpU1(x))
- #define StpF2_U2(x) asfloat(StpU2(x))
- #define StpF3_U3(x) asfloat(StpU3(x))
- #define StpF4_U4(x) asfloat(StpU4(x))
- #define StpU1_F1(x) asuint(StpF1(x))
- #define StpU2_F2(x) asuint(StpF2(x))
- #define StpU3_F3(x) asuint(StpF3(x))
- #define StpU4_F4(x) asuint(StpF4(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Pack two floats as halfs into one uint: a.x in the low 16 bits, a.y in the high 16 bits.
- StpU1 StpU1_H2_F2_x(StpF2 a) {
-     StpU1 lowHalf = f32tof16(a.x);
-     StpU1 highHalf = f32tof16(a.y) << 16;
-     return lowHalf | highHalf; }
- #define StpU1_H2_F2(a) StpU1_H2_F2_x(StpF2(a))
- //------------------------------------------------------------------------------------------------------------------------------
- // Unpack two halfs from one uint into floats: low 16 bits -> .x, high 16 bits -> .y.
- StpF2 StpF2_H2_U1_x(StpU1 x) {
-     StpF1 fromLow = f16tof32(x & 0xFFFF);
-     StpF1 fromHigh = f16tof32(x >> 16);
-     return StpF2(fromLow, fromHigh); }
- #define StpF2_H2_U1(x) StpF2_H2_U1_x(StpU1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Bit-field extract: shift 'src' right by 'off' and keep the low 'bits' bits.
- StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) {
-     StpU1 lowMask = (1u << bits) - 1;
-     StpU1 shifted = src >> off;
-     return shifted & lowMask; }
- // Bit-field insert at bit 0: the low 'bits' bits come from 'ins', all higher bits come from 'src'.
- StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) {
-     StpU1 lowMask = (1u << bits) - 1;
-     StpU1 inserted = ins & lowMask;
-     StpU1 kept = src & (~lowMask);
-     return inserted | kept; }
- #endif // defined(STP_GPU) && defined(STP_HLSL)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
- // Medium precision ('M') types for HLSL when STP_MEDIUM is defined: HLSL minimum-precision 16-bit types.
- // NOTE(review): no GLSL + STP_MEDIUM mapping of the 'M' types is visible in this part of the file.
- // MU = medium precision unsigned integer.
- #define StpMU1 min16uint
- #define StpMU2 min16uint2
- #define StpMU3 min16uint3
- #define StpMU4 min16uint4
- //------------------------------------------------------------------------------------------------------------------------------
- // MF = medium precision float.
- #define StpMF1 min16float
- #define StpMF2 min16float2
- #define StpMF3 min16float3
- #define StpMF4 min16float4
- #endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
- //==============================================================================================================================
- #if defined(STP_GPU) && (!defined(STP_MEDIUM))
- // Without STP_MEDIUM the medium precision ('M') types are plain aliases of the 32-bit types.
- // MU = unsigned integer.
- #define StpMU1 StpU1
- #define StpMU2 StpU2
- #define StpMU3 StpU3
- #define StpMU4 StpU4
- //------------------------------------------------------------------------------------------------------------------------------
- // MF = float.
- #define StpMF1 StpF1
- #define StpMF2 StpF2
- #define StpMF3 StpF3
- #define StpMF4 StpF4
- #endif // defined(STP_GPU) && (!defined(STP_MEDIUM))
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
- // HLSL explicit 16-bit types. H = 16-bit float.
- #define StpH1 float16_t
- #define StpH2 float16_t2
- #define StpH3 float16_t3
- #define StpH4 float16_t4
- //------------------------------------------------------------------------------------------------------------------------------
- // W = 16-bit unsigned integer.
- #define StpW1 uint16_t
- #define StpW2 uint16_t2
- #define StpW3 uint16_t3
- #define StpW4 uint16_t4
- //------------------------------------------------------------------------------------------------------------------------------
- // Split one 32-bit uint into two 16-bit unsigned values: low 16 bits -> .x, high 16 bits -> .y.
- StpW2 StpW2_U1_x(StpU1 x) {
-     StpU1 lowWord = x & 0xFFFF;
-     StpU1 highWord = x >> 16;
-     StpU2 words = StpU2(lowWord, highWord);
-     return StpW2(words); }
- #define StpW2_U1(x) StpW2_U1_x(StpU1(x))
- // Reinterpret one 32-bit uint as two halfs: low 16 bits -> .x, high 16 bits -> .y.
- StpH2 StpH2_U1_x(StpU1 x) {
-     StpW1 lowWord = (StpW1)(x & 0xFFFF);
-     StpW1 highWord = (StpW1)(x >> 16);
-     return asfloat16(StpW2(lowWord, highWord)); }
- #define StpH2_U1(x) StpH2_U1_x(StpU1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Reinterpret half as 16-bit unsigned integer (argument is first converted to the half type).
- #define StpW1_H1(x) asuint16(StpH1(x))
- #define StpW2_H2(x) asuint16(StpH2(x))
- #define StpW3_H3(x) asuint16(StpH3(x))
- #define StpW4_H4(x) asuint16(StpH4(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Reinterpret 16-bit unsigned integer as half (argument is first converted to the 16-bit unsigned type).
- #define StpH1_W1(x) asfloat16(StpW1(x))
- #define StpH2_W2(x) asfloat16(StpW2(x))
- #define StpH3_W3(x) asfloat16(StpW3(x))
- #define StpH4_W4(x) asfloat16(StpW4(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Pack the bit patterns of two halfs into one uint: x.x in the low 16 bits, x.y in the high 16 bits.
- StpU1 StpU1_H2_x(StpH2 x) {
-     StpW2 halfBits = asuint16(x);
-     StpU1 lowWord = (StpU1)halfBits.x;
-     StpU1 highWord = ((StpU1)halfBits.y) << 16;
-     return (lowWord | highWord); }
- #define StpU1_H2(x) StpU1_H2_x(StpH2(x))
- #endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
- // GPU version of StpMaxF1(): forwards to the built-in max().
- StpF1 StpMaxF1(StpF1 a, StpF1 b) {
-     StpF1 larger = max(a, b);
-     return larger; }
- //------------------------------------------------------------------------------------------------------------------------------
- // Broadcast one boolean to both components of a StpP2.
- StpP2 StpP2_x(StpP1 x) {
-     StpP2 both = StpP2(x, x);
-     return both; }
- #define StpP2_(x) StpP2_x(StpP1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Broadcast one float to every component of a 1 to 4 component float vector.
- // The 'StpF<n>_()' macros convert their argument to StpF1 first.
- StpF1 StpF1_x(StpF1 x) {
-     StpF1 v = StpF1(x);
-     return v; }
- StpF2 StpF2_x(StpF1 x) {
-     StpF2 v = StpF2(x, x);
-     return v; }
- StpF3 StpF3_x(StpF1 x) {
-     StpF3 v = StpF3(x, x, x);
-     return v; }
- StpF4 StpF4_x(StpF1 x) {
-     StpF4 v = StpF4(x, x, x, x);
-     return v; }
- #define StpF1_(x) StpF1_x(StpF1(x))
- #define StpF2_(x) StpF2_x(StpF1(x))
- #define StpF3_(x) StpF3_x(StpF1(x))
- #define StpF4_(x) StpF4_x(StpF1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Broadcast one medium precision float to every component of a 1 to 4 component medium precision float vector.
- // The 'StpMF<n>_()' macros convert their argument to StpMF1 first.
- StpMF1 StpMF1_x(StpMF1 x) {
-     StpMF1 v = StpMF1(x);
-     return v; }
- StpMF2 StpMF2_x(StpMF1 x) {
-     StpMF2 v = StpMF2(x, x);
-     return v; }
- StpMF3 StpMF3_x(StpMF1 x) {
-     StpMF3 v = StpMF3(x, x, x);
-     return v; }
- StpMF4 StpMF4_x(StpMF1 x) {
-     StpMF4 v = StpMF4(x, x, x, x);
-     return v; }
- #define StpMF1_(x) StpMF1_x(StpMF1(x))
- #define StpMF2_(x) StpMF2_x(StpMF1(x))
- #define StpMF3_(x) StpMF3_x(StpMF1(x))
- #define StpMF4_(x) StpMF4_x(StpMF1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Broadcast one medium precision unsigned integer to every component of a 1 to 4 component vector of that type.
- // The 'StpMU<n>_()' macros convert their argument to StpMU1 first.
- StpMU1 StpMU1_x(StpMU1 x) {
-     StpMU1 v = StpMU1(x);
-     return v; }
- StpMU2 StpMU2_x(StpMU1 x) {
-     StpMU2 v = StpMU2(x, x);
-     return v; }
- StpMU3 StpMU3_x(StpMU1 x) {
-     StpMU3 v = StpMU3(x, x, x);
-     return v; }
- StpMU4 StpMU4_x(StpMU1 x) {
-     StpMU4 v = StpMU4(x, x, x, x);
-     return v; }
- #define StpMU1_(x) StpMU1_x(StpMU1(x))
- #define StpMU2_(x) StpMU2_x(StpMU1(x))
- #define StpMU3_(x) StpMU3_x(StpMU1(x))
- #define StpMU4_(x) StpMU4_x(StpMU1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // Broadcast one unsigned integer to every component of a 1 to 4 component unsigned integer vector.
- // The 'StpU<n>_()' macros convert their argument to StpU1 first.
- StpU1 StpU1_x(StpU1 x) {
-     StpU1 v = StpU1(x);
-     return v; }
- StpU2 StpU2_x(StpU1 x) {
-     StpU2 v = StpU2(x, x);
-     return v; }
- StpU3 StpU3_x(StpU1 x) {
-     StpU3 v = StpU3(x, x, x);
-     return v; }
- StpU4 StpU4_x(StpU1 x) {
-     StpU4 v = StpU4(x, x, x, x);
-     return v; }
- #define StpU1_(x) StpU1_x(StpU1(x))
- #define StpU2_(x) StpU2_x(StpU1(x))
- #define StpU3_(x) StpU3_x(StpU1(x))
- #define StpU4_(x) StpU4_x(StpU1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- // StpCpySgnF<n>(d, s): combine the bits of 'd' with the sign bit of 's', per component.
- #if 0
- // Slow implementation (if not pattern matched by a compiler).
- // ORs the sign bit of 's' into 'd'; a sign bit already set in 'd' is kept, unlike the enabled version below.
- StpF1 StpCpySgnF1(StpF1 d, StpF1 s) {
-     StpU1 signOfS = StpU1_F1(s) & StpU1_(0x80000000u);
-     return StpF1_U1(StpU1_F1(d) | signOfS); }
- StpF2 StpCpySgnF2(StpF2 d, StpF2 s) {
-     StpU2 signOfS = StpU2_F2(s) & StpU2_(0x80000000u);
-     return StpF2_U2(StpU2_F2(d) | signOfS); }
- StpF3 StpCpySgnF3(StpF3 d, StpF3 s) {
-     StpU3 signOfS = StpU3_F3(s) & StpU3_(0x80000000u);
-     return StpF3_U3(StpU3_F3(d) | signOfS); }
- StpF4 StpCpySgnF4(StpF4 d, StpF4 s) {
-     StpU4 signOfS = StpU4_F4(s) & StpU4_(0x80000000u);
-     return StpF4_U4(StpU4_F4(d) | signOfS); }
- #else
- // Faster implementation (one portable BFI).
- // Low 31 bits are taken from 'd', the top (sign) bit is taken from 's'.
- StpF1 StpCpySgnF1(StpF1 d, StpF1 s) {
-     StpU1 signSource = StpU1_F1(s);
-     StpU1 magnitude = StpU1_F1(d);
-     return StpF1_U1(StpBfiMskU1(signSource, magnitude, StpU1_(31))); }
- StpF2 StpCpySgnF2(StpF2 d, StpF2 s) {
-     StpF1 rx = StpCpySgnF1(d.x, s.x);
-     StpF1 ry = StpCpySgnF1(d.y, s.y);
-     return StpF2(rx, ry); }
- StpF3 StpCpySgnF3(StpF3 d, StpF3 s) {
-     StpF1 rx = StpCpySgnF1(d.x, s.x);
-     StpF1 ry = StpCpySgnF1(d.y, s.y);
-     StpF1 rz = StpCpySgnF1(d.z, s.z);
-     return StpF3(rx, ry, rz); }
- StpF4 StpCpySgnF4(StpF4 d, StpF4 s) {
-     StpF1 rx = StpCpySgnF1(d.x, s.x);
-     StpF1 ry = StpCpySgnF1(d.y, s.y);
-     StpF1 rz = StpCpySgnF1(d.z, s.z);
-     StpF1 rw = StpCpySgnF1(d.w, s.w);
-     return StpF4(rx, ry, rz, rw); }
- #endif
- // Three input max/min built from two nested two input max()/min() calls: 'y' with 'z' first, then 'x' with that result.
- // Float versions.
- StpF1 StpMax3F1(StpF1 x, StpF1 y, StpF1 z) { StpF1 yz = max(y, z); return max(x, yz); }
- StpF2 StpMax3F2(StpF2 x, StpF2 y, StpF2 z) { StpF2 yz = max(y, z); return max(x, yz); }
- StpF3 StpMax3F3(StpF3 x, StpF3 y, StpF3 z) { StpF3 yz = max(y, z); return max(x, yz); }
- StpF4 StpMax3F4(StpF4 x, StpF4 y, StpF4 z) { StpF4 yz = max(y, z); return max(x, yz); }
- StpF1 StpMin3F1(StpF1 x, StpF1 y, StpF1 z) { StpF1 yz = min(y, z); return min(x, yz); }
- StpF2 StpMin3F2(StpF2 x, StpF2 y, StpF2 z) { StpF2 yz = min(y, z); return min(x, yz); }
- StpF3 StpMin3F3(StpF3 x, StpF3 y, StpF3 z) { StpF3 yz = min(y, z); return min(x, yz); }
- StpF4 StpMin3F4(StpF4 x, StpF4 y, StpF4 z) { StpF4 yz = min(y, z); return min(x, yz); }
- // Unsigned integer versions.
- StpU1 StpMax3U1(StpU1 x, StpU1 y, StpU1 z) { StpU1 yz = max(y, z); return max(x, yz); }
- StpU1 StpMin3U1(StpU1 x, StpU1 y, StpU1 z) { StpU1 yz = min(y, z); return min(x, yz); }
- StpU4 StpMin3U4(StpU4 x, StpU4 y, StpU4 z) { StpU4 yz = min(y, z); return min(x, yz); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpMax3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { return max(x, max(y, z)); } // Max3 for the StpMF* types.
- StpMF2 StpMax3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { return max(x, max(y, z)); }
- StpMF3 StpMax3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { return max(x, max(y, z)); }
- StpMF4 StpMax3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { return max(x, max(y, z)); }
- StpMF1 StpMin3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { return min(x, min(y, z)); } // Min3 for the StpMF* types.
- StpMF2 StpMin3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { return min(x, min(y, z)); }
- StpMF3 StpMin3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { return min(x, min(y, z)); }
- StpMF4 StpMin3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { return min(x, min(y, z)); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Make {<+0 := -1.0, >=+0 := 1.0}: the sign bit (bit 31) of 'x' combined with the low 31 bits of 1.0 (0x3f800000).
- StpF1 StpSgnOneF1(StpF1 x) { return StpF1_U1(StpBfiMskU1(StpU1_F1(x), StpU1_(0x3f800000), StpU1_(31))); }
- #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
- //==============================================================================================================================
- #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
- StpH1 StpH1_x(StpH1 x) { return StpH1(x); } // StpHn_x: replicate one StpH1 scalar into all n components.
- StpH2 StpH2_x(StpH1 x) { return StpH2(x, x); }
- StpH3 StpH3_x(StpH1 x) { return StpH3(x, x, x); }
- StpH4 StpH4_x(StpH1 x) { return StpH4(x, x, x, x); }
- #define StpH1_(x) StpH1_x(StpH1(x)) // StpHn_(x): convert 'x' to StpH1 first, then replicate via StpHn_x.
- #define StpH2_(x) StpH2_x(StpH1(x))
- #define StpH3_(x) StpH3_x(StpH1(x))
- #define StpH4_(x) StpH4_x(StpH1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- StpW1 StpW1_x(StpW1 x) { return StpW1(x); } // StpWn_x: replicate one StpW1 scalar into all n components.
- StpW2 StpW2_x(StpW1 x) { return StpW2(x, x); }
- StpW3 StpW3_x(StpW1 x) { return StpW3(x, x, x); }
- StpW4 StpW4_x(StpW1 x) { return StpW4(x, x, x, x); }
- #define StpW1_(x) StpW1_x(StpW1(x)) // StpWn_(x): convert 'x' to StpW1 first, then replicate via StpWn_x.
- #define StpW2_(x) StpW2_x(StpW1(x))
- #define StpW3_(x) StpW3_x(StpW1(x))
- #define StpW4_(x) StpW4_x(StpW1(x))
- //------------------------------------------------------------------------------------------------------------------------------
- StpH1 StpMax3H1(StpH1 x, StpH1 y, StpH1 z) { return max(x, max(y, z)); } // Max3 for the StpH* types, per component.
- StpH2 StpMax3H2(StpH2 x, StpH2 y, StpH2 z) { return max(x, max(y, z)); }
- StpH3 StpMax3H3(StpH3 x, StpH3 y, StpH3 z) { return max(x, max(y, z)); }
- StpH4 StpMax3H4(StpH4 x, StpH4 y, StpH4 z) { return max(x, max(y, z)); }
- StpH1 StpMin3H1(StpH1 x, StpH1 y, StpH1 z) { return min(x, min(y, z)); } // Min3 for the StpH* types, per component.
- StpH2 StpMin3H2(StpH2 x, StpH2 y, StpH2 z) { return min(x, min(y, z)); }
- StpH3 StpMin3H3(StpH3 x, StpH3 y, StpH3 z) { return min(x, min(y, z)); }
- StpH4 StpMin3H4(StpH4 x, StpH4 y, StpH4 z) { return min(x, min(y, z)); }
- StpW1 StpMax3W1(StpW1 x, StpW1 y, StpW1 z) { return max(x, max(y, z)); } // StpW* forms: scalar only.
- StpW1 StpMin3W1(StpW1 x, StpW1 y, StpW1 z) { return min(x, min(y, z)); }
- #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_GLSL)
- StpF1 StpFractF1(StpF1 x) { return fract(x); } // Fract: fractional part via the GLSL fract() built-in.
- StpF2 StpFractF2(StpF2 x) { return fract(x); }
- StpF3 StpFractF3(StpF3 x) { return fract(x); }
- StpF4 StpFractF4(StpF4 x) { return fract(x); }
- StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { return mix(x, y, z); } // Lerp: linear blend from 'x' to 'y' by 'z' via mix().
- StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { return mix(x, y, z); }
- StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { return mix(x, y, z); }
- StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { return mix(x, y, z); }
- StpF1 StpRcpF1(StpF1 x) { return StpF1_(1.0) / x; } // Rcp: reciprocal written as an explicit 1.0 / x divide.
- StpF2 StpRcpF2(StpF2 x) { return StpF2_(1.0) / x; }
- StpF3 StpRcpF3(StpF3 x) { return StpF3_(1.0) / x; }
- StpF4 StpRcpF4(StpF4 x) { return StpF4_(1.0) / x; }
- StpF1 StpRsqF1(StpF1 x) { return inversesqrt(x); } // Rsq: reciprocal square root via inversesqrt().
- StpF2 StpRsqF2(StpF2 x) { return inversesqrt(x); }
- StpF3 StpRsqF3(StpF3 x) { return inversesqrt(x); }
- StpF4 StpRsqF4(StpF4 x) { return inversesqrt(x); }
- StpF1 StpSatF1(StpF1 x) { return clamp(x, StpF1_(0.0), StpF1_(1.0)); } // Sat: clamp to the range [0, 1].
- StpF2 StpSatF2(StpF2 x) { return clamp(x, StpF2_(0.0), StpF2_(1.0)); }
- StpF3 StpSatF3(StpF3 x) { return clamp(x, StpF3_(0.0), StpF3_(1.0)); }
- StpF4 StpSatF4(StpF4 x) { return clamp(x, StpF4_(0.0), StpF4_(1.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpFractMF1(StpMF1 x) { return fract(x); } // StpMF* versions of the GLSL wrappers: Fract via fract().
- StpMF2 StpFractMF2(StpMF2 x) { return fract(x); }
- StpMF3 StpFractMF3(StpMF3 x) { return fract(x); }
- StpMF4 StpFractMF4(StpMF4 x) { return fract(x); }
- StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { return mix(x, y, z); } // Lerp: linear blend from 'x' to 'y' by 'z'.
- StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { return mix(x, y, z); }
- StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { return mix(x, y, z); }
- StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { return mix(x, y, z); }
- StpMF1 StpRcpMF1(StpMF1 x) { return StpMF1_(1.0) / x; } // Rcp: reciprocal written as an explicit 1.0 / x divide.
- StpMF2 StpRcpMF2(StpMF2 x) { return StpMF2_(1.0) / x; }
- StpMF3 StpRcpMF3(StpMF3 x) { return StpMF3_(1.0) / x; }
- StpMF4 StpRcpMF4(StpMF4 x) { return StpMF4_(1.0) / x; }
- StpMF1 StpRsqMF1(StpMF1 x) { return inversesqrt(x); } // Rsq: reciprocal square root via inversesqrt().
- StpMF2 StpRsqMF2(StpMF2 x) { return inversesqrt(x); }
- StpMF3 StpRsqMF3(StpMF3 x) { return inversesqrt(x); }
- StpMF4 StpRsqMF4(StpMF4 x) { return inversesqrt(x); }
- StpMF1 StpSatMF1(StpMF1 x) { return clamp(x, StpMF1_(0.0), StpMF1_(1.0)); } // Sat: clamp to the range [0, 1].
- StpMF2 StpSatMF2(StpMF2 x) { return clamp(x, StpMF2_(0.0), StpMF2_(1.0)); }
- StpMF3 StpSatMF3(StpMF3 x) { return clamp(x, StpMF3_(0.0), StpMF3_(1.0)); }
- StpMF4 StpSatMF4(StpMF4 x) { return clamp(x, StpMF4_(0.0), StpMF4_(1.0)); }
- #endif // defined(STP_GPU) && defined(STP_GLSL)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
- StpH1 StpFractH1(StpH1 x) { return fract(x); } // StpH* versions of the GLSL wrappers: Fract via fract().
- StpH2 StpFractH2(StpH2 x) { return fract(x); }
- StpH3 StpFractH3(StpH3 x) { return fract(x); }
- StpH4 StpFractH4(StpH4 x) { return fract(x); }
- StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { return mix(x, y, z); } // Lerp: linear blend from 'x' to 'y' by 'z'.
- StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { return mix(x, y, z); }
- StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { return mix(x, y, z); }
- StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { return mix(x, y, z); }
- StpH1 StpRcpH1(StpH1 x) { return StpH1_(1.0) / x; } // Rcp: reciprocal written as an explicit 1.0 / x divide.
- StpH2 StpRcpH2(StpH2 x) { return StpH2_(1.0) / x; }
- StpH3 StpRcpH3(StpH3 x) { return StpH3_(1.0) / x; }
- StpH4 StpRcpH4(StpH4 x) { return StpH4_(1.0) / x; }
- StpH1 StpRsqH1(StpH1 x) { return inversesqrt(x); } // Rsq: reciprocal square root via inversesqrt().
- StpH2 StpRsqH2(StpH2 x) { return inversesqrt(x); }
- StpH3 StpRsqH3(StpH3 x) { return inversesqrt(x); }
- StpH4 StpRsqH4(StpH4 x) { return inversesqrt(x); }
- StpH1 StpSatH1(StpH1 x) { return clamp(x, StpH1_(0.0), StpH1_(1.0)); } // Sat: clamp to the range [0, 1].
- StpH2 StpSatH2(StpH2 x) { return clamp(x, StpH2_(0.0), StpH2_(1.0)); }
- StpH3 StpSatH3(StpH3 x) { return clamp(x, StpH3_(0.0), StpH3_(1.0)); }
- StpH4 StpSatH4(StpH4 x) { return clamp(x, StpH4_(0.0), StpH4_(1.0)); }
- #endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_HLSL)
- StpF1 StpFractF1(StpF1 x) { return x - floor(x); } // Fract: fractional part computed as x - floor(x).
- StpF2 StpFractF2(StpF2 x) { return x - floor(x); }
- StpF3 StpFractF3(StpF3 x) { return x - floor(x); }
- StpF4 StpFractF4(StpF4 x) { return x - floor(x); }
- StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { return lerp(x, y, z); } // Lerp: linear blend from 'x' to 'y' by 'z' via lerp().
- StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { return lerp(x, y, z); }
- StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { return lerp(x, y, z); }
- StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { return lerp(x, y, z); }
- StpF1 StpRcpF1(StpF1 x) { return rcp(x); } // Rcp: reciprocal via the HLSL rcp() intrinsic.
- StpF2 StpRcpF2(StpF2 x) { return rcp(x); }
- StpF3 StpRcpF3(StpF3 x) { return rcp(x); }
- StpF4 StpRcpF4(StpF4 x) { return rcp(x); }
- StpF1 StpRsqF1(StpF1 x) { return rsqrt(x); } // Rsq: reciprocal square root via rsqrt().
- StpF2 StpRsqF2(StpF2 x) { return rsqrt(x); }
- StpF3 StpRsqF3(StpF3 x) { return rsqrt(x); }
- StpF4 StpRsqF4(StpF4 x) { return rsqrt(x); }
- StpF1 StpSatF1(StpF1 x) { return saturate(x); } // Sat: clamp to the range [0, 1] via saturate().
- StpF2 StpSatF2(StpF2 x) { return saturate(x); }
- StpF3 StpSatF3(StpF3 x) { return saturate(x); }
- StpF4 StpSatF4(StpF4 x) { return saturate(x); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpFractMF1(StpMF1 x) { return x - floor(x); } // StpMF* versions of the HLSL wrappers: Fract as x - floor(x).
- StpMF2 StpFractMF2(StpMF2 x) { return x - floor(x); }
- StpMF3 StpFractMF3(StpMF3 x) { return x - floor(x); }
- StpMF4 StpFractMF4(StpMF4 x) { return x - floor(x); }
- StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { return lerp(x, y, z); } // Lerp: linear blend from 'x' to 'y' by 'z'.
- StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { return lerp(x, y, z); }
- StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { return lerp(x, y, z); }
- StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { return lerp(x, y, z); }
- StpMF1 StpRcpMF1(StpMF1 x) { return rcp(x); } // Rcp: reciprocal via the HLSL rcp() intrinsic.
- StpMF2 StpRcpMF2(StpMF2 x) { return rcp(x); }
- StpMF3 StpRcpMF3(StpMF3 x) { return rcp(x); }
- StpMF4 StpRcpMF4(StpMF4 x) { return rcp(x); }
- StpMF1 StpRsqMF1(StpMF1 x) { return rsqrt(x); } // Rsq: reciprocal square root via rsqrt().
- StpMF2 StpRsqMF2(StpMF2 x) { return rsqrt(x); }
- StpMF3 StpRsqMF3(StpMF3 x) { return rsqrt(x); }
- StpMF4 StpRsqMF4(StpMF4 x) { return rsqrt(x); }
- StpMF1 StpSatMF1(StpMF1 x) { return saturate(x); } // Sat: clamp to the range [0, 1] via saturate().
- StpMF2 StpSatMF2(StpMF2 x) { return saturate(x); }
- StpMF3 StpSatMF3(StpMF3 x) { return saturate(x); }
- StpMF4 StpSatMF4(StpMF4 x) { return saturate(x); }
- #endif // defined(STP_GPU) && defined(STP_HLSL)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
- StpH1 StpFractH1(StpH1 x) { return x - floor(x); } // StpH* versions of the HLSL wrappers: Fract as x - floor(x).
- StpH2 StpFractH2(StpH2 x) { return x - floor(x); }
- StpH3 StpFractH3(StpH3 x) { return x - floor(x); }
- StpH4 StpFractH4(StpH4 x) { return x - floor(x); }
- StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { return lerp(x, y, z); } // Lerp: linear blend from 'x' to 'y' by 'z'.
- StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { return lerp(x, y, z); }
- StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { return lerp(x, y, z); }
- StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { return lerp(x, y, z); }
- StpH1 StpRcpH1(StpH1 x) { return rcp(x); } // Rcp: reciprocal via the HLSL rcp() intrinsic.
- StpH2 StpRcpH2(StpH2 x) { return rcp(x); }
- StpH3 StpRcpH3(StpH3 x) { return rcp(x); }
- StpH4 StpRcpH4(StpH4 x) { return rcp(x); }
- StpH1 StpRsqH1(StpH1 x) { return rsqrt(x); } // Rsq: reciprocal square root via rsqrt().
- StpH2 StpRsqH2(StpH2 x) { return rsqrt(x); }
- StpH3 StpRsqH3(StpH3 x) { return rsqrt(x); }
- StpH4 StpRsqH4(StpH4 x) { return rsqrt(x); }
- StpH1 StpSatH1(StpH1 x) { return saturate(x); } // Sat: clamp to the range [0, 1] via saturate().
- StpH2 StpSatH2(StpH2 x) { return saturate(x); }
- StpH3 StpSatH3(StpH3 x) { return saturate(x); }
- StpH4 StpSatH4(StpH4 x) { return saturate(x); }
- #endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
- StpF1 StpExp2F1(StpF1 x) { return exp2(x); } // Thin scalar wrappers over the exp2() / log2() built-ins.
- StpF1 StpLog2F1(StpF1 x) { return log2(x); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpExp2MF1(StpMF1 x) { return exp2(x); } // Same wrappers for StpMF1.
- StpMF1 StpLog2MF1(StpMF1 x) { return log2(x); }
- //------------------------------------------------------------------------------------------------------------------------------
- // -INF and +INF built from their 32-bit float bit patterns.
- #define STP_INFN_F StpF1_U1(0xff800000u)
- #define STP_INFP_F StpF1_U1(0x7f800000u)
- // StpGtZero*: 1.0 where x > 0, otherwise 0.0. StpSigned*: 1.0 where x < 0, otherwise 0.0. Both work per component.
- // StpGtZeroF2 completes the 1-4 component set; the StpGtZeroMF2 alias used without STP_MEDIUM forwards to it.
- #if STP_BUG_SAT_INF
- // Defined if unable to use the fast path because of problem related to saturating +/- INF.
- // Uses explicit compares and selects instead.
- StpF1 StpGtZeroF1(StpF1 x) { return (x > StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
- StpF2 StpGtZeroF2(StpF2 x) { return StpF2(StpGtZeroF1(x.r), StpGtZeroF1(x.g)); }
- StpF3 StpGtZeroF3(StpF3 x) { return StpF3(StpGtZeroF1(x.r), StpGtZeroF1(x.g), StpGtZeroF1(x.b)); }
- StpF4 StpGtZeroF4(StpF4 x) { return StpF4(StpGtZeroF1(x.r), StpGtZeroF1(x.g),
- StpGtZeroF1(x.b), StpGtZeroF1(x.a)); }
- StpF1 StpSignedF1(StpF1 x) { return (x < StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
- StpF2 StpSignedF2(StpF2 x) { return StpF2(StpSignedF1(x.r), StpSignedF1(x.g)); }
- StpF3 StpSignedF3(StpF3 x) { return StpF3(StpSignedF1(x.r), StpSignedF1(x.g), StpSignedF1(x.b)); }
- StpF4 StpSignedF4(StpF4 x) { return StpF4(StpSignedF1(x.r), StpSignedF1(x.g),
- StpSignedF1(x.b), StpSignedF1(x.a)); }
- #else
- // Fast path: multiply by +INF (GtZero) or -INF (Signed), then saturate, so the product is +/-INF before the clamp.
- // NOTE(review): x == 0 gives 0 * INF (NaN); this path presumably relies on the saturate returning 0.0 for NaN - confirm.
- StpF1 StpGtZeroF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFP_F)); }
- StpF2 StpGtZeroF2(StpF2 x) { return StpSatF2(x * StpF2_(STP_INFP_F)); }
- StpF3 StpGtZeroF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFP_F)); }
- StpF4 StpGtZeroF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFP_F)); }
- StpF1 StpSignedF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFN_F)); }
- StpF2 StpSignedF2(StpF2 x) { return StpSatF2(x * StpF2_(STP_INFN_F)); }
- StpF3 StpSignedF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFN_F)); }
- StpF4 StpSignedF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFN_F)); }
- #endif // STP_BUG_SAT_INF
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_PRX // Non-zero STP_BUG_PRX: skip the bit-trick approximation and use the exact sqrt().
- StpF1 StpPrxLoSqrtF1(StpF1 a) { return sqrt(a); }
- StpF3 StpPrxLoSqrtF3(StpF3 a) { return sqrt(a); }
- StpF4 StpPrxLoSqrtF4(StpF4 a) { return sqrt(a); }
- #else // Low precision sqrt: shift the raw float bits right by one, then add the constant 0x1fbc4639 (integer math only).
- StpF1 StpPrxLoSqrtF1(StpF1 a) { return StpF1_U1((StpU1_F1(a) >> StpU1_(1)) + StpU1_(0x1fbc4639)); }
- StpF3 StpPrxLoSqrtF3(StpF3 a) { return StpF3_U3((StpU3_F3(a) >> StpU3_(1)) + StpU3_(0x1fbc4639)); }
- StpF4 StpPrxLoSqrtF4(StpF4 a) { return StpF4_U4((StpU4_F4(a) >> StpU4_(1)) + StpU4_(0x1fbc4639)); }
- #endif // STP_BUG_PRX
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_PRX // Non-zero STP_BUG_PRX: every 'approximate' reciprocal forwards to the exact StpRcpF*.
- StpF1 StpPrxLoRcpF1(StpF1 a) { return StpRcpF1(a); }
- StpF2 StpPrxLoRcpF2(StpF2 a) { return StpRcpF2(a); }
- StpF3 StpPrxLoRcpF3(StpF3 a) { return StpRcpF3(a); }
- StpF4 StpPrxLoRcpF4(StpF4 a) { return StpRcpF4(a); }
- StpF1 StpPrxMedRcpF1(StpF1 a) { return StpRcpF1(a); }
- StpF3 StpPrxMedRcpF3(StpF3 a) { return StpRcpF3(a); }
- #else // Lo: one integer subtract on the float bits (constant 0x7ef07ebb minus the input bits), no refinement.
- StpF1 StpPrxLoRcpF1(StpF1 a) { return StpF1_U1(StpU1_(0x7ef07ebb) - StpU1_F1(a)); }
- StpF2 StpPrxLoRcpF2(StpF2 a) { return StpF2_U2(StpU2_(0x7ef07ebb) - StpU2_F2(a)); }
- StpF3 StpPrxLoRcpF3(StpF3 a) { return StpF3_U3(StpU3_(0x7ef07ebb) - StpU3_F3(a)); }
- StpF4 StpPrxLoRcpF4(StpF4 a) { return StpF4_U4(StpU4_(0x7ef07ebb) - StpU4_F4(a)); }
- StpF1 StpPrxMedRcpF1(StpF1 a) { StpF1 b = StpF1_U1(StpU1_(0x7ef19fff) - StpU1_F1(a)); // Med: bit-trick estimate 'b' ...
- return b * (-b * a + StpF1_(2.0)); } // ... refined by one Newton-Raphson step, b * (2 - a * b).
- StpF3 StpPrxMedRcpF3(StpF3 a) { StpF3 b = StpF3_U3(StpU3_(0x7ef19fff) - StpU3_F3(a));
- return b * (-b * a + StpF3_(2.0)); }
- #endif // STP_BUG_PRX
- //------------------------------------------------------------------------------------------------------------------------------
- #define STP_STATIC /* expands to nothing in the GLSL/HLSL path */
- #define StpInF2 in StpF2 // Parameter declaration helpers: a qualifier ('in', 'out', 'inout') followed by the type.
- #define StpInF4 in StpF4
- #define StpInOutU4 inout StpU4
- #define StpOutF2 out StpF2
- #define StpVarF2 StpF2 // Type only, no qualifier.
- #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
- //==============================================================================================================================
- #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
- // StpGtZeroMF*: 1.0 where x > 0, otherwise 0.0. StpSignedMF*: 1.0 where x < 0, otherwise 0.0. Both work per component.
- // StpGtZeroMF2 is provided so this path offers the same StpGtZeroMF1-4 names that are aliased when STP_MEDIUM is not defined.
- #if STP_BUG_SAT_INF
- // Defined if unable to use the fast path because of problem related to saturating +/- INF.
- // Uses explicit compares and selects instead.
- StpMF1 StpGtZeroMF1(StpMF1 x) { return (x > StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
- StpMF2 StpGtZeroMF2(StpMF2 x) { return StpMF2(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g)); }
- StpMF3 StpGtZeroMF3(StpMF3 x) { return StpMF3(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g), StpGtZeroMF1(x.b)); }
- StpMF4 StpGtZeroMF4(StpMF4 x) { return StpMF4(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g),
- StpGtZeroMF1(x.b), StpGtZeroMF1(x.a)); }
- StpMF1 StpSignedMF1(StpMF1 x) { return (x < StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
- StpMF2 StpSignedMF2(StpMF2 x) { return StpMF2(StpSignedMF1(x.r), StpSignedMF1(x.g)); }
- StpMF3 StpSignedMF3(StpMF3 x) { return StpMF3(StpSignedMF1(x.r), StpSignedMF1(x.g), StpSignedMF1(x.b)); }
- StpMF4 StpSignedMF4(StpMF4 x) { return StpMF4(StpSignedMF1(x.r), StpSignedMF1(x.g),
- StpSignedMF1(x.b), StpSignedMF1(x.a)); }
- #elif STP_BUG_SAT
- // Defined if compiler factors out saturation incorrectly.
- // The clamp to [0, 1] is spelled as max(min(v, 1.0), 0.0) instead of going through StpSatMF*.
- #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
- #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
- StpMF1 StpGtZeroMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFP_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
- StpMF2 StpGtZeroMF2(StpMF2 x) { return max(min(x * StpMF2_(STP_INFP_MF), StpMF2_(1.0)), StpMF2_(0.0)); }
- StpMF3 StpGtZeroMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFP_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
- StpMF4 StpGtZeroMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFP_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
- StpMF1 StpSignedMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFN_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
- StpMF2 StpSignedMF2(StpMF2 x) { return max(min(x * StpMF2_(STP_INFN_MF), StpMF2_(1.0)), StpMF2_(0.0)); }
- StpMF3 StpSignedMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFN_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
- StpMF4 StpSignedMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFN_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
- #else
- // Using +/- INF typecast down to medium precision.
- // Fast path: multiply by +INF (GtZero) or -INF (Signed), then saturate through StpSatMF*.
- #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
- #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
- StpMF1 StpGtZeroMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFP_MF)); }
- StpMF2 StpGtZeroMF2(StpMF2 x) { return StpSatMF2(x * StpMF2_(STP_INFP_MF)); }
- StpMF3 StpGtZeroMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFP_MF)); }
- StpMF4 StpGtZeroMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFP_MF)); }
- StpMF1 StpSignedMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFN_MF)); }
- StpMF2 StpSignedMF2(StpMF2 x) { return StpSatMF2(x * StpMF2_(STP_INFN_MF)); }
- StpMF3 StpSignedMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFN_MF)); }
- StpMF4 StpSignedMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFN_MF)); }
- #endif // STP_BUG_SAT_INF
- //------------------------------------------------------------------------------------------------------------------------------
- // Unable to use the bit-trick approximations here because the actual type behind StpMF* is not known, so use exact sqrt().
- StpMF1 StpPrxLoSqrtMF1(StpMF1 a) { return sqrt(a); }
- StpMF3 StpPrxLoSqrtMF3(StpMF3 a) { return sqrt(a); }
- StpMF4 StpPrxLoSqrtMF4(StpMF4 a) { return sqrt(a); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpPrxLoRcpMF1(StpMF1 a) { return StpRcpMF1(a); } // Lo and Med reciprocals likewise forward to the exact StpRcpMF*.
- StpMF2 StpPrxLoRcpMF2(StpMF2 a) { return StpRcpMF2(a); }
- StpMF3 StpPrxLoRcpMF3(StpMF3 a) { return StpRcpMF3(a); }
- StpMF4 StpPrxLoRcpMF4(StpMF4 a) { return StpRcpMF4(a); }
- StpMF1 StpPrxMedRcpMF1(StpMF1 a) { return StpRcpMF1(a); }
- StpMF3 StpPrxMedRcpMF3(StpMF3 a) { return StpRcpMF3(a); }
- #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
- //==============================================================================================================================
- #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
- // Without STP_MEDIUM the StpMF* names forward to the full precision StpF* versions (same types).
- #define StpGtZeroMF1(a) StpGtZeroF1(a)
- #define StpGtZeroMF2(a) StpGtZeroF2(a)
- #define StpGtZeroMF3(a) StpGtZeroF3(a)
- #define StpGtZeroMF4(a) StpGtZeroF4(a)
- #define StpSignedMF1(a) StpSignedF1(a)
- #define StpSignedMF2(a) StpSignedF2(a)
- #define StpSignedMF3(a) StpSignedF3(a)
- #define StpSignedMF4(a) StpSignedF4(a)
- //------------------------------------------------------------------------------------------------------------------------------
- // The medium precision types are the same as the full precision ones, so the full precision approximations can be used.
- #define StpPrxLoSqrtMF1(a) StpPrxLoSqrtF1(a)
- #define StpPrxLoSqrtMF3(a) StpPrxLoSqrtF3(a)
- #define StpPrxLoSqrtMF4(a) StpPrxLoSqrtF4(a)
- //------------------------------------------------------------------------------------------------------------------------------
- #define StpPrxLoRcpMF1(a) StpPrxLoRcpF1(a)
- #define StpPrxLoRcpMF2(a) StpPrxLoRcpF2(a)
- #define StpPrxLoRcpMF3(a) StpPrxLoRcpF3(a)
- #define StpPrxLoRcpMF4(a) StpPrxLoRcpF4(a)
- #define StpPrxMedRcpMF1(a) StpPrxMedRcpF1(a)
- #define StpPrxMedRcpMF3(a) StpPrxMedRcpF3(a)
- #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
- //==============================================================================================================================
- #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
- StpH1 StpExp2H1(StpH1 x) { return exp2(x); } // Thin StpH1 wrappers over the exp2() / log2() built-ins.
- StpH1 StpLog2H1(StpH1 x) { return log2(x); }
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_ALIAS16
- // Use 32-bit aliasing to build the +/-INF, then typecast to 16-bit.
- #define STP_INFN_H StpH1(StpF1_U1(0xff800000u))
- #define STP_INFP_H StpH1(StpF1_U1(0x7f800000u))
- #else
- #define STP_INFN_H StpH1_W1(StpW1_(0xfc00)) // Built directly from 16-bit patterns: 0xfc00 for -INF, 0x7c00 for +INF.
- #define STP_INFP_H StpH1_W1(StpW1_(0x7c00))
- #endif // STP_BUG_ALIAS16
- #if STP_BUG_SAT_INF // Explicit compares: GtZero is 1.0 where x > 0, Signed is 1.0 where x < 0, otherwise 0.0.
- StpH1 StpGtZeroH1(StpH1 x) { return (x > StpH1_(0.0)) ? StpH1_(1.0) : StpH1_(0.0); }
- StpH2 StpGtZeroH2(StpH2 x) { return StpH2(StpGtZeroH1(x.r), StpGtZeroH1(x.g)); }
- StpH3 StpGtZeroH3(StpH3 x) { return StpH3(StpGtZeroH1(x.r), StpGtZeroH1(x.g), StpGtZeroH1(x.b)); }
- StpH4 StpGtZeroH4(StpH4 x) { return StpH4(StpGtZeroH1(x.r), StpGtZeroH1(x.g),
- StpGtZeroH1(x.b), StpGtZeroH1(x.a)); }
- StpH1 StpSignedH1(StpH1 x) { return (x < StpH1_(0.0)) ? StpH1_(1.0) : StpH1_(0.0); }
- StpH2 StpSignedH2(StpH2 x) { return StpH2(StpSignedH1(x.r), StpSignedH1(x.g)); }
- StpH3 StpSignedH3(StpH3 x) { return StpH3(StpSignedH1(x.r), StpSignedH1(x.g), StpSignedH1(x.b)); }
- StpH4 StpSignedH4(StpH4 x) { return StpH4(StpSignedH1(x.r), StpSignedH1(x.g),
- StpSignedH1(x.b), StpSignedH1(x.a)); }
- #elif STP_BUG_SAT // Multiply by +/-INF, with the clamp to [0, 1] spelled as max(min(v, 1.0), 0.0) instead of StpSatH*.
- StpH1 StpGtZeroH1(StpH1 x) { return max(min(x * StpH1_(STP_INFP_H), StpH1_(1.0)), StpH1_(0.0)); }
- StpH2 StpGtZeroH2(StpH2 x) { return max(min(x * StpH2_(STP_INFP_H), StpH2_(1.0)), StpH2_(0.0)); }
- StpH3 StpGtZeroH3(StpH3 x) { return max(min(x * StpH3_(STP_INFP_H), StpH3_(1.0)), StpH3_(0.0)); }
- StpH4 StpGtZeroH4(StpH4 x) { return max(min(x * StpH4_(STP_INFP_H), StpH4_(1.0)), StpH4_(0.0)); }
- StpH1 StpSignedH1(StpH1 x) { return max(min(x * StpH1_(STP_INFN_H), StpH1_(1.0)), StpH1_(0.0)); }
- StpH2 StpSignedH2(StpH2 x) { return max(min(x * StpH2_(STP_INFN_H), StpH2_(1.0)), StpH2_(0.0)); }
- StpH3 StpSignedH3(StpH3 x) { return max(min(x * StpH3_(STP_INFN_H), StpH3_(1.0)), StpH3_(0.0)); }
- StpH4 StpSignedH4(StpH4 x) { return max(min(x * StpH4_(STP_INFN_H), StpH4_(1.0)), StpH4_(0.0)); }
- #else // Fast path: multiply by +INF (GtZero) or -INF (Signed), then saturate through StpSatH*.
- StpH1 StpGtZeroH1(StpH1 x) { return StpSatH1(x * StpH1_(STP_INFP_H)); }
- StpH2 StpGtZeroH2(StpH2 x) { return StpSatH2(x * StpH2_(STP_INFP_H)); }
- StpH3 StpGtZeroH3(StpH3 x) { return StpSatH3(x * StpH3_(STP_INFP_H)); }
- StpH4 StpGtZeroH4(StpH4 x) { return StpSatH4(x * StpH4_(STP_INFP_H)); }
- StpH1 StpSignedH1(StpH1 x) { return StpSatH1(x * StpH1_(STP_INFN_H)); }
- StpH2 StpSignedH2(StpH2 x) { return StpSatH2(x * StpH2_(STP_INFN_H)); }
- StpH3 StpSignedH3(StpH3 x) { return StpSatH3(x * StpH3_(STP_INFN_H)); }
- StpH4 StpSignedH4(StpH4 x) { return StpSatH4(x * StpH4_(STP_INFN_H)); }
- #endif // STP_BUG_SAT_INF
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_PRX // Non-zero STP_BUG_PRX: skip the bit-trick approximation and use the exact sqrt().
- StpH1 StpPrxLoSqrtH1(StpH1 a) { return sqrt(a); }
- StpH3 StpPrxLoSqrtH3(StpH3 a) { return sqrt(a); }
- StpH4 StpPrxLoSqrtH4(StpH4 a) { return sqrt(a); }
- #else // Low precision sqrt: shift the raw 16-bit pattern right by one, then add the constant 0x1de2 (integer math only).
- StpH1 StpPrxLoSqrtH1(StpH1 a) { return StpH1_W1((StpW1_H1(a) >> StpW1_(1)) + StpW1_(0x1de2)); }
- StpH3 StpPrxLoSqrtH3(StpH3 a) { return StpH3_W3((StpW3_H3(a) >> StpW3_(1)) + StpW3_(0x1de2)); }
- StpH4 StpPrxLoSqrtH4(StpH4 a) { return StpH4_W4((StpW4_H4(a) >> StpW4_(1)) + StpW4_(0x1de2)); }
- #endif // STP_BUG_PRX
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_PRX // Non-zero STP_BUG_PRX: every 'approximate' reciprocal forwards to the exact StpRcpH*.
- StpH1 StpPrxLoRcpH1(StpH1 a) { return StpRcpH1(a); }
- StpH2 StpPrxLoRcpH2(StpH2 a) { return StpRcpH2(a); }
- StpH3 StpPrxLoRcpH3(StpH3 a) { return StpRcpH3(a); }
- StpH4 StpPrxLoRcpH4(StpH4 a) { return StpRcpH4(a); }
- StpH1 StpPrxMedRcpH1(StpH1 a) { return StpRcpH1(a); }
- StpH3 StpPrxMedRcpH3(StpH3 a) { return StpRcpH3(a); }
- #else
- // Lo: one integer subtract on the 16-bit pattern (constant 0x7784 minus the input bits). Note this will create denormals.
- // MAPPING (input value, with its bit pattern in parentheses, to the approximated result)
- // -------
- // +INF (7c00) -> -61568
- // 65504 (7bff) -> -61600
- // 30800 (7785) -> NaN
- // 30784 (7784) -> 0 ........ (any input larger than 30784 will break)
- // 1 (3c00) -> 0.9395 ... (so not energy preserving for 1.0)
- // 0 (0000) -> 30784
- StpH1 StpPrxLoRcpH1(StpH1 a) { return StpH1_W1(StpW1_(0x7784) - StpW1_H1(a)); }
- StpH2 StpPrxLoRcpH2(StpH2 a) { return StpH2_W2(StpW2_(0x7784) - StpW2_H2(a)); }
- StpH3 StpPrxLoRcpH3(StpH3 a) { return StpH3_W3(StpW3_(0x7784) - StpW3_H3(a)); }
- StpH4 StpPrxLoRcpH4(StpH4 a) { return StpH4_W4(StpW4_(0x7784) - StpW4_H4(a)); }
- // Med: bit-trick estimate 'b' refined by one Newton-Raphson step, b * (2 - a * b). Inputs larger than 30928 will break.
- StpH1 StpPrxMedRcpH1(StpH1 a) { StpH1 b = StpH1_W1(StpW1_(0x778d) - StpW1_H1(a));
- return b * (-b * a + StpH1_(2.0)); }
- StpH3 StpPrxMedRcpH3(StpH3 a) { StpH3 b = StpH3_W3(StpW3_(0x778d) - StpW3_H3(a));
- return b * (-b * a + StpH3_(2.0)); }
- #endif // STP_BUG_PRX
- #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // LANE REMAPPING
- //==============================================================================================================================
- #if defined(STP_GPU)
- // More complex remap which is safe for both portability (different wave sizes up to 128) and for 2D wave reductions.
- // Bit index of the lane number 'a', and which output coordinate each bit feeds:
- // 6543210
- // =======
- // ..xx..x
- // yy..yy.
- // Details,
- // LANE TO 8x16 MAPPING (lane numbers in hex, laid out at their resulting x (column) and y (row) position)
- // ====================
- // 00 01 08 09 10 11 18 19
- // 02 03 0a 0b 12 13 1a 1b
- // 04 05 0c 0d 14 15 1c 1d
- // 06 07 0e 0f 16 17 1e 1f
- // 20 21 28 29 30 31 38 39
- // 22 23 2a 2b 32 33 3a 3b
- // 24 25 2c 2d 34 35 3c 3d
- // 26 27 2e 2f 36 37 3e 3f
- // .......................
- // ... repeat the 8x8 ....
- // .... pattern, but .....
- // .... for 40 to 7f .....
- // .......................
- StpU2 StpRmp8x16U2(StpU1 a) {
- // Note the BFIs used for MSBs have "strange offsets" due to leaving space for the LSB bits replaced in the BFI.
- return StpU2(StpBfiMskU1(StpBfeU1(a, 2u, 3u), a, 1u), // x: bits {0, 3, 4} of 'a', per the diagram above.
- StpBfiMskU1(StpBfeU1(a, 3u, 4u), StpBfeU1(a, 1u, 2u), 2u)); } // y: bits {1, 2, 5, 6} of 'a', per the diagram above.
- #endif // defined(STP_GPU)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // PRESETS (DON'T CHANGE)
- //==============================================================================================================================
- // High-end mobile preset, active when STP_TAA_Q evaluates to 0.
- #if (STP_TAA_Q == 0)
- #define STP_GEAA_P 1
- #define STP_GEAA_SUBPIX (2.0 / 16.0)
- #define STP_TAA_PEN_F1 (1.0 / 4.0)
- #define STP_TAA_PEN_F0 (1.0 / 2.0)
- #define STP_TAA_PEN_W (1.0 / 2.0)
- #define STP_TAA_PRX_LANCZOS 1
- #define STP_TAA_PRX_LANCZOS_DERING 0
- #endif // (STP_TAA_Q == 0)
- //------------------------------------------------------------------------------------------------------------------------------
- // Desktop preset, active when STP_TAA_Q evaluates to 1. Differs from the mobile preset in STP_GEAA_P and the LANCZOS settings.
- #if (STP_TAA_Q == 1)
- #define STP_GEAA_P 3
- #define STP_GEAA_SUBPIX (2.0 / 16.0)
- #define STP_TAA_PEN_F1 (1.0 / 4.0)
- #define STP_TAA_PEN_F0 (1.0 / 2.0)
- #define STP_TAA_PEN_W (1.0 / 2.0)
- #define STP_TAA_PRX_LANCZOS 2
- #define STP_TAA_PRX_LANCZOS_DERING 1
- #endif // (STP_TAA_Q == 1)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // INTERNAL TUNING (DON'T CHANGE)
- //==============================================================================================================================
- // Limits on anti-flicker weighting, tuned for the range and precision challenges of FP16.
- #define STP_ANTI_MAX 8192.0
- // Using '1/8192' causes known problems on some platforms that are 16-bit precision challenged, hence '1/4096'.
- #define STP_ANTI_MIN (1.0 / 4096.0)
- //------------------------------------------------------------------------------------------------------------------------------
- #define STP_DITHER_DEPTH 1 // NOTE(review): presumably 0/1 switches for dithering of depth and motion - confirm at use sites.
- #define STP_DITHER_MOTION 1
- //------------------------------------------------------------------------------------------------------------------------------
- // Ratios for luma in a gamma space, using BT.709 luma.
- #define STP_LUMA_R 0.2126
- #define STP_LUMA_G 0.7152
- #define STP_LUMA_B 0.0722
- #define STP_LUMA STP_LUMA_R, STP_LUMA_G, STP_LUMA_B // Expands to the three ratios as a comma separated list.
- //------------------------------------------------------------------------------------------------------------------------------
- // Maximum frames of feedback.
- #define STP_FRAME_MAX 32.0
- //------------------------------------------------------------------------------------------------------------------------------
- // Control the min (motion match), and max (no motion match), in units of pixels.
- // Settings of {max=1.0} won't work for 8x area scaling (trailing edge smears).
- // Setting too tight won't have enough slop for motion matching (motion match easily fails, leading to loss of detail).
- // If STP_PAT_MOT_MAX is too big, it will look like edges expand (or float) during change of motion.
- #define STP_PAT_MOT_MIN (1.0 / 16.0)
- #define STP_PAT_MOT_MAX (1.0 / 8.0)
- // Computed constants: ADD is MIN squared, AMP is 1 / (MAX squared - MIN squared).
- #define STP_PAT_MOT_ADD (STP_PAT_MOT_MIN * STP_PAT_MOT_MIN)
- #define STP_PAT_MOT_AMP (1.0 / (STP_PAT_MOT_MAX * STP_PAT_MOT_MAX - STP_PAT_MOT_ADD))
- //------------------------------------------------------------------------------------------------------------------------------
- // Larger numbers ghost more, smaller numbers flicker more.
- #define STP_PAT_DEMOIRE 64.0
- // Increase for less ghosting, decrease for more ghosting.
- #define STP_PAT_SENSITIVITY (2.0 / 16.0)
- // Amount to scale up sensitivity on responsive. Lower numbers ghost more, higher numbers flicker more.
- #define STP_PAT_RESPONSIVE 16.0
- // Minimum neighborhood (defaults to 1/32 of maximum value of neighborhood to allow some noise).
- #define STP_PAT_NE_MIN (1.0 / 32.0)
- //------------------------------------------------------------------------------------------------------------------------------
- // {0} = default lowest dilation (higher chance of slight trailing ghost, but less overall flicker)
- // {1} = expand a little (higher cost)
- // {2} = expand by too much (a lot more cost, more flicker, perhaps less trailing ghost)
- // In practice it's dilation and motion match threshold (PAT_MOT) which results in the final {flicker, ghost} tradeoff.
- #define STP_SAFE_DILATE 1
- //------------------------------------------------------------------------------------------------------------------------------
- // Adjusts the point at which spatial-only weights blend up and anti-flicker fully takes over.
- #define STP_TAA_SAA (1.0 / 2.0)
- // De-weight pixel contribution for chopped corner.
- #define STP_TAA_TRI_MASK_AVOID (1.0 / 8192.0)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // JITTER LOCATIONS
- //------------------------------------------------------------------------------------------------------------------------------
- // STP is now using Halton(2,3).
- //==============================================================================================================================
- // Generate jitter amount given frame index.
- STP_STATIC void StpJit(StpOutF2 p, StpU1 frame) {
- // Placeholder: Unity does not call this function, so both outputs are always zero and 'frame' is ignored.
- // TODO: If this is ever used, the real jitter implementation should be added here.
- p[1] = StpF1_(0.0);
- p[0] = StpF1_(0.0); }
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // PARABOLIC {SIN,COS}
- //==============================================================================================================================
- #if defined(STP_GPU)
- // Input is {-1 to 1} representing {0 to 2 pi}, output is {-1/4 to 1/4} representing {-1 to 1}.
- void StpPSinF2(inout StpF2 p) {
- // Parabolic approximation 'p*|p| - p', evaluated per component and written back in place.
- StpF2 mag = abs(p);
- p = p * mag - p; }
- // This is used to dither position of gather4 fetch for nearest motion vector to remove nearest artifacts when scaling.
- // Input 'p.x' is {0 to 1} representing {0 to 2 pi}, output is {-1/4 to 1/4} representing {-1 to 1}.
- void StpPSinCosF(inout StpF2 p) {
- // Second lane gets the phase of 'p.x' advanced by a quarter turn (passed through StpFractF1()).
- StpF1 quarter = StpFractF1(p.x + StpF1_(0.25));
- p.y = quarter;
- // Remap both lanes with 'p*2-1' to reach the {-1 to 1} input range of StpPSinF2().
- p = p * StpF2_(2.0) - StpF2_(1.0);
- StpPSinF2(p); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpPSinMF2(inout StpMF2 p) {
- // Same parabolic approximation 'p*|p| - p' as StpPSinF2(), on the StpMF2 type.
- StpMF2 mag = abs(p);
- p = p * mag - p; }
- void StpPSinCosMF(inout StpMF2 p) {
- // Second lane gets the phase of 'p.x' advanced by a quarter turn (passed through StpFractMF1()).
- StpMF1 quarter = StpFractMF1(p.x + StpMF1_(0.25));
- p.y = quarter;
- // Remap both lanes with 'p*2-1' to reach the {-1 to 1} input range of StpPSinMF2().
- p = p * StpMF2_(2.0) - StpMF2_(1.0);
- StpPSinMF2(p); }
- #endif // defined(STP_GPU)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_16BIT)
- void StpPSinH2(inout StpH2 p) {
- // Same parabolic approximation 'p*|p| - p' as StpPSinF2(), on the StpH2 type.
- StpH2 mag = abs(p);
- p = p * mag - p; }
- void StpPSinCosH(inout StpH2 p) {
- // Second lane gets the phase of 'p.x' advanced by a quarter turn (passed through StpFractH1()).
- StpH1 quarter = StpFractH1(p.x + StpH1_(0.25));
- p.y = quarter;
- // Remap both lanes with 'p*2-1' to reach the {-1 to 1} input range of StpPSinH2().
- p = p * StpH2_(2.0) - StpH2_(1.0);
- StpPSinH2(p); }
- #endif // defined(STP_GPU) && defined(STP_16BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // DEPTH ENCODING
- //------------------------------------------------------------------------------------------------------------------------------
- // Using a log2() based encoding, takes {0 to inf} to {0 to 1}.
- // log2(k.x*z)*k.y
- // Where
- // k.x = 1/near ............ (so that k.x*z is 1 when z=near)
- // k.y = 1/log2(k.x*far) ... (so that output is {0 to 1} ranged)
- //------------------------------------------------------------------------------------------------------------------------------
- // And the inverse
- // exp2(x*k.x)*k.y
- // Where
- // k.x = log2(far/near)
- // k.y = near
- //==============================================================================================================================
- #if defined(STP_GPU)
- // Build the constants, based on near and far planes.
- // The 'far' is where anything more distant clamps to 1.0.
- StpF2 StpZCon(StpF1 near, StpF1 far) {
- // Returns the StpZPack() constants: {x = 1/near, y = 1/log2(far/near)}.
- StpF1 rcpNear = StpRcpF1(near);
- StpF1 rcpLogRange = StpRcpF1(log2(rcpNear * far));
- return StpF2(rcpNear, rcpLogRange); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Where 'k' is generated by StpZCon().
- StpF1 StpZPack(StpF1 z, StpF2 k, StpF1 dit) {
- // Log2 encoded depth prior to saturation, shared by both dither modes.
- StpF1 enc = log2(k.x * z) * k.y;
- #if (STP_DITHER_DEPTH == 0)
- // No dither, 'dit' is unused.
- return StpSatF1(enc);
- #endif // (STP_DITHER_DEPTH == 0)
- #if (STP_DITHER_DEPTH == 1)
- // Fast linearly incorrect dither for 10-bit: offsets by 'dit/1024 - 0.5/1024' before saturating.
- return StpSatF1(enc + dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0));
- #endif // (STP_DITHER_DEPTH == 1)
- }
- //==============================================================================================================================
- // Build the constants, based on near and far planes.
- // The 'far' is where anything more distant clamps to 1.0.
- StpF2 StpZUnCon(StpF1 near, StpF1 far) {
- // Returns the StpZUnpack() constants: {x = log2(far/near), y = near}.
- StpF1 logRange = log2(far * StpRcpF1(near));
- return StpF2(logRange, near); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Where 'k' is generated by StpZUnCon().
- StpF1 StpZUnpack(StpF1 x, StpF2 k) {
- // Inverse of the log2 encoding: 'exp2(x*k.x)*k.y'.
- StpF1 ratio = exp2(x * k.x);
- return ratio * k.y; }
- #endif // defined(STP_GPU)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // STATIC GEOMETRY MOTION FORWARD PROJECTION
- //==============================================================================================================================
- // This is a separate section simply for documentation.
- // This logic must be computed in 32-bit precision (in theory).
- //------------------------------------------------------------------------------------------------------------------------------
- // MOTION MATCH NOTES
- // ==================
- // - The 'position - motion' is the reprojected position.
- // - Where {0 to 1} is no motion to a screen in motion.
- // - Motion check works with a differential vector '((motionPrior - motionCurrent) * kC)'.
- // - For static forward projection it will be '((motionPrior*0.5 - motionCurrent) * kC)'.
- // - Due to motionPrior being in {-1 to 1} NDC instead of {0 to 1} for screen.
- // - Working with motion vector differences to avoid complexity with jitter.
- //------------------------------------------------------------------------------------------------------------------------------
- // MOTION VECTOR NOTES
- // ===================
- // - 'reprojection = position - motion'
- // - 'reprojection + motion = position'
- // - 'motion = position - reprojection'
- // - So motion points forward.
- //------------------------------------------------------------------------------------------------------------------------------
- // FORWARD PROJECTION LOGIC
- // ========================
- // HAVE INPUT {0 TO 1} SCREEN POSITION
- // xy
- // GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END]
- // x=x*2-1
- // y=y*2-1
- // HAVE INPUT {0 TO INF} DEPTH
- // z
- // GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA]
- // xx=x*((z*g+h)/a) ... xx=x*(z*(g/a)+(h/a)) ... xx=x*(z*k0+k1)
- // yy=y*((z*g+h)/b) ... yy=y*(z*(g/b)+(h/b)) ... yy=y*(z*k2+k3)
- // TRANSFORM TO NEW VIEW
- // xxx=xx*i+yy*j+z*k+l
- // yyy=xx*m+yy*n+z*o+p
- // zzz=xx*q+yy*r+z*s+t
- // PROJECTION [9 FMA]
- // xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*k4+yy*k5+z*k6+k7
- // yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*k8+yy*k9+z*kA+kB
- // wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kC+yy*kD+z*kE+kF
- // PERSPECTIVE DIVIDE [1 RCP]
- // xxxxx=xxxx/wwww
- // yyyyy=yyyy/wwww
- // SUBTRACT TO GET 2X MOTION [2 FMA]
- // u=xxxxx-x ... u=xxxx*(1/wwww)-x
- // v=yyyyy-y ... v=yyyy*(1/wwww)-y
- // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
- // k0=g/a ... Constants {a,b,c,d,g,h} for prior projection
- // k1=h/a
- // k2=g/b
- // k3=h/b
- // k4=i*a ... Constants {a,b,c,d,g,h} for next projection
- // k5=j*a
- // k6=k*a
- // k7=l*a
- // k8=m*b
- // k9=n*b
- // kA=o*b
- // kB=p*b
- // kC=q*g
- // kD=r*g
- // kE=s*g
- // kF=t*g+h
- //------------------------------------------------------------------------------------------------------------------------------
- // BACKWARD PROJECTION LOGIC
- // =========================
- // This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants.
- // TRANSFORM TO NEW VIEW
- // xxx=xx*i+yy*j+z*k+l
- // yyy=xx*m+yy*n+z*o+p
- // zzz=xx*q+yy*r+z*s+t
- // PROJECTION [9 FMA]
- // xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*kG+yy*kH+z*kI+kJ
- // yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*kK+yy*kL+z*kM+kN
- // wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kO+yy*kP+z*kQ+kR
- // PERSPECTIVE DIVIDE [1 RCP]
- // xxxxx=xxxx/wwww
- // yyyyy=yyyy/wwww
- // SUBTRACT TO GET 2X MOTION [2 FMA]
- // u=xxxxx-x ... u=xxxx*(1/wwww)-x
- // v=yyyyy-y ... v=yyyy*(1/wwww)-y
- // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
- // kG=i*a ... Constants {a,b,c,d,g,h} for previous prior projection, and {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
- // kH=j*a
- // kI=k*a
- // kJ=l*a
- // kK=m*b
- // kL=n*b
- // kM=o*b
- // kN=p*b
- // kO=q*g
- // kP=r*g
- // kQ=s*g
- // kR=t*g+h
- //==============================================================================================================================
- // GET FROM {0 TO 1} TO {-1 TO 1}
- // ==============================
- // - Get to NDC for {x,y}
- // X:=x*2-1
- // Y:=y*2-1
- //------------------------------------------------------------------------------------------------------------------------------
- // FORWARD VIEW
- // ============
- // - Using 12 values
- // X:=x*i+y*j+z*k+l
- // Y:=x*m+y*n+z*o+p
- // Z:=x*q+y*r+z*s+t
- // W:=1
- // i j k l
- // m n o p
- // q r s t
- // 0 0 0 1
- //------------------------------------------------------------------------------------------------------------------------------
- // PROJECTIONS
- // ===========
- // - INPUTS
- // n ... near plane z
- // f ... far plane z
- // - DX ORTHO PROJECTION
- // c:=1/(f-n)
- // d:=-n/(f-n)
- // X:=x*a
- // Y:=y*b
- // Z:=z*c+d ... (w=1 on input)
- // W:=1
- // a 0 0 0
- // 0 b 0 0
- // 0 0 c d
- // 0 0 0 1
- // - DX PERSPECTIVE PROJECTION (LEFT HANDED)
- // c:=f/(f-n)
- // d:=-(f*n)/(f-n)
- // X:=x*a
- // Y:=y*b
- // Z:=z*c+d ... (w=1 on input)
- // W:=z
- // a 0 0 0
- // 0 b 0 0
- // 0 0 c d
- // 0 0 1 0 ... (note DX allows the 1 to be non-one)
- // - DX PERSPECTIVE PROJECTION REVERSED FOR BETTER PRECISION (LEFT HANDED)
- // c:=-n/(f-n)
- // d:=(f*n)/(f-n)
- // X:=x*a
- // Y:=y*b
- // Z:=z*c+d ... (w=1 on input)
- // W:=z
- // a 0 0 0
- // 0 b 0 0
- // 0 0 c d
- // 0 0 1 0
- // - DX PERSPECTIVE PROJECTION REVERSED WITH INF FAR (LEFT HANDED)
- // X:=x*a
- // Y:=y*b
- // Z:=n ... (w=1 on input)
- // W:=z
- // a 0 0 0
- // 0 b 0 0
- // 0 0 0 n
- // 0 0 1 0
- // - GL PERSPECTIVE PROJECTION
- // c:=-(f+n)/(f-n)
- // d:=-(2fn)/(f-n)
- // X:=x*a
- // Y:=y*b
- // Z:=z*c+d ... (w=1 on input)
- // W:=z
- // a 0 0 0
- // 0 b 0 0
- // 0 0 c d
- // 0 0 -1 0
- // - GENERALIZED (WILL DO ANYTHING)
- // X:=x*a
- // Y:=y*b
- // Z:=z*c+d ... (w=1 on input)
- // W:=z*g+h
- // a 0 0 0
- // 0 b 0 0
- // 0 0 c d
- // 0 0 g h
- //------------------------------------------------------------------------------------------------------------------------------
- // PROJECTED TO NDC
- // ================
- // - Ignoring viewport transform
- // X:=x/w
- // Y:=y/w
- // Z:=z/w
- // W:=1/w
- // - Inverse
- // x=X*w
- // y=Y*w
- //==============================================================================================================================
- // MODIFICATIONS FOR COMPLEX PROJECTIONS
- //------------------------------------------------------------------------------------------------------------------------------
- // Since this worked out to just 2 more FMAs and 2 more constants, decided not to create a shader permutation.
- //==============================================================================================================================
- // COMPLEX PROJECTION
- // ==================
- // - GL PERSPECTIVE PROJECTION - WITH Z BASED {X,Y} MODIFICATIONS
- // c:=-(F+N)/(F-N)
- // d:=-(2FN)/(F-N)
- // X:=x*a + z*e
- // Y:=y*b + z*f
- // Z:=z*c+d ... (w=1 on input)
- // W:=z
- // a 0 e 0
- // 0 b f 0
- // 0 0 c d
- // 0 0 -1 0
- // - GENERALIZED (WILL DO ANYTHING) - WITH Z BASED {X,Y} MODIFICATIONS
- // X:=x*a + z*e
- // Y:=y*b + z*f
- // Z:=z*c+d ... (w=1 on input)
- // W:=z*g+h
- // a 0 e 0
- // 0 b f 0
- // 0 0 c d
- // 0 0 g h
- // - INVERSE GIVEN 'z'
- // X:=x*a + z*e
- // Y:=y*b + z*f
- // X - z*e:=x*a
- // Y - z*f:=y*b
- // X/a - z*e/a:=x
- // Y/b - z*f/b:=y
- //------------------------------------------------------------------------------------------------------------------------------
- // FORWARD PROJECTION LOGIC
- // ========================
- // HAVE INPUT {0 TO 1} SCREEN POSITION
- // xy
- // GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END]
- // x=x*2-1
- // y=y*2-1
- // HAVE INPUT {0 TO INF} DEPTH
- // z
- // GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA]
- // ... have {X,Y,z}
- // ... xx=(x*(z*g+h))*(1/a) + z*(e/a)
- // ... yy=(y*(z*g+h))*(1/b) + z*(f/b)
- // ... xx=x*((z*g+h)/a) + z*(e/a)
- // ... yy=y*((z*g+h)/b) + z*(f/b)
- // ... xx=x*(z*(g/a)+(h/a)) + z*(e/a)
- // ... yy=y*(z*(g/b)+(h/b)) + z*(f/b)
- // xx=x*(z*k0+k1)+z*k2
- // yy=y*(z*k3+k4)+z*k5
- // TRANSFORM TO NEW VIEW
- // xxx=xx*i+yy*j+z*k+l
- // yyy=xx*m+yy*n+z*o+p
- // zzz=xx*q+yy*r+z*s+t
- // PROJECTION [9 FMA]
- // xxxx=xxx*a+zzz*e
- // ... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e)
- // ... xxxx=xx*k6+yy*k7+z*k8+k9
- // yyyy=yyy*b+zzz*f
- // ... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f)
- // ... yyyy=xx*kA+yy*kB+z*kC+kD
- // wwww=zzz*g+h
- // ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h)
- // ... wwww=xx*kE+yy*kF+z*kG+kH
- // PERSPECTIVE DIVIDE [1 RCP]
- // xxxxx=xxxx/wwww
- // yyyyy=yyyy/wwww
- // SUBTRACT TO GET 2X MOTION [2 FMA]
- // u=xxxxx-x ... u=xxxx*(1/wwww)-x
- // v=yyyyy-y ... v=yyyy*(1/wwww)-y
- // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
- // k0=g/a ... Constants {a,b,c,d,e,f,g,h} for prior projection
- // k1=h/a
- // k2=e/a
- // k3=g/b
- // k4=h/b
- // k5=f/b
- // k6=(i*a)+(q*e) ... Constants {a,b,c,d,e,f,g,h} for next projection
- // k7=(j*a)+(r*e)
- // k8=(k*a)+(s*e)
- // k9=(l*a)+(t*e)
- // kA=(m*b)+(q*f)
- // kB=(n*b)+(r*f)
- // kC=(o*b)+(s*f)
- // kD=(p*b)+(t*f)
- // kE=q*g
- // kF=r*g
- // kG=s*g
- // kH=t*g+h
- //------------------------------------------------------------------------------------------------------------------------------
- // BACKWARD PROJECTION LOGIC
- // =========================
- // This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants.
- // TRANSFORM TO NEW VIEW
- // xxx=xx*i+yy*j+z*k+l
- // yyy=xx*m+yy*n+z*o+p
- // zzz=xx*q+yy*r+z*s+t
- // PROJECTION [9 FMA]
- // xxxx=xxx*a+zzz*e
- // ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e)
- // ..... xxxx=xx*kI+yy*kJ+z*kK+kL
- // yyyy=yyy*b+zzz*f
- // ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f)
- // ..... yyyy=xx*kM+yy*kN+z*kO+kP
- // wwww=zzz*g+h
- // ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h)
- // ... wwww=xx*kQ+yy*kR+z*kS+kT
- // PERSPECTIVE DIVIDE [1 RCP]
- // xxxxx=xxxx/wwww
- // yyyyy=yyyy/wwww
- // SUBTRACT TO GET 2X MOTION [2 FMA]
- // u=xxxxx-x ... u=xxxx*(1/wwww)-x
- // v=yyyyy-y ... v=yyyy*(1/wwww)-y
- // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
- // ... Constants {a,b,c,d,e,f,g,h} for previous prior projection
- // ... Constants {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
- // kI=(i*a)+(q*e)
- // kJ=(j*a)+(r*e)
- // kK=(k*a)+(s*e)
- // kL=(l*a)+(t*e)
- // kM=(m*b)+(q*f)
- // kN=(n*b)+(r*f)
- // kO=(o*b)+(s*f)
- // kP=(p*b)+(t*f)
- // kQ=q*g
- // kR=r*g
- // kS=s*g
- // kT=t*g+h
- //==============================================================================================================================
- #if defined(STP_GPU)
- // Generates forward {-1 to 1} NDC forward projection vectors given (from prior frame),
- // p .... {0 to 1} screen position
- // z .... {0 to INF} depth
- // m .... {0 to 1} prior motion vector
- // The results are approximately corrected for dynamic motion.
- // This takes 'dynamicMotion = priorMotionVector - priorStaticGeometryBackprojection'
- // Then adds that estimate of dynamic motion to the static geometry forward projection vector.
- StpF2 StpFor(StpF2 p, StpF1 z, StpF2 m, StpF1 kMotionMatch,
- StpF4 k0123, StpF4 k4567, StpF4 k89AB, StpF4 kCDEF, StpF4 kGHIJ, StpF4 kKLMN, StpF4 kOPQR, StpF2 kST,
- out StpF2 bugF, out StpF2 bugD){
- // Implements the 'MODIFICATIONS FOR COMPLEX PROJECTIONS' logic described above in the comments (constants k0 to kT).
- p = p * StpF2_(2.0) - StpF2_(1.0); // {0 to 1} screen position to {-1 to 1} NDC.
- StpF2 q; // 3D view position {xx,yy}.
- q.x = p.x * (z * k0123.x + k0123.y) + (z * k0123.z); // xx=x*(z*k0+k1)+z*k2
- q.y = p.y * (z * k0123.w + k4567.x) + (z * k4567.y); // yy=y*(z*k3+k4)+z*k5
- StpF3 v; // Forward projection {xxxx,yyyy,wwww}.
- v.x = q.x * k4567.z + q.y * k4567.w + z * k89AB.x + k89AB.y; // xxxx=xx*k6+yy*k7+z*k8+k9
- v.y = q.x * k89AB.z + q.y * k89AB.w + z * kCDEF.x + kCDEF.y; // yyyy=xx*kA+yy*kB+z*kC+kD
- v.z = q.x * kCDEF.z + q.y * kCDEF.w + z * kGHIJ.x + kGHIJ.y; // wwww=xx*kE+yy*kF+z*kG+kH
- v.z = StpRcpF1(v.z); // 1/wwww for the perspective divide.
- StpF3 v2; // Backward projection {xxxx,yyyy,wwww}.
- v2.x = q.x * kGHIJ.z + q.y * kGHIJ.w + z * kKLMN.x + kKLMN.y; // xxxx=xx*kI+yy*kJ+z*kK+kL
- v2.y = q.x * kKLMN.z + q.y * kKLMN.w + z * kOPQR.x + kOPQR.y; // yyyy=xx*kM+yy*kN+z*kO+kP
- v2.z = q.x * kOPQR.z + q.y * kOPQR.w + z * kST.x + kST.y; // wwww=xx*kQ+yy*kR+z*kS+kT
- v2.z = StpRcpF1(v2.z); // 1/wwww for the perspective divide.
- // Motion vector points forward (to estimated position in next frame).
- // Negative motion vector points back to where the pixel was in the prior frame.
- // Motion vector is {0 to 1} for one screen, but this logic is {-1 to 1} based (hence a 2x scaling).
- bugF = (v.xy * StpF2_(v.z) - p); // Static forward estimate: forward projected NDC position minus current NDC position.
- bugD = ((StpF2_(2.0) * m) - (p - v2.xy * StpF2_(v2.z))) * StpF2_(kMotionMatch); // Dynamic estimate: 2x prior motion minus static backprojection, scaled by kMotionMatch.
- return bugF + bugD; }
- #endif // defined(STP_GPU)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // MOTION VECTOR ENCODING
- //------------------------------------------------------------------------------------------------------------------------------
- // {MSB 10-bit depth, LSB {11,11}-bit motion with sqrt() encoding}
- // Motion is encoded in sqrt() space.
- //------------------------------------------------------------------------------------------------------------------------------
- // 11111111111111110000000000000000
- // fedcba9876543210fedcba9876543210
- // ================================
- // zzzzzzzzzz...................... 10-bit encoded z
- // ..........yyyyyyyyyyy........... 11-bit {-1 to <1} y encoded in gamma 2.0 (sqrt)
- // .....................xxxxxxxxxxx 11-bit {-1 to <1} x encoded in gamma 2.0 (sqrt)
- //------------------------------------------------------------------------------------------------------------------------------
- // The 32-bit path is 8 ops to decode {x,y}.
- //------------------------------------------------------------------------------------------------------------------------------
- // There once was a 16-bit path which takes 6 ops to decode (bit extra because ABS isn't free).
- // hhhhhhhhhhhhhhhhllllllllllllllll
- // ================================
- // zzzzzzzzzzyyyyyyyyyyyxxxxxxxxxxx input
- // zzzzzyyyyyyyyyyyxxxxxxxxxxx00000 << 5
- // 00000yyyyyyyyyyyxxxxxxxxxxx00000 & 0x7FFFFFF
- // 00000yyyyyyyyyyy00000xxxxxxxxxxx >> 5 (for 16-bit LSB only)
- // This gets 11-bit integers which perfectly alias lowest non-denormal and denormals of FP16.
- // Can scale by '16384' and subtract 1 to decompress without a CVT.
- //==============================================================================================================================
- #if defined(STP_GPU)
- // The 'z' comes in {0 to 1}.
- // This depends on 'v' ranging inside and including {-1 to 1}.
- StpU1 StpMvPack(StpF1 z, StpF2 v, StpF1 dit) {
- // Magnitude in gamma 2.0 (sqrt) space, the sign of the input is re-applied afterwards.
- StpF2 g = sqrt(abs(v));
- #if STP_DITHER_MOTION
- // Offset the magnitude by 'dit/1024 - 0.5/1024' and saturate before the sign is restored.
- g = StpSatF2(g + StpF2_(dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0)));
- #endif
- v = StpCpySgnF2(g, v);
- // Limit to {-1024/1024 to 1023/1024} so the biased value below fits in 11 bits.
- v = min(v, StpF2_(1023.0/1024.0));
- // Encode to 11-bit with zero at center of one step.
- v = v * StpF2_(1024.0) + StpF2_(1024.0);
- // Pack as {z in bits 22 and up, y in bits 11 to 21, x in bits 0 to 10}.
- StpU1 bitsZ = StpU1(z * StpF1(1023.0)) << StpU1(22);
- StpU1 bitsY = StpU1(v.y) << StpU1(11);
- StpU1 bitsX = StpU1(v.x);
- return bitsZ + bitsY + bitsX; }
- //------------------------------------------------------------------------------------------------------------------------------
- // Unpacks all.
- void StpMvUnpack(out StpF1 z, out StpF2 v, StpU1 i) {
- // Depth field: 10 bits starting at bit 22, scaled by 1/1023.
- z = StpF1(StpBfeU1(i, 22u, 10u)) * StpF1_(1.0 / 1023.0);
- // Motion fields: 11 bits each, x starting at bit 0 and y starting at bit 11.
- StpF2 g = StpF2(StpF1(StpBfeU1(i, 0, 11u)), StpF1(StpBfeU1(i, 11u, 11u)));
- // Remove the bias applied by StpMvPack(), result is still in gamma 2.0 space.
- g = g * StpF2_(1.0 / 1024.0) + StpF2_(-1.0);
- // Signed square takes gamma 2.0 back to linear.
- v = g * abs(g); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Unpack just velocity.
- void StpMvUnpackV(out StpF2 v, StpU1 i) {
- // Motion fields: 11 bits each, x starting at bit 0 and y starting at bit 11 (depth bits are ignored).
- StpF2 g = StpF2(StpF1(StpBfeU1(i, 0, 11u)), StpF1(StpBfeU1(i, 11u, 11u)));
- // Remove the bias applied by StpMvPack(), result is still in gamma 2.0 space.
- g = g * StpF2_(1.0 / 1024.0) + StpF2_(-1.0);
- // Signed square takes gamma 2.0 back to linear.
- v = g * abs(g); }
- #endif // defined(STP_GPU)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // COLOR CONVERSION
- //==============================================================================================================================
- #if defined(STP_GPU)
- // Scaling in the reversible tonemapper (should be >= 1).
- // Getting too close to 1.0 will result in luma inversions in highly saturated content in the oldest algorithm.
- // Using 4.0 or ideally 8.0 is recommended.
- #define STP_SAT 4.0
- #endif // defined(STP_GPU)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_32BIT)
- void StpToneF1(inout StpF1 x) {
- // Scalar form of the tonemapper: 'x / (STP_SAT + x)', saturated.
- StpF1 rcpDenom = StpRcpF1(StpF1_(STP_SAT) + x);
- x = StpSatF1(x * StpF1_(rcpDenom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Reversible tonemapper.
- void StpToneF3(inout StpF3 x) {
- // All channels are divided by 'STP_SAT + max(r,g,b)', then saturated.
- StpF1 peak = StpMax3F1(x.r, x.g, x.b);
- StpF1 rcpDenom = StpRcpF1(StpF1_(STP_SAT) + peak);
- x = StpSatF3(x * StpF3_(rcpDenom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpToneInvF3(inout StpF3 x) {
- // Inverse of StpToneF3(): all channels are divided by '(1 - max(r,g,b)) / STP_SAT'.
- StpF1 peak = StpMax3F1(x.r, x.g, x.b);
- StpF1 denom = StpSatF1(StpF1_(1.0 / STP_SAT) - peak * StpF1_(1.0 / STP_SAT));
- // Floor the divisor at 1/16384, using 1/32768 causes problems in Unity with bloom on at least some platforms.
- // So output maximum is 16384 for StpToneInvF3().
- denom = max(StpF1_(1.0 / 16384.0), denom);
- x *= StpF3_(StpRcpF1(denom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- // This is currently unused but left in for reference.
- // Convert LDR RGB to Gamma 2.0 RGB {0 to 1}.
- // This is for storage to 8-bit.
- // This is temporal dithered.
- // Unoptimized logic (for reference).
- // StpF3 n = sqrt(c);
- // n = floor(n * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
- // StpF3 a = n * n;
- // StpF3 b = n + StpF3_(1.0 / 255.0); b = b * b;
- // // Ratio of 'a' to 'b' required to produce 'c'.
- // StpF3 r = (c - b) * StpRcpF3(a - b);
- // // Use the ratio as a cutoff to choose 'a' or 'b'.
- // c = StpSatF3(n + StpGtZeroF3(StpF3_(dit) - r) * StpF3_(1.0 / 255.0));
- // Optimized from 57 to 42 clks on GCN.
- StpF3 StpRgbGamDit8F3(StpF3 c, StpF1 dit) {
- // 'lo' is sqrt(c) floored to a 1/255 step, 'hi' is the next step up.
- StpF3 lo = floor(sqrt(c) * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
- StpF3 hi = lo + StpF3_(1.0 / 255.0);
- // Compare in linear (squared) space so the choice between 'lo' and 'hi' averages to 'c'.
- StpF3 loSq = lo * lo;
- StpF3 hiSq = hi * hi;
- // Per-channel step selector (StpGtZeroF3() presumably yields 1 where its input is above zero, else 0).
- StpF3 bump = StpGtZeroF3(StpF3_(dit) * (hiSq - loSq) - (hiSq - c));
- return StpSatF3(lo + bump * StpF3_(1.0 / 255.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- // This is currently unused but left in for reference.
- // Version for 10-bit for feedback.
- StpF3 StpRgbGamDit10F3(StpF3 c, StpF1 dit) {
- // 'lo' is sqrt(c) floored to a 1/1023 step, 'hi' is the next step up.
- StpF3 lo = floor(sqrt(c) * StpF3_(1023.0)) * StpF3_(1.0 / 1023.0);
- StpF3 hi = lo + StpF3_(1.0 / 1023.0);
- // Compare in linear (squared) space so the choice between 'lo' and 'hi' averages to 'c'.
- StpF3 loSq = lo * lo;
- StpF3 hiSq = hi * hi;
- // Per-channel step selector (StpGtZeroF3() presumably yields 1 where its input is above zero, else 0).
- StpF3 bump = StpGtZeroF3(StpF3_(dit) * (hiSq - loSq) - (hiSq - c));
- return StpSatF3(lo + bump * StpF3_(1.0 / 1023.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Can use this function to convert feedback back to color.
- void StpFeed2ClrF(inout StpF3 c) {
- // Square each channel (presumably undoing a gamma 2.0 encoding of feedback).
- c = c * c;
- #if (STP_POSTMAP == 0)
- // With STP_POSTMAP at 0 the reversible tonemapper is inverted as well.
- StpToneInvF3(c.rgb);
- #endif // (STP_POSTMAP == 0)
- }
- #endif // defined(STP_GPU) && defined(STP_32BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_32BIT)
- void StpToneMF1(inout StpMF1 x) {
- // Scalar form of the tonemapper: 'x / (STP_SAT + x)', saturated.
- StpMF1 rcpDenom = StpRcpMF1(StpMF1_(STP_SAT) + x);
- x = StpSatMF1(x * StpMF1_(rcpDenom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpToneMF3(inout StpMF3 x) {
- // All channels are divided by 'STP_SAT + max(r,g,b)', then saturated.
- StpMF1 peak = StpMax3MF1(x.r, x.g, x.b);
- StpMF1 rcpDenom = StpRcpMF1(StpMF1_(STP_SAT) + peak);
- x = StpSatMF3(x * StpMF3_(rcpDenom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpToneInvMF3(inout StpMF3 x) {
- // Inverse of StpToneMF3(): all channels are divided by '(1 - max(r,g,b)) / STP_SAT'.
- StpMF1 peak = StpMax3MF1(x.r, x.g, x.b);
- StpMF1 denom = StpSatMF1(StpMF1_(1.0 / STP_SAT) - peak * StpMF1_(1.0 / STP_SAT));
- // Floor the divisor at 1/16384 so the output maximum is 16384.
- denom = max(StpMF1_(1.0 / 16384.0), denom);
- x *= StpMF3_(StpRcpMF1(denom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF3 StpRgbGamDit8MF3(StpMF3 c, StpMF1 dit) {
- // 'lo' is sqrt(c) floored to a 1/255 step, 'hi' is the next step up.
- StpMF3 lo = floor(sqrt(c) * StpMF3_(255.0)) * StpMF3_(1.0 / 255.0);
- StpMF3 hi = lo + StpMF3_(1.0 / 255.0);
- // Compare in linear (squared) space so the choice between 'lo' and 'hi' averages to 'c'.
- StpMF3 loSq = lo * lo;
- StpMF3 hiSq = hi * hi;
- // Per-channel step selector (StpGtZeroMF3() presumably yields 1 where its input is above zero, else 0).
- StpMF3 bump = StpGtZeroMF3(StpMF3_(dit) * (hiSq - loSq) - (hiSq - c));
- return StpSatMF3(lo + bump * StpMF3_(1.0 / 255.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF3 StpRgbGamDit10MF3(StpMF3 c, StpMF1 dit) {
- // 'lo' is sqrt(c) floored to a 1/1023 step, 'hi' is the next step up.
- StpMF3 lo = floor(sqrt(c) * StpMF3_(1023.0)) * StpMF3_(1.0 / 1023.0);
- StpMF3 hi = lo + StpMF3_(1.0 / 1023.0);
- // Compare in linear (squared) space so the choice between 'lo' and 'hi' averages to 'c'.
- StpMF3 loSq = lo * lo;
- StpMF3 hiSq = hi * hi;
- // Per-channel step selector (StpGtZeroMF3() presumably yields 1 where its input is above zero, else 0).
- StpMF3 bump = StpGtZeroMF3(StpMF3_(dit) * (hiSq - loSq) - (hiSq - c));
- return StpSatMF3(lo + bump * StpMF3_(1.0 / 1023.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpFeed2ClrMF(inout StpMF3 c) {
- // Square each channel (presumably undoing a gamma 2.0 encoding of feedback).
- c = c * c;
- #if (STP_POSTMAP == 0)
- // With STP_POSTMAP at 0 the reversible tonemapper is inverted as well.
- StpToneInvMF3(c.rgb);
- #endif // (STP_POSTMAP == 0)
- }
- #endif // defined(STP_GPU) && defined(STP_32BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_16BIT)
- void StpToneH1(inout StpH1 x) {
- // Scalar form of the tonemapper: 'x / (STP_SAT + x)', saturated.
- StpH1 rcpDenom = StpRcpH1(StpH1_(STP_SAT) + x);
- x = StpSatH1(x * StpH1_(rcpDenom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpToneH3(inout StpH3 x) {
- // All channels are divided by 'STP_SAT + max(r,g,b)', then saturated.
- StpH1 peak = StpMax3H1(x.r, x.g, x.b);
- StpH1 rcpDenom = StpRcpH1(StpH1_(STP_SAT) + peak);
- x = StpSatH3(x * StpH3_(rcpDenom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpToneInvH3(inout StpH3 x) {
- // Inverse of StpToneH3(): all channels are divided by '(1 - max(r,g,b)) / STP_SAT'.
- StpH1 peak = StpMax3H1(x.r, x.g, x.b);
- StpH1 denom = StpSatH1(StpH1_(1.0 / STP_SAT) - peak * StpH1_(1.0 / STP_SAT));
- // Floor the divisor at 1/16384 so the output maximum is 16384.
- denom = max(StpH1_(1.0 / 16384.0), denom);
- x *= StpH3_(StpRcpH1(denom)); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpH3 StpRgbGamDit8H3(StpH3 c, StpH1 dit) {
- // 'lo' is sqrt(c) floored to a 1/255 step, 'hi' is the next step up.
- StpH3 lo = floor(sqrt(c) * StpH3_(255.0)) * StpH3_(1.0 / 255.0);
- StpH3 hi = lo + StpH3_(1.0 / 255.0);
- // Compare in linear (squared) space so the choice between 'lo' and 'hi' averages to 'c'.
- StpH3 loSq = lo * lo;
- StpH3 hiSq = hi * hi;
- // Per-channel step selector (StpGtZeroH3() presumably yields 1 where its input is above zero, else 0).
- StpH3 bump = StpGtZeroH3(StpH3_(dit) * (hiSq - loSq) - (hiSq - c));
- return StpSatH3(lo + bump * StpH3_(1.0 / 255.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpH3 StpRgbGamDit10H3(StpH3 c, StpH1 dit) {
- // 'lo' is sqrt(c) floored to a 1/1023 step, 'hi' is the next step up.
- StpH3 lo = floor(sqrt(c) * StpH3_(1023.0)) * StpH3_(1.0 / 1023.0);
- StpH3 hi = lo + StpH3_(1.0 / 1023.0);
- // Compare in linear (squared) space so the choice between 'lo' and 'hi' averages to 'c'.
- StpH3 loSq = lo * lo;
- StpH3 hiSq = hi * hi;
- // Per-channel step selector (StpGtZeroH3() presumably yields 1 where its input is above zero, else 0).
- StpH3 bump = StpGtZeroH3(StpH3_(dit) * (hiSq - loSq) - (hiSq - c));
- return StpSatH3(lo + bump * StpH3_(1.0 / 1023.0)); }
- //------------------------------------------------------------------------------------------------------------------------------
- void StpFeed2ClrH(inout StpH3 c) {
- // Square each channel (presumably undoing a gamma 2.0 encoding of feedback).
- c = c * c;
- #if (STP_POSTMAP == 0)
- // With STP_POSTMAP at 0 the reversible tonemapper is inverted as well.
- StpToneInvH3(c.rgb);
- #endif // (STP_POSTMAP == 0)
- }
- #endif // defined(STP_GPU) && defined(STP_16BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // COLOR CONVERSION TOOLS
- //------------------------------------------------------------------------------------------------------------------------------
- // Some platforms do not have a hardware sRGB image store (requires manual conversion).
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_32BIT)
- // Linear to sRGB encode (32-bit). Both segments of the curve are evaluated and a single clamp() is applied with the
- // cutoff constant as the value and the {linear segment, power segment} as the bounds.
- StpF3 StpLinearToSrgbF3(StpF3 c) {
- // {cutoff * 12.92, linear slope, 1 / gamma}.
- StpF3 j = StpF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4);
- // {power segment scale, power segment bias}.
- StpF2 k = StpF2(1.055, -0.055);
- StpF3 lin = c * j.yyy;
- StpF3 pw = pow(c, j.zzz) * k.xxx + k.yyy;
- return clamp(j.xxx, lin, pw); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Linear to sRGB encode (StpMF3 variant). Both segments of the curve are evaluated and a single clamp() is applied with
- // the cutoff constant as the value and the {linear segment, power segment} as the bounds.
- StpMF3 StpLinearToSrgbMF3(StpMF3 c) {
- // {cutoff * 12.92, linear slope, 1 / gamma}.
- StpMF3 j = StpMF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4);
- // {power segment scale, power segment bias}.
- StpMF2 k = StpMF2(1.055, -0.055);
- StpMF3 lin = c * j.yyy;
- StpMF3 pw = pow(c, j.zzz) * k.xxx + k.yyy;
- return clamp(j.xxx, lin, pw); }
- #endif // defined(STP_GPU) && defined(STP_32BIT)
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_16BIT)
- // Linear to sRGB encode (16-bit). Both segments of the curve are evaluated and a single clamp() is applied with the
- // cutoff constant as the value and the {linear segment, power segment} as the bounds.
- StpH3 StpLinearToSrgbH3(StpH3 c) {
- // {cutoff * 12.92, linear slope, 1 / gamma}.
- StpH3 j = StpH3(0.0031308 * 12.92, 12.92, 1.0 / 2.4);
- // {power segment scale, power segment bias}.
- StpH2 k = StpH2(1.055, -0.055);
- StpH3 lin = c * j.yyy;
- StpH3 pw = pow(c, j.zzz) * k.xxx + k.yyy;
- return clamp(j.xxx, lin, pw); }
- #endif // defined(STP_GPU) && defined(STP_16BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // DEBUG COMMON
- //==============================================================================================================================
- #if defined(STP_GPU) && STP_BUG
- void StpBugF(StpU3 p, StpF4 c); // Debug output hook, declared only. StpPatF() calls it as StpBugF(StpU3(pp, 0..5), color).
- #endif // defined(STP_GPU) && STP_BUG
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // CONSTANT GENERATION
- //==============================================================================================================================
- // Fill 'con0' for the DIL pass from the current image size: {4/width, 4/height, width/4, height/4}.
- STP_STATIC void StpDilCon(
- // Generated constants.
- StpInOutU4 con0,
- // Current image resolution in pixels.
- StpInF2 imgC) {
- // StpF2 kRcpR := 4/size of current input image in pixels.
- StpF1 kFour = StpF1_(4.0);
- con0[0] = StpU1_F1(kFour / imgC[0]);
- con0[1] = StpU1_F1(kFour / imgC[1]);
- // StpU2 kR := size/4 of the current input image in pixels (integer, computed with a shift right by 2).
- // Used for pass merging (DIL and SAA), since convergence is 1/16 area of input, must check position.
- StpU1 kShift = StpU1_(2);
- con0[2] = StpU1_(StpU1_(imgC[0]) >> kShift);
- con0[3] = StpU1_(StpU1_(imgC[1]) >> kShift); }
- //==============================================================================================================================
- STP_STATIC void StpPatCon( // Builds the packed constants for the pattern pass (names below match those decoded in StpPatF()).
- // Generated constants (outputs). Every word [0..3] of con0..conC is written below; con4[3] is written as 0 (unused).
- StpInOutU4 con0,
- StpInOutU4 con1,
- StpInOutU4 con2,
- StpInOutU4 con3,
- StpInOutU4 con4,
- StpInOutU4 con5,
- StpInOutU4 con6,
- StpInOutU4 con7,
- StpInOutU4 con8,
- StpInOutU4 con9,
- StpInOutU4 conA,
- StpInOutU4 conB,
- StpInOutU4 conC,
- // Linear depth near plane for log2 depth encoding.
- StpF1 near,
- // Linear depth far plane for log2 depth encoding.
- StpF1 far,
- // Frame count for current frame (sets jitter).
- StpU1 frame,
- // Current image resolution in pixels.
- StpInF2 imgC,
- // Prior image resolution in pixels.
- StpInF2 imgP,
- // Feedback (aka output) resolution in pixels.
- StpInF2 imgF,
- // Ratio of 'currentFrameTime/priorFrameTime'.
- StpF1 motionMatch,
- // Projection matrix data {a,b,c,d,e,f,g,h}.
- // This is used to do static geometry forward projection.
- // a 0 e 0
- // 0 b f 0
- // 0 0 c d
- // 0 0 g h
- // For reference, a DX ortho projection would be,
- // a 0 e 0
- // 0 b f 0
- // 0 0 c d
- // 0 0 0 1
- // And a DX, left handed perspective projection would be,
- // a 0 e 0
- // 0 b f 0
- // 0 0 c d ... c := F/(F-N), d := -(F*N)/(F-N)
- // 0 0 1 0
- // Previous prior projection.
- StpInF4 prjPrvABEF,
- StpInF4 prjPrvCDGH,
- // Prior projection.
- StpInF4 prjPriABEF,
- StpInF4 prjPriCDGH,
- // Current projection (the difference enables changing zoom).
- StpInF4 prjCurABEF,
- StpInF4 prjCurCDGH,
- // Forward viewspace transform.
- // Transform prior 3D view position into current 3D view position.
- // This is used to do static geometry forward projection.
- // X := x*i + y*j +z*k +l
- // Y := x*m + y*n +z*o +p
- // Z := x*q + y*r +z*s +t
- // W := 1
- // i j k l
- // m n o p
- // q r s t
- // 0 0 0 1
- StpInF4 forIJKL,
- StpInF4 forMNOP,
- StpInF4 forQRST,
- // Prior frame backward viewspace transform.
- // Transform prior 3D view position into previous-prior 3D view position.
- // This is used to 'fix' static geometry forward projection for dynamic motion.
- // X := x*i + y*j +z*k +l
- // Y := x*m + y*n +z*o +p
- // Z := x*q + y*r +z*s +t
- // W := 1
- // i j k l
- // m n o p
- // q r s t
- // 0 0 0 1
- StpInF4 bckIJKL,
- StpInF4 bckMNOP,
- StpInF4 bckQRST) {
- //------------------------------------------------------------------------------------------------------------------------------
- // StpF2 kRcpC := 1.0 / size of current input image in pixels.
- con0[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
- con0[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
- // StpF2 kHalfRcpC := 0.5 / size of current input image in pixels.
- con0[2] = StpU1_F1(StpF1_(0.5) / imgC[0]);
- con0[3] = StpU1_F1(StpF1_(0.5) / imgC[1]);
- //------------------------------------------------------------------------------------------------------------------------------
- // Grab jitter for current and prior frames (the prior frame uses 'frame - 1').
- StpVarF2 jitP;
- StpVarF2 jitC;
- StpJit(jitP, frame - StpU1_(1));
- StpJit(jitC, frame);
- // StpF2 kJitCRcpCUnjitPRcpP := Map current into prior frame.
- con1[0] = StpU1_F1(jitC[0] / imgC[0] - jitP[0] / imgP[0]);
- con1[1] = StpU1_F1(jitC[1] / imgC[1] - jitP[1] / imgP[1]);
- // StpF2 kJitCRcpC := Take {0 to 1} position in current image, and map back to {0 to 1} position in feedback (removes jitter).
- con1[2] = StpU1_F1(jitC[0] / imgC[0]);
- con1[3] = StpU1_F1(jitC[1] / imgC[1]);
- //------------------------------------------------------------------------------------------------------------------------------
- // StpF2 kF := size of feedback (aka output) in pixels.
- con2[0] = StpU1_F1(imgF[0]);
- con2[1] = StpU1_F1(imgF[1]);
- // StpF2 kDepth := Copied logic from StpZCon(), {1/near, 1/log2(far/near)}.
- StpF1 k0 = StpRcpF1(near);
- StpF1 k1 = StpRcpF1(StpLog2F1(k0 * far));
- con2[2] = StpU1_F1(k0);
- con2[3] = StpU1_F1(k1);
- //------------------------------------------------------------------------------------------------------------------------------
- // StpF4 kOS := Scale and bias to check for out of bounds (and kill feedback).
- // Scaled and biased output needs to {-1 out of bounds, >-1 in bounds, <1 in bounds, 1 out of bounds}.
- StpVarF2 s;
- // Undo 'pM' scaling, and multiply by 2 (as this needs to be -1 to 1 at edge of acceptable reprojection).
- s[0] = StpF1_(2.0);
- s[1] = StpF1_(2.0);
- // Scaling to push outside safe reprojection over 1.
- s[0] *= imgP[0] / (imgP[0] + StpF1_(4.0));
- s[1] *= imgP[1] / (imgP[1] + StpF1_(4.0));
- con3[0] = StpU1_F1(s[0]);
- con3[1] = StpU1_F1(s[1]);
- // Factor out subtracting off the mid point scaled by the multiply term.
- con3[2] = StpU1_F1(StpF1_(-0.5) * s[0]);
- con3[3] = StpU1_F1(StpF1_(-0.5) * s[1]);
- //------------------------------------------------------------------------------------------------------------------------------
- // StpF2 kUnDepth := Copied logic from StpZUnCon(), {log2(far/near), near}.
- con4[0] = StpU1_F1(StpLog2F1(far * StpRcpF1(near)));
- con4[1] = StpU1_F1(near);
- // kMotionMatch := ratio of 'currentFrameTime/priorFrameTime' (passed through unchanged).
- con4[2] = StpU1_F1(motionMatch);
- // Unused for now.
- con4[3] = StpU1_(0);
- //------------------------------------------------------------------------------------------------------------------------------
- // StpF2 kC := Size of current input image in pixels.
- con5[0] = StpU1_F1(imgC[0]);
- con5[1] = StpU1_F1(imgC[1]);
- // kST. NOTE(review): StpPatF() in this file reads kST from conD.xy, and this function has no conD output; confirm layout.
- con5[2] = StpU1_F1(bckQRST.z * prjPrvCDGH.z);
- con5[3] = StpU1_F1(bckQRST.w * prjPrvCDGH.z + prjPrvCDGH.w);
- //------------------------------------------------------------------------------------------------------------------------------
- // See header docs in "STATIC GEOMETRY MOTION FORWARD PROJECTION".
- // k0123
- con6[0] = StpU1_F1(prjPriCDGH.z / prjPriABEF.x);
- con6[1] = StpU1_F1(prjPriCDGH.w / prjPriABEF.x);
- con6[2] = StpU1_F1(prjPriABEF.z / prjPriABEF.x);
- con6[3] = StpU1_F1(prjPriCDGH.z / prjPriABEF.y);
- // k4567
- con7[0] = StpU1_F1(prjPriCDGH.w / prjPriABEF.y);
- con7[1] = StpU1_F1(prjPriABEF.w / prjPriABEF.y);
- con7[2] = StpU1_F1(forIJKL.x * prjCurABEF.x + forQRST.x * prjCurABEF.z);
- con7[3] = StpU1_F1(forIJKL.y * prjCurABEF.x + forQRST.y * prjCurABEF.z);
- // k89AB
- con8[0] = StpU1_F1(forIJKL.z * prjCurABEF.x + forQRST.z * prjCurABEF.z);
- con8[1] = StpU1_F1(forIJKL.w * prjCurABEF.x + forQRST.w * prjCurABEF.z);
- con8[2] = StpU1_F1(forMNOP.x * prjCurABEF.y + forQRST.x * prjCurABEF.w);
- con8[3] = StpU1_F1(forMNOP.y * prjCurABEF.y + forQRST.y * prjCurABEF.w);
- // kCDEF
- con9[0] = StpU1_F1(forMNOP.z * prjCurABEF.y + forQRST.z * prjCurABEF.w);
- con9[1] = StpU1_F1(forMNOP.w * prjCurABEF.y + forQRST.w * prjCurABEF.w);
- con9[2] = StpU1_F1(forQRST.x * prjCurCDGH.z);
- con9[3] = StpU1_F1(forQRST.y * prjCurCDGH.z);
- // kGHIJ
- conA[0] = StpU1_F1(forQRST.z * prjCurCDGH.z);
- conA[1] = StpU1_F1(forQRST.w * prjCurCDGH.z + prjCurCDGH.w);
- conA[2] = StpU1_F1(bckIJKL.x * prjPrvABEF.x + bckQRST.x * prjPrvABEF.z);
- conA[3] = StpU1_F1(bckIJKL.y * prjPrvABEF.x + bckQRST.y * prjPrvABEF.z);
- // kKLMN
- conB[0] = StpU1_F1(bckIJKL.z * prjPrvABEF.x + bckQRST.z * prjPrvABEF.z);
- conB[1] = StpU1_F1(bckIJKL.w * prjPrvABEF.x + bckQRST.w * prjPrvABEF.z);
- conB[2] = StpU1_F1(bckMNOP.x * prjPrvABEF.y + bckQRST.x * prjPrvABEF.w);
- conB[3] = StpU1_F1(bckMNOP.y * prjPrvABEF.y + bckQRST.y * prjPrvABEF.w);
- // kOPQR
- conC[0] = StpU1_F1(bckMNOP.z * prjPrvABEF.y + bckQRST.z * prjPrvABEF.w);
- conC[1] = StpU1_F1(bckMNOP.w * prjPrvABEF.y + bckQRST.w * prjPrvABEF.w);
- conC[2] = StpU1_F1(bckQRST.x * prjPrvCDGH.z);
- conC[3] = StpU1_F1(bckQRST.y * prjPrvCDGH.z);}
- //==============================================================================================================================
- // Builds the packed constants con0..con3 for the TAA pass from the frame index and the input/feedback image sizes.
- STP_STATIC void StpTaaCon(
- // Generated constants.
- StpInOutU4 con0,
- StpInOutU4 con1,
- StpInOutU4 con2,
- StpInOutU4 con3,
- // Amount of grain {0 = maximum, >0 is amount of stops less of grain}.
- // Not referenced anywhere in this function body.
- StpF1 grain,
- // Frame count for current frame (sets jitter).
- StpU1 frame,
- // Current image resolution in pixels.
- StpInF2 imgC,
- // Feedback (aka output) resolution in pixels.
- StpInF2 imgF) {
- //------------------------------------------------------------------------------------------------------------------------------
- // Shared scalar literals.
- StpF1 kOne = StpF1_(1.0);
- StpF1 kHalf = StpF1_(0.5);
- // Jitter for the current frame.
- StpVarF2 jitC;
- StpJit(jitC, frame);
- //------------------------------------------------------------------------------------------------------------------------------
- // con0.xy: StpF2 kCRcpF := current input size / feedback size (multiply term).
- con0[0] = StpU1_F1(imgC[0] / imgF[0]);
- con0[1] = StpU1_F1(imgC[1] / imgF[1]);
- // con0.zw: StpF2 kHalfCRcpFUnjitC := half of the multiply term (pixel center), minus jitC (undo jitter).
- con0[2] = StpU1_F1(kHalf * imgC[0] / imgF[0] - jitC[0]);
- con0[3] = StpU1_F1(kHalf * imgC[1] / imgF[1] - jitC[1]);
- //------------------------------------------------------------------------------------------------------------------------------
- // con1.xy: StpF2 kRcpC := 1/size of current input image in pixels.
- con1[0] = StpU1_F1(kOne / imgC[0]);
- con1[1] = StpU1_F1(kOne / imgC[1]);
- // con1.zw: StpF2 kRcpF := 1/size of feedback image (aka output) in pixels.
- con1[2] = StpU1_F1(kOne / imgF[0]);
- con1[3] = StpU1_F1(kOne / imgF[1]);
- //------------------------------------------------------------------------------------------------------------------------------
- // con2.xy: StpF2 kHalfRcpF := 0.5/size of feedback image (aka output) in pixels.
- con2[0] = StpU1_F1(kHalf / imgF[0]);
- con2[1] = StpU1_F1(kHalf / imgF[1]);
- // con2.zw: kJitCRcpC0 := jitC / current input size + {-0.5/size, +0.5/size} of current input image in pixels.
- // Note the sign differs per component: x subtracts the half texel, y adds it.
- con2[2] = StpU1_F1(jitC[0] / imgC[0] - kHalf / imgC[0]);
- con2[3] = StpU1_F1(jitC[1] / imgC[1] + kHalf / imgC[1]);
- //------------------------------------------------------------------------------------------------------------------------------
- // con3.xy: StpF2 kHalfRcpC := 0.5/size of current input image in pixels.
- con3[0] = StpU1_F1(kHalf / imgC[0]);
- con3[1] = StpU1_F1(kHalf / imgC[1]);
- // con3.zw: StpF2 kF := size of feedback image in pixels.
- con3[2] = StpU1_F1(imgF[0]);
- con3[3] = StpU1_F1(imgF[1]); }
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- //
- // PATTERN ENTRY POINT
- //
- //==============================================================================================================================
- // See the packed 16-bit version for comments.
- #if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
- void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b); // 32-bit counterpart of StpPat4x4MaxH8 (8 component maximum).
- void StpPat4x4SumF4(StpMU1 i, inout StpF4 a); // 32-bit counterpart of StpPat4x4SumH4 (4 component sum).
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpPatPriConF(StpF2 p); // 32-bit counterpart of StpPatPriConH (prior convergence sample).
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 StpPatDatMotF(StpMU2 o); // Fetched at the integer pixel position; StpPatF() uses the result as motion 'm'.
- StpMF3 StpPatDatColF(StpMU2 o); // Fetched at the integer pixel position; StpPatF() uses the result as input color 'c'.
- StpF1 StpPatDatZF(StpMU2 o); // Fetched at the integer pixel position; StpPatF() passes the result through StpPatFixZF().
- StpF1 StpPatFixZF(StpF1 z);
- StpU1 StpPatDatRF(StpMU2 o); // Fetched at the integer pixel position; StpPatF() passes the result through StpPatFixRF().
- StpMF1 StpPatFixRF(StpU1 v);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpPatDitF(StpMU2 o); // StpPatF() uses the result as the dither value 'd'.
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF4 StpPatPriFedF(StpF2 p); // StpPatF() samples these four at the feedback position 'pF'.
- StpMF4 StpPatPriFedR4F(StpF2 p);
- StpMF4 StpPatPriFedG4F(StpF2 p);
- StpMF4 StpPatPriFedB4F(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 StpPatPriLumF(StpF2 p); // StpPatF() samples this at the reprojected position 'pM'.
- //------------------------------------------------------------------------------------------------------------------------------
- StpU4 StpPatPriMot4F(StpF2 p);
- #if STP_MAX_MIN_UINT
- StpU1 StpPatPriMotMinF(StpF2 p);
- #endif // STP_MAX_MIN_UINT
- #if STP_OFFSETS
- StpU4 StpPatPriMot4OF(StpF2 p, StpI2 o);
- #if STP_MAX_MIN_UINT
- StpU1 StpPatPriMotMinOF(StpF2 p, StpI2 o);
- #endif // STP_MAX_MIN_UINT
- #endif // STP_OFFSETS
- //------------------------------------------------------------------------------------------------------------------------------
- void StpPatStMotF(StpMU2 p, StpU1 v); // Store hooks: StpPatF() writes rM, rC, rL and rCnv through these four.
- void StpPatStColF(StpMU2 p, StpMF4 v);
- void StpPatStLumF(StpMU2 p, StpMF2 v);
- void StpPatStCnvF(StpMU2 p, StpMF1 v);
- //==============================================================================================================================
- // Pattern pass entry point (32-bit path). For the pixel at integer position 'pp' this:
- //  - packs {depth, motion} and stores it via StpPatStMotF(),
- //  - stores the dithered sqrt-encoded color, with the motion match in alpha, via StpPatStColF(),
- //  - stores {luma of this pixel, prior luma sampled at the reprojected position} via StpPatStLumF(),
- //  - stores the updated convergence via StpPatStCnvF().
- // 'lane' is forwarded to the 4x4 reduction callbacks. con0..conD are packed constants, decoded at the top of the body.
- void StpPatF(
- StpMU1 lane,
- StpMU2 pp,
- StpU4 con0,
- StpU4 con1,
- StpU4 con2,
- StpU4 con3,
- StpU4 con4,
- StpU4 con5,
- StpU4 con6,
- StpU4 con7,
- StpU4 con8,
- StpU4 con9,
- StpU4 conA,
- StpU4 conB,
- StpU4 conC,
- StpU4 conD) {
- //------------------------------------------------------------------------------------------------------------------------------
- // Results handed to the store callbacks.
- StpMF4 rC;
- StpU1 rM;
- StpMF2 rL;
- StpMF1 rCnv;
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 kRcpC = StpF2_U2(con0.xy);
- StpF2 kHalfRcpC = StpF2_U2(con0.zw);
- StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
- StpF2 kJitCRcpC = StpF2_U2(con1.zw);
- StpF2 kF = StpF2_U2(con2.xy);
- StpF4 kOS = StpF4_U4(con3);
- StpF2 kDepth = StpF2_U2(con2.zw);
- StpF2 kUnDepth = StpF2_U2(con4.xy);
- StpF1 kMotionMatch = StpF1_U1(con4.z);
- StpF2 kC = StpF2_U2(con5.xy);
- StpF4 k0123 = StpF4_U4(con6);
- StpF4 k4567 = StpF4_U4(con7);
- StpF4 k89AB = StpF4_U4(con8);
- StpF4 kCDEF = StpF4_U4(con9);
- StpF4 kGHIJ = StpF4_U4(conA);
- StpF4 kKLMN = StpF4_U4(conB);
- StpF4 kOPQR = StpF4_U4(conC);
- // NOTE(review): StpPatCon() in this file writes kST to con5.zw and has no conD output, so conD must come from the caller.
- StpF2 kST = StpF2_U2(conD.xy);
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 m = StpPatDatMotF(pp);
- StpMF1 d = StpPatDitF(pp);
- StpF1 zPre = StpPatDatZF(pp);
- StpMF3 c = StpPatDatColF(pp);
- //==============================================================================================================================
- // DEPENDENT INLINE INPUT MOTION
- //==============================================================================================================================
- // Pixel center as a {0 to 1} position in the current input image.
- StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
- //------------------------------------------------------------------------------------------------------------------------------
- // Check the streaming bandwidth limit.
- #if STP_BUG_BW_SOL
- { StpMF2 lum2 = StpPatPriLumF(p);
- StpMF1 cnvPrev = StpPatPriConF(p);
- StpU4 mZVP4 = StpPatPriMot4F(p);
- // StpPatDatRF() takes the integer pixel position (same as the main path below), not the float position 'p'.
- StpU1 rPre = StpPatDatRF(pp);
- StpMF3 f = StpPatPriFedF(p).rgb;
- StpF1 z = StpPatFixZF(zPre);
- StpMF1 r = StpPatFixRF(rPre);
- rC.rgb = StpMF3_(m.x) + StpMF3_(d.x) + c + StpMF3_(lum2.x) + StpMF3_(cnvPrev) + StpMF3(mZVP4.xyz) + f + StpMF3_(z+r);
- rC.a = StpMF1_(0.0);
- rL = rC.rg;
- rM = StpU1_(rC.r);
- rCnv = rC.r;
- StpPatStMotF(pp, rM);
- StpPatStLumF(pp, rL);
- StpPatStColF(pp, rC);
- StpPatStCnvF(pp, rCnv);
- return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // pM := reprojected position for prior-frame lookups, pF := reprojected position for feedback lookups.
- StpF2 pM = (p - m);
- StpF2 pF = pM + kJitCRcpC;
- pM = pM + kJitCRcpCUnjitPRcpP;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 lum2 = StpPatPriLumF(pM);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 cnvPrev = StpPatPriConF(pM);
- //------------------------------------------------------------------------------------------------------------------------------
- #if (STP_SAFE_DILATE == 2)
- #if STP_MAX_MIN_UINT
- StpU4 mZVP4;
- #if STP_OFFSETS
- mZVP4.x = StpPatPriMotMinOF(pM, StpI2(-1, -1));
- mZVP4.y = StpPatPriMotMinOF(pM, StpI2( 1, -1));
- mZVP4.z = StpPatPriMotMinOF(pM, StpI2(-1, 1));
- mZVP4.w = StpPatPriMotMinOF(pM, StpI2( 1, 1));
- #else // STP_OFFSETS
- mZVP4.x = StpPatPriMotMinF(pM + StpF2(-kRcpC.x, -kRcpC.y));
- mZVP4.y = StpPatPriMotMinF(pM + StpF2( kRcpC.x, -kRcpC.y));
- mZVP4.z = StpPatPriMotMinF(pM + StpF2(-kRcpC.x, kRcpC.y));
- mZVP4.w = StpPatPriMotMinF(pM + StpF2( kRcpC.x, kRcpC.y));
- #endif // STP_OFFSETS
- #else // STP_MAX_MIN_UINT
- #if STP_OFFSETS
- StpU4 mZVP4_0 = StpPatPriMot4OF(pM, StpI2(-1, -1));
- StpU4 mZVP4_1 = StpPatPriMot4OF(pM, StpI2( 1, -1));
- StpU4 mZVP4_2 = StpPatPriMot4OF(pM, StpI2(-1, 1));
- StpU4 mZVP4_3 = StpPatPriMot4OF(pM, StpI2( 1, 1));
- #else // STP_OFFSETS
- StpU4 mZVP4_0 = StpPatPriMot4F(pM + StpF2(-kRcpC.x, -kRcpC.y));
- StpU4 mZVP4_1 = StpPatPriMot4F(pM + StpF2( kRcpC.x, -kRcpC.y));
- StpU4 mZVP4_2 = StpPatPriMot4F(pM + StpF2(-kRcpC.x, kRcpC.y));
- StpU4 mZVP4_3 = StpPatPriMot4F(pM + StpF2( kRcpC.x, kRcpC.y));
- #endif // STP_OFFSETS
- #endif // STP_MAX_MIN_UINT
- #else // (STP_SAFE_DILATE == 2)
- StpU1 mZVPN;
- StpU4 mZVP2a = StpPatPriMot4F(pM - kHalfRcpC);
- StpU4 mZVP2b = StpPatPriMot4F(pM + kHalfRcpC);
- #if STP_MAX_MIN_UINT
- mZVPN = StpPatPriMotMinF(pM);
- #else // STP_MAX_MIN_UINT
- StpU4 mZVP4 = StpPatPriMot4F(pM);
- #endif // STP_MAX_MIN_UINT
- #endif // (STP_SAFE_DILATE == 2)
- //------------------------------------------------------------------------------------------------------------------------------
- StpU1 rPre = StpPatDatRF(pp);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF4 f4R = StpPatPriFedR4F(pF);
- StpMF4 f4G = StpPatPriFedG4F(pF);
- StpMF4 f4B = StpPatPriFedB4F(pF);
- StpMF3 f = StpPatPriFedF(pF).rgb;
- //==============================================================================================================================
- // DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
- //==============================================================================================================================
- StpF1 dd = StpF1_(d);
- StpF1 z = StpPatFixZF(zPre);
- z = StpZPack(z, kDepth, dd);
- rM = StpMvPack(z, m, dd);
- StpPatStMotF(pp, rM);
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Clipped Input Color
- { StpF4 bug = StpF4_(0.0);
- bug.rgb = sqrt(StpF3(c.rgb));
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 0), bug); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Pattern/Log Input Depth
- { StpF4 bug = StpF4_(0.0);
- bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 1), bug); }
- #endif // STP_BUG
- //------------------------------------------------------------------------------------------------------------------------------
- #if (STP_POSTMAP == 0)
- StpToneMF3(c);
- #endif // (STP_POSTMAP == 0)
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Reversible Tonemapped Input Color
- { StpF4 bug = StpF4_(0.0);
- bug.rgb = sqrt(StpF3(c.rgb));
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 2), bug); }
- #endif // STP_BUG
- //------------------------------------------------------------------------------------------------------------------------------
- // From here on 'c' is sqrt-encoded; the stored color adds a dither of up to +/- half a 1/1023 step.
- c = sqrt(c);
- rC.rgb = StpSatMF3(c + StpMF3_(d * StpMF1(1.0 / 1023.0) + StpMF1(-0.5 / 1023.0)));
- //------------------------------------------------------------------------------------------------------------------------------
- rL.x = dot(c, StpMF3(STP_LUMA));
- rL.y = lum2.x;
- StpPatStLumF(pp, rL);
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Shaped Absolute Input Motion
- { StpF4 bug = StpF4_(0.0);
- bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
- bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 3), bug); }
- #endif // STP_BUG
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
- moire *= StpMF1_(STP_PAT_DEMOIRE);
- //------------------------------------------------------------------------------------------------------------------------------
- // Pack {value, -value} pairs so one maximum reduction yields both max(value) and -min(value).
- StpMF4 xnyRG = StpMF4(c.r, -c.r, c.g, -c.g);
- StpMF4 xnyBC = StpMF4(c.b, -c.b, -cnvPrev, -cnvPrev);
- #if defined(STP_16BIT)
- // The MF values go straight to the 16-bit reduction, mirroring the StpPat4x4SumH4() call further below.
- StpPat4x4MaxH8(lane, xnyRG, xnyBC);
- #else // defined(STP_16BIT)
- // We convert to full precision floats here since the reductions work on 32-bit values.
- StpF4 xnyRGF = StpF4(xnyRG);
- StpF4 xnyBCF = StpF4(xnyBC);
- StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
- xnyRG = StpMF4(xnyRGF);
- xnyBC = StpMF4(xnyBCF);
- #endif // defined(STP_16BIT)
- // Negating max(-cnvPrev) gives the minimum prior convergence of the reduction.
- cnvPrev = -xnyBC.z;
- // Per channel: the larger of (STP_PAT_NE_MIN * max) and (max - min) from the reduction.
- StpMF3 ne = max(StpMF3_(STP_PAT_NE_MIN) * StpMF3(xnyRG.x, xnyRG.z, xnyBC.x),
- StpMF3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
- StpMF1 ne1 = dot(ne, StpMF3(STP_LUMA));
- //------------------------------------------------------------------------------------------------------------------------------
- cnvPrev = StpSatMF1(cnvPrev + StpMF1_(1.0 / STP_FRAME_MAX));
- //------------------------------------------------------------------------------------------------------------------------------
- // Off-screen test on the reprojected position using the kOS scale and bias.
- StpF2 onXY = StpF2(pM.xy);
- onXY = onXY * kOS.xy + kOS.zw;
- StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
- { StpF4 bug = StpF4_(0.0);
- bug.g = StpF1_(abs(rL.x - lum2.x));
- bug.r = StpF1_(abs(lum2.x - lum2.y));
- bug.b = StpF1_(1.0) - StpF1_(onS);
- bug.rg = sqrt(bug.rg);
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 4), bug); }
- #endif // STP_BUG
- //==============================================================================================================================
- // DEPENDENT ON PRIOR {Z, MOTION}
- //==============================================================================================================================
- // Reduce the fetched prior packed {Z, motion} values to a single minimum 'mZVPN'.
- #if (STP_SAFE_DILATE == 2)
- #if (STP_MAX_MIN_UINT == 0)
- StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
- #endif // (STP_MAX_MIN_UINT == 0)
- StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
- #else // (STP_SAFE_DILATE == 2)
- #if (STP_MAX_MIN_UINT == 0)
- mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
- #endif // (STP_MAX_MIN_UINT == 0)
- #if STP_SAFE_DILATE
- mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
- #endif // STP_SAFE_DILATE
- #endif // (STP_SAFE_DILATE == 2)
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 mPN;
- StpF1 mZPN;
- StpMvUnpack(mZPN, mPN, mZVPN);
- //------------------------------------------------------------------------------------------------------------------------------
- // Motion tolerance: (sqrt(|m|) + 1/256)^2 - |m|.
- StpF2 mE;
- mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
- mE = mE * mE - abs(m);
- //------------------------------------------------------------------------------------------------------------------------------
- StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
- StpF2 bugF; StpF2 bugD;
- StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
- // Mismatch between the StpFor() result and the input motion beyond the tolerance, scaled by kC (pixels).
- sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
- StpMF1 sgD = StpMF1(dot(sgM, sgM));
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 match = StpMF1_(1.0) - StpSatMF1(sgD * StpMF1_(STP_PAT_MOT_AMP) - StpMF1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
- match *= StpMF1_(onS);
- rC.a = match;
- StpPatStColF(pp, rC);
- //------------------------------------------------------------------------------------------------------------------------------
- moire = moire * match + StpMF1_(1.0 / 8192.0);
- moire = min(StpMF1_(1.0), ne1 * StpRcpMF1(moire));
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 tS = moire;
- StpMF1 r = StpPatFixRF(rPre);
- tS = tS * (StpMF1_(STP_PAT_RESPONSIVE) - r * StpMF1_(STP_PAT_RESPONSIVE)) + tS;
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
- { StpF4 bug = StpF4_(0.0);
- bug.g = StpF1_(1.0) - StpF1(match);
- bug.r = StpF1_(1.0) - StpF1(r);
- bug.b = StpF1_(rL.x);
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 5), bug); }
- #endif // STP_BUG
- //==============================================================================================================================
- // DEPENDENT ON FEEDBACK
- //==============================================================================================================================
- // Signed difference {feedback - input}. Start with the StpPatPriFedF() sample, then switch to whichever of the four
- // R4/G4/B4 taps has the smallest luma-weighted absolute difference. All candidates must share the same sign convention
- // because the signed result is summed across the 4x4 block below.
- StpMF4 t;
- t.rgb = f - c;
- t.a = dot(abs(t.rgb), StpMF3(STP_LUMA));
- StpMF4 t4R = f4R - StpMF4_(c.r);
- StpMF4 t4G = f4G - StpMF4_(c.g);
- StpMF4 t4B = f4B - StpMF4_(c.b);
- StpMF4 t4A = abs(t4R) * StpMF4_(STP_LUMA_R) + abs(t4G) * StpMF4_(STP_LUMA_G) + abs(t4B) * StpMF4_(STP_LUMA_B);
- t.a = StpMin3MF1(t.a, t4A.x, StpMin3MF1(t4A.y, t4A.z, t4A.w));
- if(t.a == t4A.x) t.rgb = StpMF3(t4R.x, t4G.x, t4B.x);
- if(t.a == t4A.y) t.rgb = StpMF3(t4R.y, t4G.y, t4B.y);
- if(t.a == t4A.z) t.rgb = StpMF3(t4R.z, t4G.z, t4B.z);
- if(t.a == t4A.w) t.rgb = StpMF3(t4R.w, t4G.w, t4B.w);
- //------------------------------------------------------------------------------------------------------------------------------
- t.rgb *= StpMF3_(tS);
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- StpPat4x4SumH4(lane, t);
- #else // defined(STP_16BIT)
- // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
- StpF4 tF = StpF4(t);
- StpPat4x4SumF4(lane, tF);
- t = StpMF4(tF);
- #endif // defined(STP_16BIT)
- t.rgb *= StpMF3_(STP_PAT_SENSITIVITY);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF3 bln3 = StpSatMF3(ne * StpRcpMF3(abs(t.rgb)));
- StpMF1 bln = StpMin3MF1(bln3.r, bln3.g, bln3.b);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 cnv = StpSatMF1(bln * StpRcpMF1(StpMF1_(STP_FRAME_MAX) - StpMF1_(STP_FRAME_MAX) * bln));
- //------------------------------------------------------------------------------------------------------------------------------
- cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
- // The stored convergence never exceeds the advanced prior convergence.
- rCnv = min(cnv, cnvPrev);
- StpPatStCnvF(pp, rCnv); }
- #endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // 16-BIT PATH
- //==============================================================================================================================
- // See the packed 16-bit version for comments.
- #if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
- // 4x4 wave op: 8 component maximum, written back in place to 'a' and 'b' (StpPatH() passes its 'lane' as 'i').
- void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b);
- // 4x4 wave op: 4 component sum, written back in place to 'a' (StpPatH() passes its 'lane' as 'i').
- void StpPat4x4SumH4(StpW1 i, inout StpH4 a);
- //------------------------------------------------------------------------------------------------------------------------------
- // Sample bilinear interpolated clamp to edge prior convergence.
- StpH1 StpPatPriConH(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Note this is still designed to be an inline function pass merged to avoid DRAM traffic.
- // So in an ideal world (with better merging with pre-scale post) these would be already in registers.
- // But when PAT pass is non-inline, these callbacks are placed in the right order for loads.
- // Input motion, 'position - motion' is the reprojected position, where {0 to 1} is range of the screen.
- StpF2 StpPatDatMotH(StpW2 o);
- // Input color, this is linear HDR or post-tonemap-linear depending on STP_POSTMAP.
- StpH3 StpPatDatColH(StpW2 o);
- StpF1 StpPatDatZH(StpW2 o); // Input depth load at pixel 'o'; StpPatH() passes the result through StpPatFixZH().
- // Input depth, this is linear {0:near to INF:far} ranged.
- StpF1 StpPatFixZH(StpF1 z);
- StpU1 StpPatDatRH(StpW2 o); // Input responsive load at pixel 'o'; StpPatH() passes the result through StpPatFixRH().
- // Responsive input pixel {0.0 := responsive, 1.0 := normal}.
- StpH1 StpPatFixRH(StpU1 v);
- //------------------------------------------------------------------------------------------------------------------------------
- // Dither value {0 to 1} this should be input pixel frequency spatial temporal blue noise.
- StpH1 StpPatDitH(StpW2 o);
- //------------------------------------------------------------------------------------------------------------------------------
- // Sample bilinear interpolated clamp to edge prior feedback.
- StpH4 StpPatPriFedH(StpF2 p);
- // Gather4 versions, one call per color channel {R, G, B} of the prior feedback.
- StpH4 StpPatPriFedR4H(StpF2 p);
- StpH4 StpPatPriFedG4H(StpF2 p);
- StpH4 StpPatPriFedB4H(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Sample bilinear interpolated clamp to edge 2-frame luma ring.
- StpH2 StpPatPriLumH(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Gather4 on prior {z,motion}.
- StpU4 StpPatPriMot4H(StpF2 p);
- #if STP_MAX_MIN_UINT
- StpU1 StpPatPriMotMinH(StpF2 p); // Single-value variant StpPatH() uses in place of the gather4 (presumably a min-filtered sample, confirm).
- #endif // STP_MAX_MIN_UINT
- #if STP_OFFSETS
- StpU4 StpPatPriMot4OH(StpF2 p, StpI2 o); // Same as StpPatPriMot4H() with an integer offset 'o'.
- #if STP_MAX_MIN_UINT
- StpU1 StpPatPriMotMinOH(StpF2 p, StpI2 o); // Same as StpPatPriMotMinH() with an integer offset 'o'.
- #endif // STP_MAX_MIN_UINT
- #endif // STP_OFFSETS
- //------------------------------------------------------------------------------------------------------------------------------
- void StpPatStMotH(StpW2 p, StpU1 v); // Store packed {z, motion} for pixel 'p'.
- void StpPatStColH(StpW2 p, StpH4 v); // Store intermediate color for pixel 'p' (StpPatH() puts motion match in alpha).
- void StpPatStLumH(StpW2 p, StpH2 v); // Store the luma ring for pixel 'p' (StpPatH() writes {this frame, reprojected prior}).
- void StpPatStCnvH(StpW2 p, StpH1 v); // Store low-frequency convergence for pixel 'p'.
- //==============================================================================================================================
- // 16-bit pattern (PAT) pass entry point, run once per input pixel.
- // Loads this pixel's {motion, dither, depth, color, responsive} inputs, fetches the reprojected prior-frame
- // {luma ring, convergence, {z,motion}, feedback}, and stores four results for 'pp':
- //  StpPatStMotH() ... packed {z, motion},
- //  StpPatStLumH() ... luma ring {this frame, reprojected prior frame},
- //  StpPatStColH() ... dithered gamma 2.0 color in RGB with motion match in alpha,
- //  StpPatStCnvH() ... low-frequency convergence.
- // Parameters.
- //  lane ......... Lane index forwarded to the 4x4 wave ops StpPat4x4MaxH8() and StpPat4x4SumH4().
- //  pp ........... Integer pixel position used for the input loads and for all stores.
- //  con0..conD ... Constants packed as uints and reinterpreted as floats below
- //                 (con4.w, con5.zw and conD.zw are not read by this function).
- void StpPatH(
- StpW1 lane,
- StpW2 pp,
- StpU4 con0,
- StpU4 con1,
- StpU4 con2,
- StpU4 con3,
- StpU4 con4,
- StpU4 con5,
- StpU4 con6,
- StpU4 con7,
- StpU4 con8,
- StpU4 con9,
- StpU4 conA,
- StpU4 conB,
- StpU4 conC,
- StpU4 conD) {
- //------------------------------------------------------------------------------------------------------------------------------
- // Outputs.
- StpH4 rC;
- StpU1 rM;
- StpH2 rL;
- StpH1 rCnv;
- //------------------------------------------------------------------------------------------------------------------------------
- // Rename constants.
- StpF2 kRcpC = StpF2_U2(con0.xy);
- StpF2 kHalfRcpC = StpF2_U2(con0.zw);
- StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
- StpF2 kJitCRcpC = StpF2_U2(con1.zw);
- StpF2 kF = StpF2_U2(con2.xy);
- StpF4 kOS = StpF4_U4(con3);
- StpF2 kDepth = StpF2_U2(con2.zw);
- StpF2 kUnDepth = StpF2_U2(con4.xy);
- StpF1 kMotionMatch = StpF1_U1(con4.z);
- StpF2 kC = StpF2_U2(con5.xy);
- StpF4 k0123 = StpF4_U4(con6);
- StpF4 k4567 = StpF4_U4(con7);
- StpF4 k89AB = StpF4_U4(con8);
- StpF4 kCDEF = StpF4_U4(con9);
- StpF4 kGHIJ = StpF4_U4(conA);
- StpF4 kKLMN = StpF4_U4(conB);
- StpF4 kOPQR = StpF4_U4(conC);
- StpF2 kST = StpF2_U2(conD.xy);
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 m = StpPatDatMotH(pp);
- // This dither fetch should likely be shared with pass merged pre-scale post work in the future.
- StpH1 d = StpPatDitH(pp);
- StpF1 zPre = StpPatDatZH(pp);
- StpH3 c = StpPatDatColH(pp);
- //==============================================================================================================================
- // DEPENDENT INLINE INPUT MOTION
- //==============================================================================================================================
- // Work towards getting all dependent fetches out first.
- // Compute float position {0 to 1} across screen.
- StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_BW_SOL
- // Debug early-out: fetch each input once without reprojection, fold everything into the outputs, store, and return.
- { StpH2 lum2 = StpPatPriLumH(p);
- StpH1 cnvPrev = StpPatPriConH(p);
- StpU4 mZVP4 = StpPatPriMot4H(p);
- // StpPatDatRH() is declared with an integer pixel position, so it is indexed with 'pp' (as in the normal path below).
- StpU1 rPre = StpPatDatRH(pp);
- StpH3 f = StpPatPriFedH(p).rgb;
- StpF1 z = StpPatFixZH(zPre);
- StpH1 r = StpPatFixRH(rPre);
- rC.rgb = StpH3_(m.x) + StpH3_(d.x) + c + StpH3_(lum2.x) + StpH3_(cnvPrev) + StpH3(mZVP4.xyz) + f + StpH3_(z+r);
- rC.a = StpH1_(0.0);
- rL = rC.rg;
- rM = StpU1_(rC.r);
- rCnv = rC.r;
- StpPatStMotH(pp, rM);
- StpPatStLumH(pp, rL);
- StpPatStColH(pp, rC);
- StpPatStCnvH(pp, rCnv);
- return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // Reprojection position in prior input and feedback.
- StpF2 pM = (p - m);
- StpF2 pF = pM + kJitCRcpC;
- pM = pM + kJitCRcpCUnjitPRcpP;
- //------------------------------------------------------------------------------------------------------------------------------
- // Fetch 2-frame reprojected history ring of luma.
- StpH2 lum2 = StpPatPriLumH(pM);
- //------------------------------------------------------------------------------------------------------------------------------
- // Fetch reprojected low-frequency convergence prior frame.
- StpH1 cnvPrev = StpPatPriConH(pM);
- //------------------------------------------------------------------------------------------------------------------------------
- // Grab large enough neighborhood for prior reprojected nearest {z,motion}.
- // This nearest dilates {z, motion} reprojection to avoid pulling in anti-aliased edges and leaving temporal ringing.
- #if (STP_SAFE_DILATE == 2)
- #if STP_MAX_MIN_UINT
- StpU4 mZVP4;
- #if STP_OFFSETS
- mZVP4.x = StpPatPriMotMinOH(pM, StpI2(-1, -1));
- mZVP4.y = StpPatPriMotMinOH(pM, StpI2( 1, -1));
- mZVP4.z = StpPatPriMotMinOH(pM, StpI2(-1, 1));
- mZVP4.w = StpPatPriMotMinOH(pM, StpI2( 1, 1));
- #else // STP_OFFSETS
- mZVP4.x = StpPatPriMotMinH(pM + StpF2(-kRcpC.x, -kRcpC.y));
- mZVP4.y = StpPatPriMotMinH(pM + StpF2( kRcpC.x, -kRcpC.y));
- mZVP4.z = StpPatPriMotMinH(pM + StpF2(-kRcpC.x, kRcpC.y));
- mZVP4.w = StpPatPriMotMinH(pM + StpF2( kRcpC.x, kRcpC.y));
- #endif // STP_OFFSETS
- #else // STP_MAX_MIN_UINT
- #if STP_OFFSETS
- StpU4 mZVP4_0 = StpPatPriMot4OH(pM, StpI2(-1, -1));
- StpU4 mZVP4_1 = StpPatPriMot4OH(pM, StpI2( 1, -1));
- StpU4 mZVP4_2 = StpPatPriMot4OH(pM, StpI2(-1, 1));
- StpU4 mZVP4_3 = StpPatPriMot4OH(pM, StpI2( 1, 1));
- #else // STP_OFFSETS
- StpU4 mZVP4_0 = StpPatPriMot4H(pM + StpF2(-kRcpC.x, -kRcpC.y));
- StpU4 mZVP4_1 = StpPatPriMot4H(pM + StpF2( kRcpC.x, -kRcpC.y));
- StpU4 mZVP4_2 = StpPatPriMot4H(pM + StpF2(-kRcpC.x, kRcpC.y));
- StpU4 mZVP4_3 = StpPatPriMot4H(pM + StpF2( kRcpC.x, kRcpC.y));
- #endif // STP_OFFSETS
- #endif // STP_MAX_MIN_UINT
- #else // (STP_SAFE_DILATE == 2)
- StpU1 mZVPN;
- // To be correct here this needs 'kHalfRcpP' (prior instead of current).
- // But didn't want to pass yet another pair of constants, so using current instead.
- // TODO: If later moving to 'kHalfRcpP' can use one sample by offset to save some VALU ops.
- // Also this is only used if STP_SAFE_DILATE=1 (else dead code).
- StpU4 mZVP2a = StpPatPriMot4H(pM - kHalfRcpC);
- StpU4 mZVP2b = StpPatPriMot4H(pM + kHalfRcpC);
- #if STP_MAX_MIN_UINT
- mZVPN = StpPatPriMotMinH(pM);
- #else // STP_MAX_MIN_UINT
- StpU4 mZVP4 = StpPatPriMot4H(pM);
- #endif // STP_MAX_MIN_UINT
- #endif // (STP_SAFE_DILATE == 2)
- //------------------------------------------------------------------------------------------------------------------------------
- StpU1 rPre = StpPatDatRH(pp);
- //------------------------------------------------------------------------------------------------------------------------------
- // Gather 4 on feedback.
- StpH4 f4R = StpPatPriFedR4H(pF);
- StpH4 f4G = StpPatPriFedG4H(pF);
- StpH4 f4B = StpPatPriFedB4H(pF);
- // Grab bilinear feedback.
- StpH3 f = StpPatPriFedH(pF).rgb;
- //==============================================================================================================================
- // DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
- //==============================================================================================================================
- StpF1 dd = StpF1_(d);
- // Convert depth {0 to inf} to {0 to 1} safe for 10-bit value.
- StpF1 z = StpPatFixZH(zPre);
- z = StpZPack(z, kDepth, dd);
- // Pack {MSB depth, LSB 11-bit XY motion}.
- rM = StpMvPack(z, m, dd);
- StpPatStMotH(pp, rM);
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Clipped Input Color
- { StpF4 bug = StpF4_(0.0);
- bug.rgb = sqrt(StpF3(c));
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 0), bug); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Pattern/Log Input Depth
- { StpF4 bug = StpF4_(0.0);
- bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 1), bug); }
- #endif // STP_BUG
- //------------------------------------------------------------------------------------------------------------------------------
- // Pre-process color.
- // If running pre-tonemap, then do a fast reversible tonemapper (convert from {0 to inf} to {0 to 1}).
- #if (STP_POSTMAP == 0)
- StpToneH3(c);
- #endif // (STP_POSTMAP == 0)
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Reversible Tonemapped Input Color
- { StpF4 bug = StpF4_(0.0);
- bug.rgb = sqrt(StpF3(c));
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 2), bug); }
- #endif // STP_BUG
- //------------------------------------------------------------------------------------------------------------------------------
- // Output intermediate color.
- // Dither from linear to gamma 2.0.
- // Simple non-energy conserving dither is working, using 10-bit/channel.
- c = sqrt(c);
- rC.rgb = StpSatH3(c + StpH3_(d * StpH1(1.0 / 1023.0) + StpH1(-0.5 / 1023.0)));
- //------------------------------------------------------------------------------------------------------------------------------
- // Setup the new output luma ring {this frame, reprojected prior frame}.
- rL.x = dot(c, StpH3(STP_LUMA));
- rL.y = lum2.x;
- StpPatStLumH(pp, rL);
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Shaped Absolute Input Motion
- { StpF4 bug = StpF4_(0.0);
- bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
- bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 3), bug); }
- #endif // STP_BUG
- //------------------------------------------------------------------------------------------------------------------------------
- // Minimum change across the 3 frames {current, 2-frame reprojected history}.
- StpH1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
- moire *= StpH1_(STP_PAT_DEMOIRE);
- //------------------------------------------------------------------------------------------------------------------------------
- // Grab neighborhood.
- // Parallel block {max,-min}, and -min of convergence.
- StpH4 xnyRG = StpH4(c.r, -c.r, c.g, -c.g);
- StpH4 xnyBC = StpH4(c.b, -c.b, -cnvPrev, -cnvPrev);
- // This function is only compiled when STP_16BIT is defined, so the packed 16-bit reduction is always the one used.
- StpPat4x4MaxH8(lane, xnyRG, xnyBC);
- // Negating the max of '-cnvPrev' gives the minimum prior convergence of the 4x4 block.
- cnvPrev = -xnyBC.z;
- // This is max minus min (the '.y' and '.w' lanes hold the already negated minimums).
- StpH3 ne = max(StpH3_(STP_PAT_NE_MIN) * StpH3(xnyRG.x, xnyRG.z, xnyBC.x),
- StpH3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
- StpH1 ne1 = dot(ne, StpH3(STP_LUMA));
- //------------------------------------------------------------------------------------------------------------------------------
- // Advance low frequency convergence.
- cnvPrev = StpSatH1(cnvPrev + StpH1_(1.0 / STP_FRAME_MAX));
- //------------------------------------------------------------------------------------------------------------------------------
- // Estimate if reprojection is on-screen.
- StpF2 onXY = StpF2(pM.xy);
- // {-1 to 1} is on screen.
- onXY = onXY * kOS.xy + kOS.zw;
- // {0 := offscreen, 1 := onscreen}.
- StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
- { StpF4 bug = StpF4_(0.0);
- bug.g = StpF1_(abs(rL.x - lum2.x));
- bug.r = StpF1_(abs(lum2.x - lum2.y));
- bug.b = StpF1_(1.0) - StpF1_(onS);
- bug.rg = sqrt(bug.rg);
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 4), bug); }
- #endif // STP_BUG
- //==============================================================================================================================
- // DEPENDENT ON PRIOR {Z, MOTION}
- //==============================================================================================================================
- // Compute a motion match value.
- // Finish {z, motion} nearest dilation.
- #if (STP_SAFE_DILATE == 2)
- #if (STP_MAX_MIN_UINT == 0)
- StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
- #endif // (STP_MAX_MIN_UINT == 0)
- StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
- #else // (STP_SAFE_DILATE == 2)
- #if (STP_MAX_MIN_UINT == 0)
- mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
- #endif // (STP_MAX_MIN_UINT == 0)
- #if STP_SAFE_DILATE
- mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
- #endif // STP_SAFE_DILATE
- #endif // (STP_SAFE_DILATE == 2)
- //------------------------------------------------------------------------------------------------------------------------------
- // The {motion} matching logic.
- StpF2 mPN;
- StpF1 mZPN;
- // Motion 'm' units are {1 := move by one screen}.
- StpMvUnpack(mZPN, mPN, mZVPN);
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 mE;
- // Use a smoother error estimate.
- // This '1/256' instead of '1/1024' is to be more accepting of a motion match.
- // The 'sqrt()' cannot be the low precision approximation without visually seeing differences in the mask.
- mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
- mE = mE * mE - abs(m);
- //------------------------------------------------------------------------------------------------------------------------------
- // Static geometry motion + estimated dynamic motion matching logic.
- // Take unpacked low precision {0 to 1} Z and decode to {0 to INF}.
- StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
- // 'bugF' and 'bugD' are extra outputs of StpFor() that this function does not read afterwards.
- StpF2 bugF; StpF2 bugD;
- StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
- // Note 'sgM' is in NDC {-1 to 1} space and 'm' is in {0 to 1} space, thus the 0.5 scaling factor.
- // The difference gets conservative possible motion encoding error subtracted out via 'saturate(abs(..)-mE)'.
- sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
- StpH1 sgD = StpH1(dot(sgM, sgM));
- //------------------------------------------------------------------------------------------------------------------------------
- // Motion match {0 := no match, 1 := match}.
- StpH1 match = StpH1_(1.0) - StpSatH1(sgD * StpH1_(STP_PAT_MOT_AMP) - StpH1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
- // Offscreen is a non-match.
- match *= StpH1_(onS);
- // Pass motion match in alpha.
- rC.a = match;
- StpPatStColH(pp, rC);
- //------------------------------------------------------------------------------------------------------------------------------
- // Must disable on non-motion match, but make sure it doesn't fully /0 later.
- moire = moire * match + StpH1_(1.0 / 8192.0);
- // Scale down temporal change proportional to ratio of local neighborhood and minimum 3-frame temporal change.
- moire = min(StpH1_(1.0), ne1 * StpRcpH1(moire));
- //------------------------------------------------------------------------------------------------------------------------------
- // Sensitivity modifiers.
- // The following which gets optimized to two FMAs.
- // tS = tS * ((1-v)*k + 1) ... logic
- // tS = tS * ((1-v)*k) + tS
- // tS = tS * (k-v*k) + tS ..... optimized
- StpH1 tS = moire;
- StpH1 r = StpPatFixRH(rPre);
- tS = tS * (StpH1_(STP_PAT_RESPONSIVE) - r * StpH1_(STP_PAT_RESPONSIVE)) + tS;
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG
- // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
- { StpF4 bug = StpF4_(0.0);
- bug.g = StpF1_(1.0) - StpF1(match);
- bug.r = StpF1_(1.0) - StpF1(r);
- bug.b = StpF1_(rL.x);
- bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
- StpBugF(StpU3(pp, 5), bug); }
- #endif // STP_BUG
- //==============================================================================================================================
- // DEPENDENT ON FEEDBACK
- //==============================================================================================================================
- // Find lowest temporal difference.
- StpH4 t;
- t.rgb = c - f;
- // Luma diff in alpha.
- t.a = dot(abs(t.rgb), StpH3(STP_LUMA));
- // Compute lowest difference for all in quad.
- StpH4 t4R = f4R - StpH4_(c.r);
- StpH4 t4G = f4G - StpH4_(c.g);
- StpH4 t4B = f4B - StpH4_(c.b);
- StpH4 t4A = abs(t4R) * StpH4_(STP_LUMA_R) + abs(t4G) * StpH4_(STP_LUMA_G) + abs(t4B) * StpH4_(STP_LUMA_B);
- // Override with lower from gather4.
- t.a = StpMin3H1(t.a, t4A.x, StpMin3H1(t4A.y, t4A.z, t4A.w));
- // When several gather taps tie for the minimum, the last matching test below decides.
- if(t.a == t4A.x) t.rgb = StpH3(t4R.x, t4G.x, t4B.x);
- if(t.a == t4A.y) t.rgb = StpH3(t4R.y, t4G.y, t4B.y);
- if(t.a == t4A.z) t.rgb = StpH3(t4R.z, t4G.z, t4B.z);
- if(t.a == t4A.w) t.rgb = StpH3(t4R.w, t4G.w, t4B.w);
- //------------------------------------------------------------------------------------------------------------------------------
- // Factor in sensitivity and reduce.
- t.rgb *= StpH3_(tS);
- //------------------------------------------------------------------------------------------------------------------------------
- // As above, STP_16BIT is always defined here, so the packed 16-bit sum is used directly.
- StpPat4x4SumH4(lane, t);
- t.rgb *= StpH3_(STP_PAT_SENSITIVITY);
- //------------------------------------------------------------------------------------------------------------------------------
- // Ratio of 'spatial/temporal' change.
- StpH3 bln3 = StpSatH3(ne * StpPrxLoRcpH3(abs(t.rgb)));
- // Worst channel limits to avoid chroma ghosting.
- StpH1 bln = StpMin3H1(bln3.r, bln3.g, bln3.b);
- //------------------------------------------------------------------------------------------------------------------------------
- // Convert from blend ratio to convergence.
- // Note, 'rcp(0)=+INF' when approximations are not used.
- StpH1 cnv = StpSatH1(bln * StpPrxLoRcpH1(StpH1_(STP_FRAME_MAX) - StpH1_(STP_FRAME_MAX) * bln));
- //------------------------------------------------------------------------------------------------------------------------------
- // Feedback the min of reprojected convergence, and subtract one frame (as next frame advances by one).
- cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
- rCnv = min(cnv, cnvPrev);
- StpPatStCnvH(pp, rCnv); }
- #endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- //
- // PATTERN DILATION ENTRY POINT
- //
- //------------------------------------------------------------------------------------------------------------------------------
- // This should be pass merged with STP_SAA.
- // Dilates low frequency convergence.
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL)
- StpMF1 StpDilDitF(StpMU2 o); // Declared but not called by StpDilF().
- StpMF1 StpDilConF(StpF2 p); // Declared but not called by StpDilF().
- StpMF4 StpDilCon4F(StpF2 p); // Four convergence taps at 'p'; StpDilF() treats the result as a 2x2 gather footprint.
- #if STP_OFFSETS
- StpMF1 StpDilConOF(StpF2 p, StpI2 o); // Declared but not called by StpDilF().
- StpMF4 StpDilCon4OF(StpF2 p, StpI2 o); // Same as StpDilCon4F() with an integer offset 'o'.
- #endif // STP_OFFSETS
- //==============================================================================================================================
- // Dilates low-frequency convergence for one pixel (32-bit path).
- //  oC ..... Output: blend of five 5-point ('+' shaped) minimums, the center one weighted 0.5 and its four neighbors 0.125 each.
- //  pp ..... Integer pixel position, scaled by 'con0.xy' to form the gather coordinate.
- //  con0 ... Packed constants, only '.xy' is read (reinterpreted as two floats).
- void StpDilF(out StpMF1 oC, StpU2 pp, StpU4 con0) {
- StpF2 rcpR = StpF2_U2(con0.xy);
- StpF2 at = StpF2(pp) * rcpR;
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_BW_SOL
- // Debug early-out: a single gather, first component only.
- { oC = StpDilCon4F(at).x; return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // 3x3 grid of gathers named 'q<row><column>', stepping two texels per gather.
- // 'q02', 'q20' and 'q22' are fetched but none of their components feed the filter below.
- #if STP_OFFSETS
- StpMF4 q00 = StpDilCon4OF(at, StpI2(-1, -1));
- StpMF4 q01 = StpDilCon4OF(at, StpI2( 1, -1));
- StpMF4 q02 = StpDilCon4OF(at, StpI2( 3, -1));
- StpMF4 q10 = StpDilCon4OF(at, StpI2(-1, 1));
- StpMF4 q11 = StpDilCon4OF(at, StpI2( 1, 1));
- StpMF4 q12 = StpDilCon4OF(at, StpI2( 3, 1));
- StpMF4 q20 = StpDilCon4OF(at, StpI2(-1, 3));
- StpMF4 q21 = StpDilCon4OF(at, StpI2( 1, 3));
- StpMF4 q22 = StpDilCon4OF(at, StpI2( 3, 3));
- #else // STP_OFFSETS
- StpMF4 q00 = StpDilCon4F(at + StpF2(-1.0, -1.0) * rcpR);
- StpMF4 q01 = StpDilCon4F(at + StpF2( 1.0, -1.0) * rcpR);
- StpMF4 q02 = StpDilCon4F(at + StpF2( 3.0, -1.0) * rcpR);
- StpMF4 q10 = StpDilCon4F(at + StpF2(-1.0, 1.0) * rcpR);
- StpMF4 q11 = StpDilCon4F(at + StpF2( 1.0, 1.0) * rcpR);
- StpMF4 q12 = StpDilCon4F(at + StpF2( 3.0, 1.0) * rcpR);
- StpMF4 q20 = StpDilCon4F(at + StpF2(-1.0, 3.0) * rcpR);
- StpMF4 q21 = StpDilCon4F(at + StpF2( 1.0, 3.0) * rcpR);
- StpMF4 q22 = StpDilCon4F(at + StpF2( 3.0, 3.0) * rcpR);
- #endif // STP_OFFSETS
- //------------------------------------------------------------------------------------------------------------------------------
- // Taps that feed the filter, named 't<row><column>' on a 5x5 grid centered on 't22'.
- //   .   .  t02  .   .
- //   .  t11 t12 t13  .
- //  t20 t21 t22 t23 t24
- //   .  t31 t32 t33  .
- //   .   .  t42  .   .
- StpMF1 t02 = q01.w;
- StpMF1 t11 = q00.y;
- StpMF1 t12 = q01.x;
- StpMF1 t13 = q01.y;
- StpMF1 t20 = q10.w;
- StpMF1 t21 = q10.z;
- StpMF1 t22 = q11.w;
- StpMF1 t23 = q11.z;
- StpMF1 t24 = q12.w;
- StpMF1 t31 = q10.y;
- StpMF1 t32 = q11.x;
- StpMF1 t33 = q11.y;
- StpMF1 t42 = q21.w;
- //------------------------------------------------------------------------------------------------------------------------------
- // 5-point minimum centered on each of 't12', 't21', 't22', 't23' and 't32'.
- StpMF1 min12 = StpMin3MF1(StpMin3MF1(t11, t12, t13), t02, t22);
- StpMF1 min21 = StpMin3MF1(StpMin3MF1(t20, t21, t22), t11, t31);
- StpMF1 min22 = StpMin3MF1(StpMin3MF1(t21, t22, t23), t12, t32);
- StpMF1 min23 = StpMin3MF1(StpMin3MF1(t22, t23, t24), t13, t33);
- StpMF1 min32 = StpMin3MF1(StpMin3MF1(t31, t32, t33), t22, t42);
- //------------------------------------------------------------------------------------------------------------------------------
- // Center gets 'wMid', the remainder is split evenly over the four neighbors.
- StpMF1 wMid = StpMF1_(0.5);
- StpMF1 wArm = (StpMF1_(1.0) - wMid) * StpMF1_(0.25);
- oC = min22 * wMid + min12 * wArm + min21 * wArm + min23 * wArm + min32 * wArm; }
- #endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // 16-BIT PATH
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
- // Some of these are unused, possibly for future experimentation (StpDilH() only calls the 'Con4' gathers).
- StpH1 StpDilDitH(StpW2 o);
- StpH1 StpDilConH(StpF2 p);
- StpH4 StpDilCon4H(StpF2 p); // Four convergence taps at 'p'; StpDilH() treats the result as a 2x2 gather footprint.
- #if STP_OFFSETS
- StpH1 StpDilConOH(StpF2 p, StpI2 o);
- StpH4 StpDilCon4OH(StpF2 p, StpI2 o); // Same as StpDilCon4H() with an integer offset 'o'.
- #endif // STP_OFFSETS
- //==============================================================================================================================
- // Dilates low-frequency convergence for one pixel (16-bit path).
- //  oC ..... Output: blend of five 5-point ('+' shaped) minimums, the center one weighted 0.5 and its four neighbors 0.125 each.
- //  pp ..... Integer pixel position, scaled by 'con0.xy' to form the gather coordinate.
- //  con0 ... Packed constants, only '.xy' is read (reinterpreted as two floats).
- void StpDilH(out StpH1 oC, StpU2 pp, StpU4 con0) {
- StpF2 rcpR = StpF2_U2(con0.xy);
- StpF2 at = StpF2(pp) * rcpR;
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_BUG_BW_SOL
- // Debug early-out: a single gather, first component only.
- { oC = StpDilCon4H(at).x; return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // 3x3 grid of gathers named 'q<row><column>', stepping two texels per gather.
- //  q00 q01 q02
- //  q10 q11 q12
- //  q20 q21 q22
- // Each gather is used as the 2x2 footprint {w z / x y}, the filter center is 'q11.w'.
- // 'q02', 'q20' and 'q22' are fetched but none of their components feed the filter below.
- #if STP_OFFSETS
- StpH4 q00 = StpDilCon4OH(at, StpI2(-1, -1));
- StpH4 q01 = StpDilCon4OH(at, StpI2( 1, -1));
- StpH4 q02 = StpDilCon4OH(at, StpI2( 3, -1));
- StpH4 q10 = StpDilCon4OH(at, StpI2(-1, 1));
- StpH4 q11 = StpDilCon4OH(at, StpI2( 1, 1));
- StpH4 q12 = StpDilCon4OH(at, StpI2( 3, 1));
- StpH4 q20 = StpDilCon4OH(at, StpI2(-1, 3));
- StpH4 q21 = StpDilCon4OH(at, StpI2( 1, 3));
- StpH4 q22 = StpDilCon4OH(at, StpI2( 3, 3));
- #else // STP_OFFSETS
- StpH4 q00 = StpDilCon4H(at + StpF2(-1.0, -1.0) * rcpR);
- StpH4 q01 = StpDilCon4H(at + StpF2( 1.0, -1.0) * rcpR);
- StpH4 q02 = StpDilCon4H(at + StpF2( 3.0, -1.0) * rcpR);
- StpH4 q10 = StpDilCon4H(at + StpF2(-1.0, 1.0) * rcpR);
- StpH4 q11 = StpDilCon4H(at + StpF2( 1.0, 1.0) * rcpR);
- StpH4 q12 = StpDilCon4H(at + StpF2( 3.0, 1.0) * rcpR);
- StpH4 q20 = StpDilCon4H(at + StpF2(-1.0, 3.0) * rcpR);
- StpH4 q21 = StpDilCon4H(at + StpF2( 1.0, 3.0) * rcpR);
- StpH4 q22 = StpDilCon4H(at + StpF2( 3.0, 3.0) * rcpR);
- #endif // STP_OFFSETS
- //------------------------------------------------------------------------------------------------------------------------------
- // Taps that feed the filter, named 't<row><column>' on a 5x5 grid centered on 't22'.
- //   .   .  t02  .   .
- //   .  t11 t12 t13  .
- //  t20 t21 t22 t23 t24
- //   .  t31 t32 t33  .
- //   .   .  t42  .   .
- StpH1 t02 = q01.w;
- StpH1 t11 = q00.y;
- StpH1 t12 = q01.x;
- StpH1 t13 = q01.y;
- StpH1 t20 = q10.w;
- StpH1 t21 = q10.z;
- StpH1 t22 = q11.w;
- StpH1 t23 = q11.z;
- StpH1 t24 = q12.w;
- StpH1 t31 = q10.y;
- StpH1 t32 = q11.x;
- StpH1 t33 = q11.y;
- StpH1 t42 = q21.w;
- //------------------------------------------------------------------------------------------------------------------------------
- // 5-point minimum centered on each of 't12', 't21', 't22', 't23' and 't32'.
- StpH1 min12 = StpMin3H1(StpMin3H1(t11, t12, t13), t02, t22);
- StpH1 min21 = StpMin3H1(StpMin3H1(t20, t21, t22), t11, t31);
- StpH1 min22 = StpMin3H1(StpMin3H1(t21, t22, t23), t12, t32);
- StpH1 min23 = StpMin3H1(StpMin3H1(t22, t23, t24), t13, t33);
- StpH1 min32 = StpMin3H1(StpMin3H1(t31, t32, t33), t22, t42);
- //------------------------------------------------------------------------------------------------------------------------------
- // Center gets 'wMid', the remainder is split evenly over the four neighbors.
- StpH1 wMid = StpH1_(0.5);
- StpH1 wArm = (StpH1_(1.0) - wMid) * StpH1_(0.25);
- oC = min22 * wMid + min12 * wArm + min21 * wArm + min23 * wArm + min32 * wArm; }
- #endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- //
- // SPATIAL ANTI-ALIASING ENTRY POINT
- //
- //------------------------------------------------------------------------------------------------------------------------------
- // This should be pass-merged with STP_DIL.
- // It is only a shell; GEAA is kept separate because a modified form could be useful on its own.
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
- StpMF4 StpSaaLum4F(StpF2 p); // Declared only (no body in this section); 32-bit counterpart of StpSaaLum4H (gather4 on current luma).
- #if STP_OFFSETS
- StpMF4 StpSaaLum4OF(StpF2 p, StpI2 o); // Declared only; same fetch with an extra integer offset argument 'o'.
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- #define STP_GEAA 1
- StpMF4 StpGeaa4F(StpF2 p) { return StpSaaLum4F(p); } // GEAA's 4-value fetch simply forwards to the SAA luma fetch.
- #if STP_OFFSETS
- StpMF4 StpGeaa4OF(StpF2 p, StpI2 o) { return StpSaaLum4OF(p, o); } // Offset variant, forwarded the same way.
- #endif
- void StpGeaaF(out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI); // Forward declaration; the body is outside this section.
- //==============================================================================================================================
- // 32-bit spatial anti-aliasing entry point.
- // Converts the integer pixel position into a float position and runs StpGeaaF(), keeping only its first output ('oN').
- void StpSaaF(
- out StpMF1 oN, // Output value, receives the 'gW' result of StpGeaaF() (or raw luma in the bandwidth-test path).
- StpU2 pp, // Integer input pixel position.
- StpU4 con0) { // Packed constants: .xy unpack to kRcpC, .zw unpack to kHalfRcpC.
- //------------------------------------------------------------------------------------------------------------------------------
- // Rename constants.
- StpF2 kRcpC = StpF2_U2(con0.xy);
- StpF2 kHalfRcpC = StpF2_U2(con0.zw);
- //------------------------------------------------------------------------------------------------------------------------------
- // Float position of the pixel center: integer position scaled by kRcpC plus the half-texel term.
- StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
- //------------------------------------------------------------------------------------------------------------------------------
- // Bandwidth-test path: output one fetched luma value and skip GEAA entirely.
- #if STP_BUG_BW_SOL
- { oN = StpSaaLum4F(p).x; return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // The remaining GEAA outputs are required by its signature but are discarded here.
- StpMF1 gLuma;
- StpF2 gFilter;
- StpF2 gDilate;
- StpGeaaF(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC); }
- #endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // 16-BIT PATH
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
- // Gather4 on current luma.
- StpH4 StpSaaLum4H(StpF2 p); // Declared only (no body in this section).
- #if STP_OFFSETS
- StpH4 StpSaaLum4OH(StpF2 p, StpI2 o); // Declared only; same fetch with an extra integer offset argument 'o'.
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- #define STP_GEAA 1
- StpH4 StpGeaa4H(StpF2 p) { return StpSaaLum4H(p); } // GEAA's 4-value fetch simply forwards to the SAA luma fetch.
- #if STP_OFFSETS
- StpH4 StpGeaa4OH(StpF2 p, StpI2 o) { return StpSaaLum4OH(p, o); } // Offset variant, forwarded the same way.
- #endif
- void StpGeaaH(out StpH1 gW, out StpH1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI); // Forward declaration; the body is outside this section.
- //==============================================================================================================================
- // 16-bit spatial anti-aliasing entry point.
- // Converts the integer pixel position into a float position and runs StpGeaaH(), keeping only its first output ('oN').
- void StpSaaH(
- out StpH1 oN, // Output control (to be stored).
- StpU2 pp, // Input position {0 to size-1} across the input frame.
- StpU4 con0) { // Shared, first constant generated by StpPatCon().
- //------------------------------------------------------------------------------------------------------------------------------
- // Rename constants.
- StpF2 kRcpC = StpF2_U2(con0.xy);
- StpF2 kHalfRcpC = StpF2_U2(con0.zw);
- //------------------------------------------------------------------------------------------------------------------------------
- // Float position {0 to 1} across screen.
- StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
- //------------------------------------------------------------------------------------------------------------------------------
- // Bandwidth-test path: output one fetched luma value and skip GEAA entirely.
- #if STP_BUG_BW_SOL
- { oN = StpSaaLum4H(p).x; return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // The remaining GEAA outputs are required by its signature but are discarded here.
- StpH1 gLuma; // GEAA luma output (unused here).
- StpF2 gFilter; // Output position for anti-aliased color sampling if standalone (unused).
- StpF2 gDilate; // Output for {z,motion} dilation (unused).
- StpGeaaH(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC); }
- #endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- //
- // SCALING TAA ENTRY POINT
- //
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
- StpMF4 StpTaaCtl4F(StpF2 p); // 32-bit counterpart of StpTaaCtl4H: gather4 of GEAA control data.
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF4 StpTaaCol4RF(StpF2 p); // Current frame {color,anti} input, gather4 of one channel each (R here).
- StpMF4 StpTaaCol4GF(StpF2 p); // G channel.
- StpMF4 StpTaaCol4BF(StpF2 p); // B channel.
- StpMF4 StpTaaCol4AF(StpF2 p); // A channel.
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpTaaConF(StpF2 p); // 32-bit counterpart of StpTaaConH: bilinear sampling of low-frequency convergence.
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 StpTaaDitF(StpMU2 o); // 32-bit counterpart of StpTaaDitH: dither value for output pixel 'o'.
- //------------------------------------------------------------------------------------------------------------------------------
- StpU4 StpTaaMot4F(StpF2 p); // Gather4 of the current frame's packed motion input (same data the 16-bit path reads via StpTaaMot4H).
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF4 StpTaaPriFedF(StpF2 p); // Feedback fetch; the 16-bit counterpart is documented as bilinear with clamp to edge.
- StpMF4 StpTaaPriFed4RF(StpF2 p); // Feedback gather4, R channel.
- StpMF4 StpTaaPriFed4GF(StpF2 p); // Feedback gather4, G channel.
- StpMF4 StpTaaPriFed4BF(StpF2 p); // Feedback gather4, B channel.
- #if STP_MAX_MIN_10BIT
- StpMF4 StpTaaPriFedMaxF(StpF2 p); // Max sampling of feedback, used for dering.
- StpMF4 StpTaaPriFedMinF(StpF2 p); // Min sampling of feedback, used for dering.
- #endif // STP_MAX_MIN_10BIT
- #if STP_OFFSETS
- StpMF4 StpTaaPriFedOF(StpF2 p, StpI2 o); // Offset variants: same fetches with an extra integer offset argument 'o'.
- StpMF4 StpTaaPriFed4ROF(StpF2 p, StpI2 o);
- StpMF4 StpTaaPriFed4GOF(StpF2 p, StpI2 o);
- StpMF4 StpTaaPriFed4BOF(StpF2 p, StpI2 o);
- #endif // STP_OFFSETS
- //==============================================================================================================================
- // 32-bit scaling TAA entry point.
- // Fetches the current-frame 2x2 neighborhood {convergence, color, control, z+motion}, reprojects the feedback with the
- // selected motion, blends current color with feedback, and writes the new feedback 'rF' and the output 'rW'.
- void StpTaaF(
- StpMU1 lane, // Not referenced by this function body.
- StpMU2 o, // Integer pixel offset in output.
- out StpMF4 rF, // Return feedback (to be stored).
- out StpMF4 rW, // Return output (to be stored).
- StpU4 con0, // Packed constants, unpacked below with StpF2_U2().
- StpU4 con1,
- StpU4 con2,
- StpU4 con3) {
- //------------------------------------------------------------------------------------------------------------------------------
- // Fetched but not consumed by any later statement in this function.
- StpMF1 dit = StpTaaDitF(o);
- //------------------------------------------------------------------------------------------------------------------------------
- // Rename constants.
- StpF2 kCRcpF = StpF2_U2(con0.xy);
- StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
- StpF2 kRcpC = StpF2_U2(con1.xy);
- StpF2 kRcpF = StpF2_U2(con1.zw);
- StpF2 kHalfRcpF = StpF2_U2(con2.xy);
- StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
- StpF2 kHalfRcpC = StpF2_U2(con3.xy);
- StpF2 kF = StpF2_U2(con3.zw);
- //------------------------------------------------------------------------------------------------------------------------------
- // Bandwidth-test path: fetch one of each input kind, sum them into both outputs, and return early.
- // Only names that are declared at this point may be used here (the control fetch is StpTaaCtl4F).
- #if STP_BUG_BW_SOL
- { StpF2 oo = StpF2(o) * kRcpF;
- StpMF4 g4 = StpTaaCtl4F(oo);
- StpU4 m4 = StpTaaMot4F(oo);
- StpMF1 cnv = StpTaaConF(oo);
- StpMF4 f = StpTaaPriFedF(oo);
- StpMF4 c4R = StpTaaCol4RF(oo);
- rW = rF = g4 + StpMF4(m4) + StpMF4_(cnv) + f + c4R;
- return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // Locate the 2x2 neighborhood. All 'o' prefixed variables are offsets (positions/coordinates).
- // Float version of the integer pixel offset in output.
- StpF2 oI = StpF2(o);
- // Output pixel mapped into current-frame pixel units.
- StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
- // NW texel of the 2x2 quad.
- StpF2 oCNW = floor(oC + StpF2_(-0.5));
- // Center of the 2x2 quad (used for all gather4 fetches).
- StpF2 oC4 = oCNW * kRcpC + kRcpC;
- // Coordinates for the convergence fetch.
- StpF2 oC1 = oC * kRcpC;
- //==============================================================================================================================
- // FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
- //==============================================================================================================================
- StpMF1 cnv = StpTaaConF(oC1);
- StpMF4 c4R = StpTaaCol4RF(oC4);
- StpMF4 c4G = StpTaaCol4GF(oC4);
- StpMF4 c4B = StpTaaCol4BF(oC4);
- StpMF4 c4A = StpTaaCol4AF(oC4);
- StpMF4 g4 = StpTaaCtl4F(oC4);
- StpU4 m4 = StpTaaMot4F(oC4);
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Resolve position inside the 2x2 quad; the extra -0.5 moves from the NW texel position to its center.
- StpMF2 rP = StpMF2(oC - oCNW) - StpMF2_(0.5);
- //------------------------------------------------------------------------------------------------------------------------------
- // Distances to both ends of the quad along each axis, stored as {1-a, a} for x and {a, 1-a} for y.
- StpMF2 rPX10 = StpMF2(1.0, 0.0) + StpMF2(-rP.x, rP.x);
- StpMF2 rPY01 = StpMF2(0.0, 1.0) + StpMF2(rP.y, -rP.y);
- // Per-texel axis distances, then saturated squared distance to each of the four texels.
- StpMF4 pen4x = StpMF4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
- StpMF4 pen4y = StpMF4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
- StpMF4 pen4 = StpSatMF4(pen4x * pen4x + pen4y * pen4y);
- //==============================================================================================================================
- // DEPENDENT ON {CONVERGENCE}
- //==============================================================================================================================
- // Step convergence down by one frame unit (1/STP_FRAME_MAX), saturated.
- cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
- //------------------------------------------------------------------------------------------------------------------------------
- // Scale the squared distances by a convergence-derived factor, invert, and reduce to a single 'pen' term.
- StpMF1 pen = StpMF1_(cnv) * StpMF1_(STP_FRAME_MAX) + StpMF1_(1.0);
- pen = StpPrxLoSqrtMF1(pen);
- pen4 = StpSatMF4(StpMF4_(1.0) - pen4 * StpMF4_(pen));
- // NOTE(review): every '#if defined(STP_16BIT)' branch in this function is empty. If STP_16BIT is defined together with
- // STP_32BIT, the statements in the '#else' branches are skipped and later uses of 'match', 't', 'c' (and 'fRcp') have no
- // declaration. Confirm the two macros are mutually exclusive or restore the 16-bit branches.
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
- #endif // defined(STP_16BIT)
- //==============================================================================================================================
- // DEPENDENT ON {COLOR}
- //==============================================================================================================================
- StpMF4 wG;
- // Per-texel R + 2G + B sum used as a luma proxy.
- StpMF4 l4 = c4R + c4G * StpMF4_(2.0) + c4B;
- // Two pairwise luma differences; whichever is larger selects the weight layout below.
- StpMF2 difST = abs(l4.gr - l4.ab);
- StpP1 useS = difST.x > difST.y;
- StpMF2 wTrb = StpSatMF2(StpMF2(-rP.x, rP.x) + StpMF2(rP.y, -rP.y));
- StpMF2 wSrb = min(rPX10, rPY01);
- if(useS) wTrb = wSrb;
- StpMF2 wTga = rPY01 - wTrb;
- wG.rg = StpMF2(wTrb.x, wTga.x);
- wG.ba = StpMF2(wTrb.y, wTga.y);
- // Raise the weights to the 4th power.
- wG *= wG;
- wG *= wG;
- //------------------------------------------------------------------------------------------------------------------------------
- // Modulate by the fetched control values.
- wG *= g4;
- StpMF4 triMask = StpMF4_(1.0);
- StpMF2 wGmin2 = min(wG.xy, wG.zw);
- //==============================================================================================================================
- // DEPENDENT ON {Z,MOTION}
- //==============================================================================================================================
- // Find the texel with the lowest weight: scale its mask entry by STP_TAA_TRI_MASK_AVOID and set its packed motion to
- // 0xFFFFFFFF so it cannot win the min() below.
- if(wGmin2.x < wGmin2.y) {
- if(wG.x < wG.z) { triMask.x = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
- else { triMask.z = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
- else {
- if(wG.y < wG.w) { triMask.y = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
- else { triMask.w = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
- // Smallest packed value among the remaining texels.
- StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
- //------------------------------------------------------------------------------------------------------------------------------
- wG *= triMask;
- //------------------------------------------------------------------------------------------------------------------------------
- // Unpack the selected value into 'mXY', which is subtracted from positions below as the motion offset.
- StpF2 mXY;
- StpMvUnpackV(mXY, m1);
- //==============================================================================================================================
- // GET ALL FEEDBACK FILTERING DONE
- //==============================================================================================================================
- // Feedback position for this output pixel after removing motion.
- StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF3 f;
- #if STP_TAA_PRX_LANCZOS
- // Same position in feedback pixel units, its NW texel, and the 2x2 quad center in normalized coordinates.
- StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
- StpF2 oMNW = floor(oM + StpF2_(-0.5));
- StpF2 oM4 = oMNW * kRcpF + kRcpF;
- StpMF3 fMax, fMin;
- #else // STP_TAA_PRX_LANCZOS
- // No kernel selected: a single feedback fetch.
- f = StpTaaPriFedF(oF).rgb;
- #endif // STP_TAA_PRX_LANCZOS
- //==============================================================================================================================
- // Kernel variant 1: four feedback fetches stacked vertically at the reprojected x position.
- #if (STP_TAA_PRX_LANCZOS == 1)
- #if STP_OFFSETS
- StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
- StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
- StpMF3 f1 = StpTaaPriFedOF(oM0, StpI2(0, 1)).rgb;
- StpMF3 f2 = StpTaaPriFedOF(oM0, StpI2(0, 2)).rgb;
- StpMF3 f3 = StpTaaPriFedOF(oM0, StpI2(0, 3)).rgb;
- #else // STP_OFFSETS
- StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
- StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
- StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
- StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
- StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
- StpMF3 f1 = StpTaaPriFedF(oM1).rgb;
- StpMF3 f2 = StpTaaPriFedF(oM2).rgb;
- StpMF3 f3 = StpTaaPriFedF(oM3).rgb;
- #endif // STP_OFFSETS
- // Dering bounds come either from dedicated min/max fetches or from a 2x2 gather per channel.
- #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- fMax = StpTaaPriFedMaxF(oM4).rgb;
- fMin = StpTaaPriFedMinF(oM4).rgb;
- #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
- StpMF4 f4R = StpTaaPriFed4RF(oM4);
- StpMF4 f4G = StpTaaPriFed4GF(oM4);
- StpMF4 f4B = StpTaaPriFed4BF(oM4);
- #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Fractional position, then the four row weights: half-scaled row distances pushed through a polynomial window.
- StpMF2 fP = StpMF2(oM - oMNW);
- StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
- fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
- fPY *= fPY;
- StpMF4 fPY4 = fPY * fPY;
- fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- // Normalization factor: reciprocal of the weight sum.
- StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // DEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Weighted sum of the four taps, normalized, then optionally clamped to the neighborhood range.
- f.rgb = f0 * StpMF3_(fPY.r) + f1 * StpMF3_(fPY.g) + f2 * StpMF3_(fPY.b) + f3 * StpMF3_(fPY.a);
- f.rgb *= StpMF3_(fRcp);
- #if STP_TAA_PRX_LANCZOS_DERING
- #if (STP_MAX_MIN_10BIT == 0)
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- fMax.r = max(StpMax3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
- fMax.g = max(StpMax3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
- fMax.b = max(StpMax3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
- fMin.r = min(StpMin3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
- fMin.g = min(StpMin3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
- fMin.b = min(StpMin3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
- f = clamp(f, fMin, fMax);
- #endif // defined(STP_16BIT)
- #else // (STP_MAX_MIN_10BIT == 0)
- f = clamp(f, fMin, fMax);
- #endif // (STP_MAX_MIN_10BIT == 0)
- #endif // STP_TAA_PRX_LANCZOS_DERING
- #endif // (STP_TAA_PRX_LANCZOS == 1)
- //==============================================================================================================================
- // Kernel variant 2: a 4x4 footprint built from four 2x2 gathers per channel around the quad center.
- #if (STP_TAA_PRX_LANCZOS == 2)
- #if STP_OFFSETS
- StpMF4 f4R0 = StpTaaPriFed4ROF(oM4, StpI2(-1, -1));
- StpMF4 f4G0 = StpTaaPriFed4GOF(oM4, StpI2(-1, -1));
- StpMF4 f4B0 = StpTaaPriFed4BOF(oM4, StpI2(-1, -1));
- StpMF4 f4R1 = StpTaaPriFed4ROF(oM4, StpI2( 1, -1));
- StpMF4 f4G1 = StpTaaPriFed4GOF(oM4, StpI2( 1, -1));
- StpMF4 f4B1 = StpTaaPriFed4BOF(oM4, StpI2( 1, -1));
- StpMF4 f4R2 = StpTaaPriFed4ROF(oM4, StpI2(-1, 1));
- StpMF4 f4G2 = StpTaaPriFed4GOF(oM4, StpI2(-1, 1));
- StpMF4 f4B2 = StpTaaPriFed4BOF(oM4, StpI2(-1, 1));
- StpMF4 f4R3 = StpTaaPriFed4ROF(oM4, StpI2( 1, 1));
- StpMF4 f4G3 = StpTaaPriFed4GOF(oM4, StpI2( 1, 1));
- StpMF4 f4B3 = StpTaaPriFed4BOF(oM4, StpI2( 1, 1));
- #else // STP_OFFSETS
- StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
- StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
- StpF2 oM2 = oM4 + StpF2(-kRcpF.x, kRcpF.y);
- StpF2 oM3 = oM4 + StpF2( kRcpF.x, kRcpF.y);
- StpMF4 f4R0 = StpTaaPriFed4RF(oM0);
- StpMF4 f4G0 = StpTaaPriFed4GF(oM0);
- StpMF4 f4B0 = StpTaaPriFed4BF(oM0);
- StpMF4 f4R1 = StpTaaPriFed4RF(oM1);
- StpMF4 f4G1 = StpTaaPriFed4GF(oM1);
- StpMF4 f4B1 = StpTaaPriFed4BF(oM1);
- StpMF4 f4R2 = StpTaaPriFed4RF(oM2);
- StpMF4 f4G2 = StpTaaPriFed4GF(oM2);
- StpMF4 f4B2 = StpTaaPriFed4BF(oM2);
- StpMF4 f4R3 = StpTaaPriFed4RF(oM3);
- StpMF4 f4G3 = StpTaaPriFed4GF(oM3);
- StpMF4 f4B3 = StpTaaPriFed4BF(oM3);
- #endif // STP_OFFSETS
- #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- fMax = StpTaaPriFedMaxF(oM4).rgb;
- fMin = StpTaaPriFedMinF(oM4).rgb;
- #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Column (fPX) and row (fPY) weights, built the same way as the row weights in variant 1.
- StpMF2 fP = StpMF2(oM - oMNW);
- StpMF4 fPX = StpMF4_(-fP.x * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
- StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
- fPX = StpSatMF4(StpMF4_(1.0) - fPX * fPX);
- fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
- fPX *= fPX;
- fPY *= fPY;
- StpMF4 fPX4 = fPX * fPX;
- StpMF4 fPY4 = fPY * fPY;
- fPX = (StpMF4_(1.0 + 81.0 / 175.0) * fPX4 - StpMF4_(81.0 / 175.0)) * fPX;
- fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- // Normalize each axis so its weights sum to (approximately) one.
- fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
- fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
- #endif // defined(STP_16BIT)
- // Separable 2D weights: one row of column weights per row weight.
- StpMF4 fPX0 = fPX * StpMF4_(fPY.r);
- StpMF4 fPX1 = fPX * StpMF4_(fPY.g);
- StpMF4 fPX2 = fPX * StpMF4_(fPY.b);
- StpMF4 fPX3 = fPX * StpMF4_(fPY.a);
- //------------------------------------------------------------------------------------------------------------------------------
- // DEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- // 16-tap weighted sum per channel; gather components are ordered {w,z} for the upper row and {x,y} for the lower row.
- f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
- f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
- f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
- f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
- f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
- f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
- f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
- f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
- f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
- f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
- f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
- f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
- #endif // defined(STP_16BIT)
- #if STP_TAA_PRX_LANCZOS_DERING
- #if (STP_MAX_MIN_10BIT == 0)
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- // Dering bounds from one component of each of the four gathers.
- fMax.r = max(StpMax3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
- fMax.g = max(StpMax3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
- fMax.b = max(StpMax3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
- fMin.r = min(StpMin3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
- fMin.g = min(StpMin3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
- fMin.b = min(StpMin3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
- f = clamp(f, fMin, fMax);
- #endif // defined(STP_16BIT)
- #else // (STP_MAX_MIN_10BIT == 0)
- f = clamp(f, fMin, fMax);
- #endif // (STP_MAX_MIN_10BIT == 0)
- #endif // STP_TAA_PRX_LANCZOS_DERING
- #endif // (STP_TAA_PRX_LANCZOS == 2)
- //==============================================================================================================================
- // DISPLACEMENT
- //==============================================================================================================================
- // Feedback fetched at four positions one current-frame texel apart, each shifted by the motion offset.
- StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
- StpF2 oD1 = StpF2(kRcpC.x, 0.0) + oD0;
- StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
- StpF2 oD3 = StpF2(0.0, -kRcpC.y) + oD0;
- StpMF3 d0 = StpTaaPriFedF(oD0).rgb;
- StpMF3 d1 = StpTaaPriFedF(oD1).rgb;
- StpMF3 d2 = StpTaaPriFedF(oD2).rgb;
- StpMF3 d3 = StpTaaPriFedF(oD3).rgb;
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- // Normalize the spatial weights by their sum.
- wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // Second weight set: reciprocal of the luma-weighted absolute difference between each texel and the feedback color,
- // so texels closer to the feedback get larger weights; the masked texel is attenuated by triMask.
- StpMF4 wT = abs(c4R - StpMF4_(f.r)) * StpMF4_(STP_LUMA_R) +
- abs(c4G - StpMF4_(f.g)) * StpMF4_(STP_LUMA_G) +
- abs(c4B - StpMF4_(f.b)) * StpMF4_(STP_LUMA_B);
- wT = StpPrxLoRcpMF4(wT * StpMF4_(STP_ANTI_MAX) + StpMF4_(STP_ANTI_MIN)) * triMask;
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // 'match' is the alpha channel interpolated with the average of both weight sets; it scales convergence.
- StpMF4 wM = wT * StpMF4_(0.5) + wG * StpMF4_(0.5);
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
- #endif // defined(STP_16BIT)
- cnv *= match;
- //------------------------------------------------------------------------------------------------------------------------------
- // DEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Displaced feedback interpolated with each weight set.
- StpMF3 dG = d0 * StpMF3_(wG.x) + d1 * StpMF3_(wG.y) + d2 * StpMF3_(wG.z) + d3 * StpMF3_(wG.w);
- StpMF3 dT = d0 * StpMF3_(wT.x) + d1 * StpMF3_(wT.y) + d2 * StpMF3_(wT.z) + d3 * StpMF3_(wT.w);
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- #else // defined(STP_16BIT)
- // Current-frame color interpolated with wT ('t') and with wG ('c').
- StpMF3 t = StpMF3(
- c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
- c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
- c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
- StpMF3 c = StpMF3(
- c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
- c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
- c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // Blend ratio cnv / (cnv + 1/STP_FRAME_MAX), saturated; 'b' mixes feedback and current color with it.
- StpMF1 bln = StpSatMF1(cnv * StpPrxLoRcpMF1(cnv + StpMF1_(1.0 / STP_FRAME_MAX)));
- StpMF1 blnT = StpMF1_(1.0) - bln;
- StpMF3 b = f * StpMF3_(bln) + t * StpMF3_(blnT);
- // Clamp range for both results: spans the spatially weighted color 'c' and the blend 'b'.
- StpMF3 minNe = min(c, b);
- StpMF3 maxNe = max(c, b);
- //------------------------------------------------------------------------------------------------------------------------------
- // 'penC' is the pen target color; penWF holds its blend amounts for output (.x) and feedback (.y).
- StpMF3 penC = StpSatMF3(c + (f - dG) * StpMF3_(StpMF1_(0.9875) * match));
- StpMF2 penWF;
- penWF.x = pen * StpMF1_(STP_TAA_PEN_W);
- penWF.y = pen * lerp(StpMF1_(STP_TAA_PEN_F0), StpMF1_(STP_TAA_PEN_F1), cnv);
- StpMF2 penNotWF = StpMF2_(1.0) - penWF;
- // Base result: current color plus the feedback-minus-displaced-feedback delta, then blended toward feedback by 'bln'.
- rF.rgb = t + (f - dT);
- rF.rgb = rF.rgb * StpMF3_(blnT) + f * StpMF3_(bln);
- // rW must be derived before rF is overwritten, since both start from the same base value.
- rW.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.x) + penC * StpMF3_(penWF.x));
- rF.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.y) + penC * StpMF3_(penWF.y));
- rW.rgb = clamp(rW.rgb, minNe, maxNe);
- rF.rgb = clamp(rF.rgb, minNe, maxNe);
- //------------------------------------------------------------------------------------------------------------------------------
- // Output only: square the color, and apply StpToneInvMF3() when STP_POSTMAP is 0. Feedback is stored unsquared.
- rW.rgb *= rW.rgb;
- #if (STP_POSTMAP == 0)
- StpToneInvMF3(rW.rgb);
- #endif // (STP_POSTMAP == 0)
- // Alpha of both results is written as zero.
- rF.a = rW.a = StpMF1(0.0); }
- #endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // 16-BIT PATH
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
- // Callbacks.
- // Gather4 of GEAA control data.
- StpH4 StpTaaCtl4H(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Current frame {color,anti} input.
- // Gather4 specific channels.
- StpH4 StpTaaCol4RH(StpF2 p);
- StpH4 StpTaaCol4GH(StpF2 p);
- StpH4 StpTaaCol4BH(StpF2 p);
- StpH4 StpTaaCol4AH(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Bilinear sampling of low-frequency convergence.
- StpH1 StpTaaConH(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Dither value {0 to 1} this should be output pixel frequency spatial temporal blue noise.
- StpH1 StpTaaDitH(StpW2 o);
- //------------------------------------------------------------------------------------------------------------------------------
- // Gather4 current frame motion {z,x,y} packed input, same as the 32-bit version (just renamed).
- StpU4 StpTaaMot4H(StpF2 p);
- //------------------------------------------------------------------------------------------------------------------------------
- // Feedback {color, alpha}.
- // Bilinear fetch with clamp to edge.
- StpH4 StpTaaPriFedH(StpF2 p);
- // Gather4.
- StpH4 StpTaaPriFed4RH(StpF2 p);
- StpH4 StpTaaPriFed4GH(StpF2 p);
- StpH4 StpTaaPriFed4BH(StpF2 p);
- // Min/max sampling used for dering.
- #if STP_MAX_MIN_10BIT
- StpH4 StpTaaPriFedMaxH(StpF2 p);
- StpH4 StpTaaPriFedMinH(StpF2 p);
- #endif // STP_MAX_MIN_10BIT
- // Sampling with offsets.
- #if STP_OFFSETS
- StpH4 StpTaaPriFedOH(StpF2 p, StpI2 o);
- StpH4 StpTaaPriFed4ROH(StpF2 p, StpI2 o);
- StpH4 StpTaaPriFed4GOH(StpF2 p, StpI2 o);
- StpH4 StpTaaPriFed4BOH(StpF2 p, StpI2 o);
- #endif // STP_OFFSETS
- //==============================================================================================================================
- void StpTaaH(
- StpW1 lane, // Currently unused but in the interface for possible future expansion.
- StpW2 o, // Integer pixel offset in output.
- out StpH4 rF, // Return Feedback (to be stored).
- out StpH4 rW, // Return Output (to be stored).
- StpU4 con0, // Constants generated by StpTaaCon().
- StpU4 con1,
- StpU4 con2,
- StpU4 con3) {
- //------------------------------------------------------------------------------------------------------------------------------
- // This is only currently used for debug.
- StpH1 dit = StpTaaDitH(o);
- //------------------------------------------------------------------------------------------------------------------------------
- // Rename constants.
- StpF2 kCRcpF = StpF2_U2(con0.xy);
- StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
- StpF2 kRcpC = StpF2_U2(con1.xy);
- StpF2 kRcpF = StpF2_U2(con1.zw);
- StpF2 kHalfRcpF = StpF2_U2(con2.xy);
- StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
- StpF2 kHalfRcpC = StpF2_U2(con3.xy);
- StpF2 kF = StpF2_U2(con3.zw);
- //------------------------------------------------------------------------------------------------------------------------------
- // Check the streaming bandwidth limit.
- #if STP_BUG_BW_SOL
- { StpF2 oo = StpF2(o) * kRcpF;
- StpH4 g4 = StpTaaCtl4RH(oo);
- StpU4 m4 = StpTaaMot4H(oo);
- StpH1 cnv = StpTaaConH(oo);
- StpH4 f = StpTaaPriFedH(oo);
- StpH4 c4R = StpTaaCol4RH(oo);
- rW = rF = l4 + g4 + StpH4(m4) + StpH4_(cnv) + f + c4R;
- return; }
- #endif // STP_BUG_BW_SOL
- //------------------------------------------------------------------------------------------------------------------------------
- // Locate 2x2 neighborhood.
- // Float version of integer pixel offset in output.
- // All the 'o' prefixed variables are offset (aka position/coordinate) related.
- StpF2 oI = StpF2(o);
- // This gets to the center of the 2x2 quad directly because of possibility of shader/tex precision mismatch.
- // Precision mismatch could yield different 2x2 quads.
- StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
- // NW of 2x2 quad.
- StpF2 oCNW = floor(oC + StpF2_(-0.5));
- // Center of the 2x2 quad.
- StpF2 oC4 = oCNW * kRcpC + kRcpC;
- // Coordinates for low frequency convergence.
- StpF2 oC1 = oC * kRcpC;
- //==============================================================================================================================
- // FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
- //==============================================================================================================================
- // Fetch low-frequency convergence.
- StpH1 cnv = StpTaaConH(oC1);
- // Fetch color.
- StpH4 c4R = StpTaaCol4RH(oC4);
- StpH4 c4G = StpTaaCol4GH(oC4);
- StpH4 c4B = StpTaaCol4BH(oC4);
- StpH4 c4A = StpTaaCol4AH(oC4);
- // Control (GEAA weights)
- StpH4 g4 = StpTaaCtl4H(oC4);
- // Fetch {z,motion}.
- StpU4 m4 = StpTaaMot4H(oC4);
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Setup resolve position {0 to 1} inside 2x2 quad.
- // The extra -0.5 is to get from NW position to center.
- StpH2 rP = StpH2(oC - oCNW) - StpH2_(0.5);
- //------------------------------------------------------------------------------------------------------------------------------
- // The 'rP' is resolve position {0 to 1} inside 2x2 quad, this is distance to ends of 2x2.
- // Instead of using {a,a-1} this uses {a,1-a} for reuse with the simple angular filtering.
- StpH2 rPX10 = StpH2(1.0, 0.0) + StpH2(-rP.x, rP.x);
- StpH2 rPY01 = StpH2(0.0, 1.0) + StpH2(rP.y, -rP.y);
- // Distance^2 {0 := on, 1 := off}.
- StpH4 pen4x = StpH4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
- StpH4 pen4y = StpH4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
- // Pen starts with distance squared to all 2x2 points.
- StpH4 pen4 = StpSatH4(pen4x * pen4x + pen4y * pen4y);
- //==============================================================================================================================
- // DEPENDENT ON {CONVERGENCE}
- //==============================================================================================================================
- // Low frequency convergence keeps the next frame value, so subtract one frame.
- cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
- //------------------------------------------------------------------------------------------------------------------------------
- // Pen size based on convergence.
- StpH1 pen = StpH1_(cnv) * StpH1_(STP_FRAME_MAX) + StpH1_(1.0);
- pen = StpPrxLoSqrtH1(pen);
- pen4 = StpSatH4(StpH4_(1.0) - pen4 * StpH4_(pen));
- #if defined(STP_16BIT)
- StpH2 pen2 = pen4.xy * pen4.xy + pen4.zw * pen4.zw;
- pen = StpSatH1(pen2.x + pen2.y);
- #else // defined(STP_16BIT)
- pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
- #endif // defined(STP_16BIT)
- //==============================================================================================================================
- // DEPENDENT ON {COLOR}
- //==============================================================================================================================
- // Simple angular filtering (gets rid of block artifacts, adds sawtooth artifacts which are not a problem in practice).
- // Create a GEAA based weighting for no temporal feedback case.
- StpH4 wG;
- // Selects between either (S) or (T).
- // (S) A--B ... (T) A--B
- // |\ | | /|
- // | \| |/ |
- // R--G R--G
- // S and T only use the other diagonal.
- // Exact luma not required.
- StpH4 l4 = c4R + c4G * StpH4_(2.0) + c4B;
- StpH2 difST = abs(l4.gr - l4.ab);
- // Choose configuration based on which difference is maximum.
- StpP1 useS = difST.x > difST.y;
- // Choose interpolation weights given the configuration.
- // _T__________ _S__________
- // R | sat( -x+ y) min(1-x, y) = y-G
- // G | min( x, y) sat(x-1+ y) = y-R
- // B | sat( x- y) min( x,1-y) = (1-y)-A
- // A | min(1-x,1-y) sat(1-x- y) = (1-y)-B
- // Difference between S and T is a {x} vs {1-x} and a RGBA vs GRAB swap.
- StpH2 wTrb = StpSatH2(StpH2(-rP.x, rP.x) + StpH2(rP.y, -rP.y));
- StpH2 wSrb = min(rPX10, rPY01);
- if(useS) wTrb = wSrb;
- StpH2 wTga = rPY01 - wTrb;
- wG.rg = StpH2(wTrb.x, wTga.x);
- wG.ba = StpH2(wTrb.y, wTga.y);
- // Shaping is needed to get good high area scaling (remove the transition region).
- wG *= wG;
- wG *= wG;
- //------------------------------------------------------------------------------------------------------------------------------
- // Scale directional interpolation weights by GEAA weights to introduce anti-aliasing.
- wG *= g4;
- // Triangular nearest.
- // This works by removing the corner which contributes the least to the spatial interpolated result.
- StpH4 triMask = StpH4_(1.0);
- StpH2 wGmin2 = min(wG.xy, wG.zw);
- //==============================================================================================================================
- // DEPENDENT ON {Z,MOTION}
- //==============================================================================================================================
- // This overwrites gather4 results.
- if(wGmin2.x < wGmin2.y) {
- if(wG.x < wG.z) { triMask.x = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
- else { triMask.z = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
- else {
- if(wG.y < wG.w) { triMask.y = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
- else { triMask.w = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
- StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
- //------------------------------------------------------------------------------------------------------------------------------
- // Want to consume 'triMask' to free up register space.
- wG *= triMask;
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 mXY;
- // Motion 'm' units are {1 := move by one screen}.
- StpMvUnpackV(mXY, m1);
- //==============================================================================================================================
- // GET ALL FEEDBACK FILTERING DONE
- //==============================================================================================================================
- // This region of code will have the highest register pressure in some configs, so doing as early as possible.
- // Setup for fetch feedback.
- StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
- //------------------------------------------------------------------------------------------------------------------------------
- StpH3 f;
- // Lanczos common.
- #if STP_TAA_PRX_LANCZOS
- // Motion reprojection position in feedback pixels.
- StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
- // NW of center 2x2 quad.
- StpF2 oMNW = floor(oM + StpF2_(-0.5));
- // Center of the center 2x2 quad.
- StpF2 oM4 = oMNW * kRcpF + kRcpF;
- StpH3 fMax, fMin;
- #else // STP_TAA_PRX_LANCZOS
- // Sample nearest feedback.
- f = StpTaaPriFedH(oF).rgb;
- #endif // STP_TAA_PRX_LANCZOS
- //==============================================================================================================================
- #if (STP_TAA_PRX_LANCZOS == 1)
- // This one does a fixed 1x4 to try to cut cost in half relative to the complete 4x4.
- // It uses bilinear sampling on the 'x'.
- // Lanczos on the 'y' because most floating camera motion is 'y' based.
- // Fetch {feedback}.
- #if STP_OFFSETS
- // TODO: Can optimize out the 'oM4.y' add with constant change.
- StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
- StpH3 f0 = StpTaaPriFedH(oM0).rgb;
- StpH3 f1 = StpTaaPriFedOH(oM0, StpI2(0, 1)).rgb;
- StpH3 f2 = StpTaaPriFedOH(oM0, StpI2(0, 2)).rgb;
- StpH3 f3 = StpTaaPriFedOH(oM0, StpI2(0, 3)).rgb;
- #else // STP_OFFSETS
- StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
- StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
- StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
- StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
- StpH3 f0 = StpTaaPriFedH(oM0).rgb;
- StpH3 f1 = StpTaaPriFedH(oM1).rgb;
- StpH3 f2 = StpTaaPriFedH(oM2).rgb;
- StpH3 f3 = StpTaaPriFedH(oM3).rgb;
- #endif // STP_OFFSETS
- // Want this last because it's used last.
- #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- fMax = StpTaaPriFedMaxH(oM4).rgb;
- fMin = StpTaaPriFedMinH(oM4).rgb;
- #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
- // Without {min,max} sampling, must gather4.
- StpH4 f4R = StpTaaPriFed4RH(oM4);
- StpH4 f4G = StpTaaPriFed4GH(oM4);
- StpH4 f4B = StpTaaPriFed4BH(oM4);
- #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Convert to approximate lanczos weights.
- // Feedback position {0 to 1} inside 2x2 quad + 0.5.
- StpH2 fP = StpH2(oM - oMNW);
- // Convert to approximate lanczos weights.
- // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}.
- StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
- // Weights in one axis.
- fPY = StpSatH4(StpH4_(1.0) - fPY * fPY);
- fPY *= fPY;
- StpH4 fPY4 = fPY * fPY;
- // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive)
- fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY;
- #if defined(STP_16BIT)
- StpH2 fRcp2 = fPY.rg + fPY.ba;
- StpH1 fRcp = StpPrxLoRcpH1(fRcp2.x + fRcp2.y);
- #else // defined(STP_16BIT)
- StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // DEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- f.rgb = f0 * StpH3_(fPY.r) + f1 * StpH3_(fPY.g) + f2 * StpH3_(fPY.b) + f3 * StpH3_(fPY.a);
- f.rgb *= StpH3_(fRcp);
- #if STP_TAA_PRX_LANCZOS_DERING
- #if (STP_MAX_MIN_10BIT == 0)
- #if defined(STP_16BIT)
- StpH2 fXnyR = max(max(StpH2(f4R.x, -f4R.x), StpH2(f4R.y, -f4R.y)),
- max(StpH2(f4R.z, -f4R.z), StpH2(f4R.w, -f4R.w)));
- StpH2 fXnyG = max(max(StpH2(f4G.x, -f4G.x), StpH2(f4G.y, -f4G.y)),
- max(StpH2(f4G.z, -f4G.z), StpH2(f4G.w, -f4G.w)));
- StpH2 fXnyB = max(max(StpH2(f4B.x, -f4B.x), StpH2(f4B.y, -f4B.y)),
- max(StpH2(f4B.z, -f4B.z), StpH2(f4B.w, -f4B.w)));
- f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x));
- #else // defined(STP_16BIT)
- fMax.r = max(StpMax3H1(f4R.x, f4R.y, f4R.z), f4R.w);
- fMax.g = max(StpMax3H1(f4G.x, f4G.y, f4G.z), f4G.w);
- fMax.b = max(StpMax3H1(f4B.x, f4B.y, f4B.z), f4B.w);
- fMin.r = min(StpMin3H1(f4R.x, f4R.y, f4R.z), f4R.w);
- fMin.g = min(StpMin3H1(f4G.x, f4G.y, f4G.z), f4G.w);
- fMin.b = min(StpMin3H1(f4B.x, f4B.y, f4B.z), f4B.w);
- f = clamp(f, fMin, fMax);
- #endif // defined(STP_16BIT)
- #else // (STP_MAX_MIN_10BIT == 0)
- // Leaning on {min,max} sampling so no 16/32-bit permutation.
- f = clamp(f, fMin, fMax);
- #endif // (STP_MAX_MIN_10BIT == 0)
- #endif // STP_TAA_PRX_LANCZOS_DERING
- #endif // (STP_TAA_PRX_LANCZOS == 1)
- //==============================================================================================================================
- #if (STP_TAA_PRX_LANCZOS == 2)
- // Unstable approximate lanczos feedback, full 4x4.
- // a = saturate(1-x*x)
- // u = 1+v
- // v = moves the zero crossing to 0.5
- // w = adjusts the shape
- // u*a^w - v*a^2
- // Fetch {feedback}.
- // 0w 0z 1w 1z | R
- // 0x 0y 1x 1y | G
- // 2w 2z 3w 3z | B
- // 2x 2y 3x 3y | A
- // -- -- -- --
- // R G B A
- #if STP_OFFSETS
- StpH4 f4R0 = StpTaaPriFed4ROH(oM4, StpI2(-1, -1));
- StpH4 f4G0 = StpTaaPriFed4GOH(oM4, StpI2(-1, -1));
- StpH4 f4B0 = StpTaaPriFed4BOH(oM4, StpI2(-1, -1));
- StpH4 f4R1 = StpTaaPriFed4ROH(oM4, StpI2( 1, -1));
- StpH4 f4G1 = StpTaaPriFed4GOH(oM4, StpI2( 1, -1));
- StpH4 f4B1 = StpTaaPriFed4BOH(oM4, StpI2( 1, -1));
- StpH4 f4R2 = StpTaaPriFed4ROH(oM4, StpI2(-1, 1));
- StpH4 f4G2 = StpTaaPriFed4GOH(oM4, StpI2(-1, 1));
- StpH4 f4B2 = StpTaaPriFed4BOH(oM4, StpI2(-1, 1));
- StpH4 f4R3 = StpTaaPriFed4ROH(oM4, StpI2( 1, 1));
- StpH4 f4G3 = StpTaaPriFed4GOH(oM4, StpI2( 1, 1));
- StpH4 f4B3 = StpTaaPriFed4BOH(oM4, StpI2( 1, 1));
- #else // STP_OFFSETS
- StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
- StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
- StpF2 oM2 = oM4 + StpF2(-kRcpF.x, kRcpF.y);
- StpF2 oM3 = oM4 + StpF2( kRcpF.x, kRcpF.y);
- StpH4 f4R0 = StpTaaPriFed4RH(oM0);
- StpH4 f4G0 = StpTaaPriFed4GH(oM0);
- StpH4 f4B0 = StpTaaPriFed4BH(oM0);
- StpH4 f4R1 = StpTaaPriFed4RH(oM1);
- StpH4 f4G1 = StpTaaPriFed4GH(oM1);
- StpH4 f4B1 = StpTaaPriFed4BH(oM1);
- StpH4 f4R2 = StpTaaPriFed4RH(oM2);
- StpH4 f4G2 = StpTaaPriFed4GH(oM2);
- StpH4 f4B2 = StpTaaPriFed4BH(oM2);
- StpH4 f4R3 = StpTaaPriFed4RH(oM3);
- StpH4 f4G3 = StpTaaPriFed4GH(oM3);
- StpH4 f4B3 = StpTaaPriFed4BH(oM3);
- #endif // STP_OFFSETS
- // Want this last because it's used last.
- #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- fMax = StpTaaPriFedMaxH(oM4).rgb;
- fMin = StpTaaPriFedMinH(oM4).rgb;
- #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Feedback position {0 to 1} inside 2x2 quad + 0.5.
- StpH2 fP = StpH2(oM - oMNW);
- // Convert to approximate lanczos weights.
- // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}.
- StpH4 fPX = StpH4_(-fP.x * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
- StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
- // Weights in both axis.
- fPX = StpSatH4(StpH4_(1.0) - fPX * fPX);
- fPY = StpSatH4(StpH4_(1.0) - fPY * fPY);
- fPX *= fPX;
- fPY *= fPY;
- StpH4 fPX4 = fPX * fPX;
- StpH4 fPY4 = fPY * fPY;
- // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive)
- fPX = (StpH4_(1.0 + 81.0 / 175.0) * fPX4 - StpH4_(81.0 / 175.0)) * fPX;
- fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY;
- #if defined(STP_16BIT)
- StpH2 fRcpX = fPX.rg + fPX.ba;
- StpH2 fRcpY = fPY.rg + fPY.ba;
- fPX *= StpH4_(StpPrxLoRcpH1(fRcpX.r + fRcpX.y));
- fPY *= StpH4_(StpPrxLoRcpH1(fRcpY.r + fRcpY.y));
- #else // defined(STP_16BIT)
- fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
- fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
- #endif // defined(STP_16BIT)
- StpH4 fPX0 = fPX * StpH4_(fPY.r);
- StpH4 fPX1 = fPX * StpH4_(fPY.g);
- StpH4 fPX2 = fPX * StpH4_(fPY.b);
- StpH4 fPX3 = fPX * StpH4_(fPY.a);
- //------------------------------------------------------------------------------------------------------------------------------
- // DEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- StpH2 fR2 = f4R0.wz * fPX0.xy + f4R1.wz * fPX0.zw + f4R0.xy * fPX1.xy + f4R1.xy * fPX1.zw +
- f4R2.wz * fPX2.xy + f4R3.wz * fPX2.zw + f4R2.xy * fPX3.xy + f4R3.xy * fPX3.zw;
- StpH2 fG2 = f4G0.wz * fPX0.xy + f4G1.wz * fPX0.zw + f4G0.xy * fPX1.xy + f4G1.xy * fPX1.zw +
- f4G2.wz * fPX2.xy + f4G3.wz * fPX2.zw + f4G2.xy * fPX3.xy + f4G3.xy * fPX3.zw;
- StpH2 fB2 = f4B0.wz * fPX0.xy + f4B1.wz * fPX0.zw + f4B0.xy * fPX1.xy + f4B1.xy * fPX1.zw +
- f4B2.wz * fPX2.xy + f4B3.wz * fPX2.zw + f4B2.xy * fPX3.xy + f4B3.xy * fPX3.zw;
- f = StpH3(fR2.x + fR2.y, fG2.x + fG2.y, fB2.x + fB2.y);
- #else // defined(STP_16BIT)
- f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
- f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
- f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
- f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
- f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
- f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
- f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
- f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
- f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
- f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
- f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
- f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
- #endif // defined(STP_16BIT)
- #if STP_TAA_PRX_LANCZOS_DERING
- #if (STP_MAX_MIN_10BIT == 0)
- #if defined(STP_16BIT)
- StpH2 fXnyR = max(max(StpH2(f4R0.y, -f4R0.y), StpH2(f4R1.x, -f4R1.x)),
- max(StpH2(f4R2.z, -f4R2.z), StpH2(f4R3.w, -f4R3.w)));
- StpH2 fXnyG = max(max(StpH2(f4G0.y, -f4G0.y), StpH2(f4G1.x, -f4G1.x)),
- max(StpH2(f4G2.z, -f4G2.z), StpH2(f4G3.w, -f4G3.w)));
- StpH2 fXnyB = max(max(StpH2(f4B0.y, -f4B0.y), StpH2(f4B1.x, -f4B1.x)),
- max(StpH2(f4B2.z, -f4B2.z), StpH2(f4B3.w, -f4B3.w)));
- f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x));
- #else // defined(STP_16BIT)
- fMax.r = max(StpMax3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
- fMax.g = max(StpMax3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
- fMax.b = max(StpMax3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
- fMin.r = min(StpMin3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
- fMin.g = min(StpMin3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
- fMin.b = min(StpMin3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
- f = clamp(f, fMin, fMax);
- #endif // defined(STP_16BIT)
- #else // (STP_MAX_MIN_10BIT == 0)
- // Leaning on {min,max} sampling so no 16/32-bit permutation.
- f = clamp(f, fMin, fMax);
- #endif // (STP_MAX_MIN_10BIT == 0)
- #endif // STP_TAA_PRX_LANCZOS_DERING
- #endif // (STP_TAA_PRX_LANCZOS == 2)
- //==============================================================================================================================
- // DISPLACEMENT
- //==============================================================================================================================
- // Note the 'kJitCRcpC0' gets to position 0 to save some runtime maths.
- // 3 2
- // 0 1
- StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
- StpF2 oD1 = StpF2(kRcpC.x, 0.0) + oD0;
- StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
- StpF2 oD3 = StpF2(0.0, -kRcpC.y) + oD0;
- StpH3 d0 = StpTaaPriFedH(oD0).rgb;
- StpH3 d1 = StpTaaPriFedH(oD1).rgb;
- StpH3 d2 = StpTaaPriFedH(oD2).rgb;
- StpH3 d3 = StpTaaPriFedH(oD3).rgb;
- //------------------------------------------------------------------------------------------------------------------------------
- // INDEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Normalize interpolation weights.
- #if defined(STP_16BIT)
- StpH2 wG2 = wG.xy + wG.zw;
- wG = StpSatH4(wG * StpH4_(StpPrxLoRcpH1(wG2.x + wG2.y)));
- #else // defined(STP_16BIT)
- wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // Temporal weighting.
- StpH4 wT = abs(c4R - StpH4_(f.r)) * StpH4_(STP_LUMA_R) +
- abs(c4G - StpH4_(f.g)) * StpH4_(STP_LUMA_G) +
- abs(c4B - StpH4_(f.b)) * StpH4_(STP_LUMA_B);
- wT = StpPrxLoRcpH4(wT * StpH4_(STP_ANTI_MAX) + StpH4_(STP_ANTI_MIN)) * triMask;
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- StpH2 wT2 = wT.xy + wT.zw;
- wT = StpSatH4(wT * StpH4_(StpPrxLoRcpH1(wT2.x + wT2.y)));
- #else // defined(STP_16BIT)
- wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // Interpolate match.
- // Using a fixed 50/50 split of two normalized weights yields a normalized weight.
- StpH4 wM = wT * StpH4_(0.5) + wG * StpH4_(0.5);
- #if defined(STP_16BIT)
- StpH2 match2 = (c4A.xy * wM.xy) + (c4A.zw * wM.zw);
- StpH1 match = match2.x + match2.y;
- #else // defined(STP_16BIT)
- StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
- #endif // defined(STP_16BIT)
- // Non-motion-match kills convergence for this frame only.
- cnv *= match;
- //------------------------------------------------------------------------------------------------------------------------------
- // DEPENDENT
- //------------------------------------------------------------------------------------------------------------------------------
- // Interpolation, this first section doesn't have gather4, so probably no gain in swizzling.
- StpH3 dG = d0 * StpH3_(wG.x) + d1 * StpH3_(wG.y) + d2 * StpH3_(wG.z) + d3 * StpH3_(wG.w);
- StpH3 dT = d0 * StpH3_(wT.x) + d1 * StpH3_(wT.y) + d2 * StpH3_(wT.z) + d3 * StpH3_(wT.w);
- //------------------------------------------------------------------------------------------------------------------------------
- #if defined(STP_16BIT)
- StpH2 t2R = (c4R.xy * wT.xy) + (c4R.zw * wT.zw);
- StpH2 t2G = (c4G.xy * wT.xy) + (c4G.zw * wT.zw);
- StpH2 t2B = (c4B.xy * wT.xy) + (c4B.zw * wT.zw);
- StpH3 t = StpH3(t2R.x + t2R.y, t2G.x + t2G.y, t2B.x + t2B.y);
- StpH2 c2R = (c4R.xy * wG.xy) + (c4R.zw * wG.zw);
- StpH2 c2G = (c4G.xy * wG.xy) + (c4G.zw * wG.zw);
- StpH2 c2B = (c4B.xy * wG.xy) + (c4B.zw * wG.zw);
- StpH3 c = StpH3(c2R.x + c2R.y, c2G.x + c2G.y, c2B.x + c2B.y);
- #else // defined(STP_16BIT)
- StpMF3 t = StpMF3(
- c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
- c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
- c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
- StpMF3 c = StpMF3(
- c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
- c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
- c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
- #endif // defined(STP_16BIT)
- //------------------------------------------------------------------------------------------------------------------------------
- // Neighborhood.
- StpH1 bln = StpSatH1(cnv * StpPrxLoRcpH1(cnv + StpH1_(1.0 / STP_FRAME_MAX)));
- StpH1 blnT = StpH1_(1.0) - bln;
- StpH3 b = f * StpH3_(bln) + t * StpH3_(blnT);
- StpH3 minNe = min(c, b);
- StpH3 maxNe = max(c, b);
- //------------------------------------------------------------------------------------------------------------------------------
- // Apply pen.
- StpH3 penC = StpSatH3(c + (f - dG) * StpH3_(StpH1_(0.9875) * match));
- StpH2 penWF;
- penWF.x = pen * StpH1_(STP_TAA_PEN_W);
- penWF.y = pen * lerp(StpH1_(STP_TAA_PEN_F0), StpH1_(STP_TAA_PEN_F1), cnv);
- StpH2 penNotWF = StpH2_(1.0) - penWF;
- rF.rgb = t + (f - dT);
- rF.rgb = rF.rgb * StpH3_(blnT) + f * StpH3_(bln);
- rW.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.x) + penC * StpH3_(penWF.x));
- rF.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.y) + penC * StpH3_(penWF.y));
- rW.rgb = clamp(rW.rgb, minNe, maxNe);
- rF.rgb = clamp(rF.rgb, minNe, maxNe);
- //------------------------------------------------------------------------------------------------------------------------------
- // Get back into linear, and then HDR.
- rW.rgb *= rW.rgb;
- #if (STP_POSTMAP == 0)
- StpToneInvH3(rW.rgb);
- #endif // (STP_POSTMAP == 0)
- // Alpha is currently unused, this might improve compression (vs undefined).
- rF.a = rW.a = StpH1(0.0); }
- #endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- //
- // GOOD ENOUGH ANTI-ALIASING [GEAA]
- //
- //------------------------------------------------------------------------------------------------------------------------------
- // Yet another simplified spatial morphological AA.
- // Not perfect, but it has low complexity (one pass), and is good enough for a TAA override.
- // Fails on longer edges (due to low maximum search), doesn't get diagonals perfect.
- // But good on inputs that are already partially AA'ed.
- // The spatial AA is not used in STP, only a weighting value which is later used to guide a quick-and-dirty scaler.
- // With some modification this could be used for spatial AA, with or without scaling.
- //------------------------------------------------------------------------------------------------------------------------------
- // CALLBACKS
- // =========
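- // StpMF4 StpGeaa4F(StpF2 p) - Gather4 of luma (or green as luma).
- // StpMF4 StpGeaa4OF(StpF2 p, StpI2 o) - Same gather4 with an integer offset 'o' (only called when STP_OFFSETS is enabled).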
- // ---------
- // StpH4 StpGeaa4H(StpF2 p)
- //==============================================================================================================================
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // [GEAA] DEFAULTS
- //==============================================================================================================================
- // Choose how many positions to sample per side; define STP_GEAA_P before this point to override the default.
- // 0 ... 3 per side (faster, less quality)
- // 1 ... 5 per side
- // 2 ... 7 per side
- // 3 ... 9 per side (slower, higher quality), this is the default
- #ifndef STP_GEAA_P
- #define STP_GEAA_P 3
- #endif // STP_GEAA_P
- //------------------------------------------------------------------------------------------------------------------------------
- // Amount of sub-pixel blur; define STP_GEAA_SUBPIX before this point to override the default.
- // 0.50 ... Turn it off (the default, written as 8.0 / 16.0)
- // 0.25 ... Middle ground
- // 0.00 ... More blur
- #ifndef STP_GEAA_SUBPIX
- #define STP_GEAA_SUBPIX (8.0 / 16.0)
- #endif // STP_GEAA_SUBPIX
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // [GEAA] INTERNAL TUNING
- //==============================================================================================================================
- // Higher values can reduce the amount of AA; lower values can increase it but can look dirty.
- // Best left alone (note there is no #ifndef guard here): 1/3 is the 'correct' value for 2 of the 3 edge cases.
- #define STP_GEAA_THRESHOLD (1.0/3.0)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // [GEAA] 32-BIT ENTRY POINT
- //==============================================================================================================================
- // See the 16-bit version for all comments.
- #if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
- void StpGeaaF(
- out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI) {
- //------------------------------------------------------------------------------------------------------------------------------
- #if STP_OFFSETS
- StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
- StpMF4 gDEBA = StpGeaa4F(pDEBA);
- StpMF4 gEFCB = StpGeaa4OF(pDEBA, StpI2(1, 0));
- StpMF4 gGHED = StpGeaa4OF(pDEBA, StpI2(0, 1));
- StpMF4 gHIFE = StpGeaa4OF(pDEBA, StpI2(1, 1));
- #else // STP_OFFSETS
- StpMF4 gDEBA = StpGeaa4F(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y));
- StpMF4 gEFCB = StpGeaa4F(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y));
- StpMF4 gGHED = StpGeaa4F(p + StpF2(-kHalfRcpI.x, kHalfRcpI.y));
- StpMF4 gHIFE = StpGeaa4F(p + StpF2( kHalfRcpI.x, kHalfRcpI.y));
- #endif // STP_OFFSETS
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 gHV0,gHV1,gHV2;
- gHV0.x = gDEBA.z * StpMF1_(-2.0) + gEFCB.z;
- gHV0.y = gDEBA.x * StpMF1_(-2.0) + gGHED.x;
- gHV0 += StpMF2_(gDEBA.w);
- gHV1.x = gDEBA.x + gEFCB.y;
- gHV1.y = gDEBA.z + gGHED.y;
- gHV1 += StpMF2_(gDEBA.y) * StpMF2_(-2.0);
- gHV2.x = gGHED.x + gGHED.y * StpMF1_(-2.0);
- gHV2.y = gEFCB.z + gEFCB.y * StpMF1_(-2.0);
- gHV2 += StpMF2_(gHIFE.y);
- #if 0
- StpMF2 gHV = abs(gHV0) + abs(gHV1) * StpMF2_(2.0) + abs(gHV2);
- #else
- StpMF2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpMF2_(2.0) + gHV2 * gHV2;
- #endif
- StpP1 gVert = gHV.x > gHV.y;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 gBH = gVert ? StpMF2(gDEBA.x, gEFCB.y) : StpMF2(gDEBA.z, gGHED.y);
- StpMF2 gAC = gVert ? StpMF2(gDEBA.w, gGHED.x) : StpMF2(gDEBA.w, gEFCB.z);
- StpMF2 gDF = gVert ? StpMF2(gDEBA.z, gGHED.y) : StpMF2(gDEBA.x, gEFCB.y);
- StpMF2 gGI = gVert ? StpMF2(gEFCB.y, gHIFE.y) : StpMF2(gGHED.x, gHIFE.y);
- StpMF2 gBHMinusE = gBH - StpMF2_(gDEBA.y);
- StpMF2 gEnd2 = abs(gBHMinusE);
- StpP1 gUp = gEnd2.x >= gEnd2.y;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 gE = gDEBA.y;
- gBH = gUp ? gBH : gBH.yx;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 gBi = gUp ? StpMF2(2.0 / 3.0, 1.0 / 3.0) : StpMF2(1.0 / 3.0 , 2.0 / 3.0);
- StpMF1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
- StpMF2 gBi0 = (gUp ? gAC : gGI) * StpMF2_(1.0 / 3.0) + gDF * StpMF2_(2.0 / 3.0);
- StpMF2 gLo0 = gDF;
- StpMF1 gAbsBMinusE = abs(gBMinusE);
- StpMF1 gNe = gAbsBMinusE;
- StpMF1 gGood = StpGtZeroMF1(gBMinusE);
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
- StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
- if(gUp) gDecon = -gDecon;
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
- //------------------------------------------------------------------------------------------------------------------------------
- StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
- StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
- StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
- StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
- StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
- StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
- StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
- StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
- gGN3 = StpGeaa4F(gPN3);
- gGN2 = StpGeaa4F(gPN2);
- gGN1 = StpGeaa4F(gPN1);
- gGN0 = StpGeaa4F(gPN0);
- gGP0 = StpGeaa4F(gPP0);
- gGP1 = StpGeaa4F(gPP1);
- gGP2 = StpGeaa4F(gPP2);
- gGP3 = StpGeaa4F(gPP3);
- //------------------------------------------------------------------------------------------------------------------------------
- if(gVert) {
- gGN3 = gGN3.zyxw;
- gGN2 = gGN2.zyxw;
- gGN1 = gGN1.zyxw;
- gGN0 = gGN0.zyxw;
- gGP0 = gGP0.zyxw;
- gGP1 = gGP1.zyxw;
- gGP2 = gGP2.zyxw;
- gGP3 = gGP3.zyxw; }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 gLo8 = StpMF2(gGN3.x, gGP3.y);
- StpMF2 gLo7 = StpMF2(gGN3.y, gGP3.x);
- StpMF2 gLo6 = StpMF2(gGN2.x, gGP2.y);
- StpMF2 gLo5 = StpMF2(gGN2.y, gGP2.x);
- StpMF2 gLo4 = StpMF2(gGN1.x, gGP1.y);
- StpMF2 gLo3 = StpMF2(gGN1.y, gGP1.x);
- StpMF2 gLo2 = StpMF2(gGN0.x, gGP0.y);
- StpMF2 gLo1 = StpMF2(gGN0.y, gGP0.x);
- if(!gUp) {
- gLo8 = StpMF2(gGN3.w, gGP3.z);
- gLo7 = StpMF2(gGN3.z, gGP3.w);
- gLo6 = StpMF2(gGN2.w, gGP2.z);
- gLo5 = StpMF2(gGN2.z, gGP2.w);
- gLo4 = StpMF2(gGN1.w, gGP1.z);
- gLo3 = StpMF2(gGN1.z, gGP1.w);
- gLo2 = StpMF2(gGN0.w, gGP0.z);
- gLo1 = StpMF2(gGN0.z, gGP0.w); }
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 gGN3Bi = gGN3.yx * StpMF2_(gBi.x) + gGN3.zw * StpMF2_(gBi.y);
- StpMF2 gGN2Bi = gGN2.yx * StpMF2_(gBi.x) + gGN2.zw * StpMF2_(gBi.y);
- StpMF2 gGN1Bi = gGN1.yx * StpMF2_(gBi.x) + gGN1.zw * StpMF2_(gBi.y);
- StpMF2 gGN0Bi = gGN0.yx * StpMF2_(gBi.x) + gGN0.zw * StpMF2_(gBi.y);
- StpMF2 gGP0Bi = gGP0.yx * StpMF2_(gBi.x) + gGP0.zw * StpMF2_(gBi.y);
- StpMF2 gGP1Bi = gGP1.yx * StpMF2_(gBi.x) + gGP1.zw * StpMF2_(gBi.y);
- StpMF2 gGP2Bi = gGP2.yx * StpMF2_(gBi.x) + gGP2.zw * StpMF2_(gBi.y);
- StpMF2 gGP3Bi = gGP3.yx * StpMF2_(gBi.x) + gGP3.zw * StpMF2_(gBi.y);
- StpMF2 gBi8 = StpMF2(gGN3Bi.y, gGP3Bi.x);
- StpMF2 gBi7 = StpMF2(gGN3Bi.x, gGP3Bi.y);
- StpMF2 gBi6 = StpMF2(gGN2Bi.y, gGP2Bi.x);
- StpMF2 gBi5 = StpMF2(gGN2Bi.x, gGP2Bi.y);
- StpMF2 gBi4 = StpMF2(gGN1Bi.y, gGP1Bi.x);
- StpMF2 gBi3 = StpMF2(gGN1Bi.x, gGP1Bi.y);
- StpMF2 gBi2 = StpMF2(gGN0Bi.y, gGP0Bi.x);
- StpMF2 gBi1 = StpMF2(gGN0Bi.x, gGP0Bi.y);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF2 gEndBase;
- gEndBase.y = gBMinusE * StpMF1_(1.0/3.0) + gE;
- gEndBase.x = gAbsBMinusE * StpMF1_(STP_GEAA_THRESHOLD);
- #if 0
- gEndBase.x = StpRcpMF1(max(StpMF1_(1.0 / 16384.0), gEndBase.x));
- #else
- gEndBase.x = StpPrxLoRcpMF1(gEndBase.x);
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- #if (STP_GEAA_P > 2)
- StpMF2 gUseP8 = StpSatMF2(abs(gBi8 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- StpMF2 gUseP7 = StpSatMF2(abs(gBi7 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- #endif
- #if (STP_GEAA_P > 1)
- StpMF2 gUseP6 = StpSatMF2(abs(gBi6 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- StpMF2 gUseP5 = StpSatMF2(abs(gBi5 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- #endif
- #if (STP_GEAA_P > 0)
- StpMF2 gUseP4 = StpSatMF2(abs(gBi4 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- StpMF2 gUseP3 = StpSatMF2(abs(gBi3 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- #endif
- StpMF2 gUseP2 = StpSatMF2(abs(gBi2 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- StpMF2 gUseP1 = StpSatMF2(abs(gBi1 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- StpMF2 gUseP0 = StpSatMF2(abs(gBi0 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
- //------------------------------------------------------------------------------------------------------------------------------
- #if (STP_GEAA_P == 3)
- StpMF2 gDst2 = StpMF2_(9.5);
- #endif
- #if (STP_GEAA_P == 2)
- StpMF2 gDst2 = StpMF2_(7.5);
- #endif
- #if (STP_GEAA_P == 1)
- StpMF2 gDst2 = StpMF2_(5.5);
- #endif
- #if (STP_GEAA_P == 0)
- StpMF2 gDst2 = StpMF2_(3.5);
- #endif
- #if (STP_GEAA_P > 2)
- gDst2 = gDst2 + (StpMF2_(8.5) - gDst2) * gUseP8;
- gDst2 = gDst2 + (StpMF2_(7.5) - gDst2) * gUseP7;
- #endif
- #if (STP_GEAA_P > 1)
- gDst2 = gDst2 + (StpMF2_(6.5) - gDst2) * gUseP6;
- gDst2 = gDst2 + (StpMF2_(5.5) - gDst2) * gUseP5;
- #endif
- #if (STP_GEAA_P > 0)
- gDst2 = gDst2 + (StpMF2_(4.5) - gDst2) * gUseP4;
- gDst2 = gDst2 + (StpMF2_(3.5) - gDst2) * gUseP3;
- #endif
- gDst2 = gDst2 + (StpMF2_(2.5) - gDst2) * gUseP2;
- gDst2 = gDst2 + (StpMF2_(1.5) - gDst2) * gUseP1;
- gDst2 = gDst2 + (StpMF2_(0.5) - gDst2) * gUseP0;
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 gLoSub = (gDst2.x + gDst2.y) * StpMF1_(0.5) - StpMF1_(STP_GEAA_SUBPIX);
- StpMF2 gLoW01 = StpMF2_(1.0) - StpSatMF2(StpMF2(1.0, 2.0) - StpMF2_(gLoSub));
- StpMF2 gLoW23 = StpMF2_(1.0) - StpSatMF2(StpMF2(3.0, 4.0) - StpMF2_(gLoSub));
- StpMF2 gLoW45 = StpMF2_(1.0) - StpSatMF2(StpMF2(5.0, 6.0) - StpMF2_(gLoSub));
- StpMF2 gLoW67 = StpMF2_(1.0) - StpSatMF2(StpMF2(7.0, 8.0) - StpMF2_(gLoSub));
- StpMF2 gLoW89 = StpMF2_(1.0) - StpSatMF2(StpMF2(9.0,10.0) - StpMF2_(gLoSub));
- StpMF2 gLoAcc2 =
- gLo0 * StpMF2_(gLoW01.x) +
- gLo1 * StpMF2_(gLoW01.y) +
- gLo2 * StpMF2_(gLoW23.x) +
- gLo3 * StpMF2_(gLoW23.y) +
- gLo4 * StpMF2_(gLoW45.x) +
- gLo5 * StpMF2_(gLoW45.y) +
- gLo6 * StpMF2_(gLoW67.x) +
- gLo7 * StpMF2_(gLoW67.y) +
- gLo8 * StpMF2_(gLoW89.x);
- StpMF1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
- StpMF2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
- gLoW2 *= StpMF2_(2.0);
- gLoAcc *= StpRcpMF1(StpMF1_(1.0) + gLoW89.x * StpMF1_(2.0) + gLoW2.x + gLoW2.y);
- StpMF1 gOff = StpSatMF1((gLoAcc - gE) * StpRcpMF1(gBH.x - gE));
- gOff = min(gOff, StpMF1_(0.5));
- //------------------------------------------------------------------------------------------------------------------------------
- gDilate = p + gDecon;
- gFilter = p + gDecon * StpF2_(gOff);
- gLuma = lerp(gE, gBH.x, gOff);
- //------------------------------------------------------------------------------------------------------------------------------
- StpMF1 gAnti = lerp(gE, gBH.x, gOff);
- StpMF1 gT = StpSatMF1((StpMF1_(-2.0) * gAnti + gBH.x + gE) * StpRcpMF1(gE - gBH.y));
- StpMF1 gFix = gE * (gT - StpMF1_(1.0)) - gBH.y * gT;
- gFix = StpSatMF1((gFix + gAnti) * StpRcpMF1(gFix + gBH.x));
- //------------------------------------------------------------------------------------------------------------------------------
- gW = gFix;
- gW = StpRcpMF1(gW + StpMF1_(0.5)) - StpMF1_(1.0);
- gW *= gW;
- gW = max(gW, StpMF1_(1.0/255.0)); }
- #endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- //_____________________________________________________________.._______________________________________________________________
- //==============================================================================================================================
- // [GEAA] PACKED 16-BIT ENTRY POINT
- //==============================================================================================================================
- #if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
- void StpGeaaH( // GEAA packed 16-bit entry: picks a search direction from a 3x3 luma neighborhood, walks it, and writes the four outputs.
- out StpH1 gW, // Output weight for pixel art scalar.
- out StpH1 gLuma, // Filtered luma for debug.
- out StpF2 gFilter, // Location to sample for standalone unscaled spatial AA.
- out StpF2 gDilate, // Location of highest contrast neighbor.
- StpF2 p, // {0 to 1} position across screen.
- StpF2 kRcpI, // 1.0 / input image size in pixels.
- StpF2 kHalfRcpI) { // 0.5 / input image size in pixels.
- //------------------------------------------------------------------------------------------------------------------------------
- // Sample 3x3 input pattern in luma (or green).
- // A B C
- // D E F
- // G H I
- // Fetched via four gather4s, laid out for use by the next section to try to improve operand caching.
- #if STP_OFFSETS
- StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y); // Shared base position, the other three gathers pass integer offsets from it.
- StpH4 gDEBA = StpGeaa4H(pDEBA);
- StpH4 gEFCB = StpGeaa4OH(pDEBA, StpI2(1, 0));
- StpH4 gGHED = StpGeaa4OH(pDEBA, StpI2(0, 1));
- StpH4 gHIFE = StpGeaa4OH(pDEBA, StpI2(1, 1));
- #else // STP_OFFSETS
- StpH4 gDEBA = StpGeaa4H(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y)); // .xyzw=DEBA
- StpH4 gEFCB = StpGeaa4H(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y)); // .yz =FC
- StpH4 gGHED = StpGeaa4H(p + StpF2(-kHalfRcpI.x, kHalfRcpI.y)); // .xy =GH
- StpH4 gHIFE = StpGeaa4H(p + StpF2( kHalfRcpI.x, kHalfRcpI.y)); // .y =I
- #endif // STP_OFFSETS
- //------------------------------------------------------------------------------------------------------------------------------
- // Compute {horz,vert} change terms (second differences along each row and column). Deciding on either horizontal or vertical direction is complex.
- // Trouble case for some algorithms,
- // 0 1 0
- // 0 1 0
- // 0 1 0
- // This should present as a vertical search direction.
- // Simple stuff like sum of each 2x2 produces,
- // 2 2
- // 2 2
- // Which has no direction.
- // {ABC,ADG} ... x=A-2B+C, y=A-2D+G
- StpH2 gHV0,gHV1,gHV2;
- gHV0.x = gDEBA.z * StpH1_(-2.0) + gEFCB.z;
- gHV0.y = gDEBA.x * StpH1_(-2.0) + gGHED.x;
- gHV0 += StpH2_(gDEBA.w);
- // {DEF,BEH} ... x=D-2E+F, y=B-2E+H
- gHV1.x = gDEBA.x + gEFCB.y;
- gHV1.y = gDEBA.z + gGHED.y;
- gHV1 += StpH2_(gDEBA.y) * StpH2_(-2.0);
- // {GHI,CFI} ... x=G-2H+I, y=C-2F+I
- gHV2.x = gGHED.x + gGHED.y * StpH1_(-2.0);
- gHV2.y = gEFCB.z + gEFCB.y * StpH1_(-2.0);
- gHV2 += StpH2_(gHIFE.y);
- // Combine terms.
- #if 0
- // What FXAA does, better for a diagonal computation (which is not needed), left for reference.
- StpH2 gHV = abs(gHV0) + abs(gHV1) * StpH2_(2.0) + abs(gHV2);
- #else
- // Slightly faster for packed 16-bit (which has no free ABS on AMD).
- StpH2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpH2_(2.0) + gHV2 * gHV2;
- #endif
- // Choose search direction, the 'gVert' is true:=vert, false:=horz.
- // Go vertical search if horizontal has higher contrast (search perpendicular).
- StpP1 gVert = gHV.x > gHV.y;
- //------------------------------------------------------------------------------------------------------------------------------
- // This is BH if search horizontal, else DF (as BH) if search vertical.
- StpH2 gBH = gVert ? StpH2(gDEBA.x, gEFCB.y) : StpH2(gDEBA.z, gGHED.y);
- // Will need these later, will let the compiler move around the transpose.
- StpH2 gAC = gVert ? StpH2(gDEBA.w, gGHED.x) : StpH2(gDEBA.w, gEFCB.z);
- StpH2 gDF = gVert ? StpH2(gDEBA.z, gGHED.y) : StpH2(gDEBA.x, gEFCB.y);
- StpH2 gGI = gVert ? StpH2(gEFCB.y, gHIFE.y) : StpH2(gGHED.x, gHIFE.y);
- // Start to compute threshold for end of span, compute a gradient pair.
- StpH2 gBHMinusE = gBH - StpH2_(gDEBA.y);
- StpH2 gEnd2 = abs(gBHMinusE);
- // If gradient is larger upward (or leftward if vert).
- StpP1 gUp = gEnd2.x >= gEnd2.y;
- //------------------------------------------------------------------------------------------------------------------------------
- // Rename.
- StpH1 gE = gDEBA.y;
- // Swap if not up. From this point on, the B is the high-contrast neighbor, and the H is the other one in same dir.
- gBH = gUp ? gBH : gBH.yx;
- //------------------------------------------------------------------------------------------------------------------------------
- // Choose the bilinear scalar (gets to 1/3 between texels during the search).
- // .x ... For texel closer to pixel axis when up (reversed when down).
- // .y ... For more distant texel.
- // LOGIC
- // =====
- // This keeps threshold of 2 of the 3 end conditions the same (so 1/3 shift is better than 1/4).
- // =====
- // e e e <- e = end cases
- // 0 0 1 1 <- 1/3 of high contrast neighbor
- // 0 1 0 1 <- 2/3 of self
- // ------------------
- // 0 2/3 1/3 1 <- blended value (2/3 is the target)
- // 2/3 0 1/3 1/3 <- abs(difference to target)
- StpH2 gBi = gUp ? StpH2(2.0 / 3.0, 1.0 / 3.0) : StpH2(1.0 / 3.0 , 2.0 / 3.0);
- // Choose either {B-E, or H-E}.
- StpH1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
- // Finish Bi0, this is the first 2 texture fetches (done using math instead) at P0 (1 texel away from center).
- StpH2 gBi0 = (gUp ? gAC : gGI) * StpH2_(1.0 / 3.0) + gDF * StpH2_(2.0 / 3.0);
- // Finish Lo0, for the directional blur.
- StpH2 gLo0 = gDF;
- // Store out spatial neighborhood.
- StpH1 gAbsBMinusE = abs(gBMinusE);
- // This is just the highest contrast neighbor along the chosen direction, may report less contrast than actual.
- StpH1 gNe = gAbsBMinusE; // NOTE(review): 'gNe' is not read again in this function.
- // Good direction to compare against at the end.
- // Good means 'don't flip' to the other side.
- // Have 'B-E' want 'signed(E-(B/2+E/2))' = 'signed(E/2-B/2)' = 'signed(E-B)' = 'gtzero(B-E)'
- StpH1 gGood = StpGtZeroH1(gBMinusE); // NOTE(review): 'gGood' is not read again in this function.
- //------------------------------------------------------------------------------------------------------------------------------
- // One pixel walk distance for search.
- StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
- // This is the direction of decontrast (towards the highest contrast neighbor).
- StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
- // If up (or left) work negative.
- if(gUp) gDecon = -gDecon;
- //------------------------------------------------------------------------------------------------------------------------------
- // Have enough now to build out sampling positions.
- // This works in gather4 to get two samples per gather, then uses math to finish the bilinear fetch.
- // In case the logic ever goes back to a non-gather4 version, this keeps with the 1/3 offset.
- // Build base, 1/3 to neighbor pixel.
- // It must be 1/3 to neighbor pixel to be able to find the end of thin stuff like this.
- // . . . . . . . . . . .
- // . . . . . . x x x x x
- // . x x x x x . . . . .
- // | |
- // |------>|
- // | . x
- // If it was 1/2 to neighbor, then x and . would look the same.
- StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
- // The gather4 positions are (assuming horizontal then up).
- // 3 3 2 2 1 1 0 0 A B C 0 0 1 1 2 2 3 3
- // 3 3 2 2 1 1 0 0 D E F 0 0 1 1 2 2 3 3
- // G H I
- //------------------------------------------------------------------------------------------------------------------------------
- // Sampling positions.
- // Currently walking without gaps, but could skip along too!
- StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
- StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
- StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
- StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
- StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
- StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
- StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
- StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
- //------------------------------------------------------------------------------------------------------------------------------
- // This attempts to do sampling in a cache friendly way.
- // Cannot sample with offsets, because it could be vertical or horizontal and offsets need to be static in DX.
- // Sampling pairs {negative, positive} directions.
- StpH4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
- gGN3 = StpGeaa4H(gPN3);
- gGN2 = StpGeaa4H(gPN2);
- gGN1 = StpGeaa4H(gPN1);
- gGN0 = StpGeaa4H(gPN0);
- gGP0 = StpGeaa4H(gPP0);
- gGP1 = StpGeaa4H(gPP1);
- gGP2 = StpGeaa4H(gPP2);
- gGP3 = StpGeaa4H(gPP3);
- //------------------------------------------------------------------------------------------------------------------------------
- // Finish the bilinear fetch.
- // For 'vertical' this needs to do a transpose.
- // The FMAs are duplicated, else the compiler would need to do that anyway.
- // 1st 2nd for N side (P side is reversed)
- // ----------- | |
- // W Z w z !vert & up ... Y X, Z W
- // X Y [p] x y
- // -----------
- // W Z [p] w z !vert & !up ... Z W, Y X
- // X Y x y
- // -----------
- // W Z vert & up ... Y Z, X W
- // X Y
- // [p]
- // w z
- // x y
- // -----------
- // W Z vert & !up ... X W, Y Z
- // X Y | | | |
- // [p] | | 0.33 term
- // w z | |
- // x y 0.66 term
- // -----------
- if(gVert) {
- gGN3 = gGN3.zyxw;
- gGN2 = gGN2.zyxw;
- gGN1 = gGN1.zyxw;
- gGN0 = gGN0.zyxw;
- gGP0 = gGP0.zyxw;
- gGP1 = gGP1.zyxw;
- gGP2 = gGP2.zyxw;
- gGP3 = gGP3.zyxw; }
- //------------------------------------------------------------------------------------------------------------------------------
- // Grab the texels for the variable length inline low-pass box blur.
- StpH2 gLo8 = StpH2(gGN3.x, gGP3.y);
- StpH2 gLo7 = StpH2(gGN3.y, gGP3.x);
- StpH2 gLo6 = StpH2(gGN2.x, gGP2.y);
- StpH2 gLo5 = StpH2(gGN2.y, gGP2.x);
- StpH2 gLo4 = StpH2(gGN1.x, gGP1.y);
- StpH2 gLo3 = StpH2(gGN1.y, gGP1.x);
- StpH2 gLo2 = StpH2(gGN0.x, gGP0.y);
- StpH2 gLo1 = StpH2(gGN0.y, gGP0.x);
- if(!gUp) {
- gLo8 = StpH2(gGN3.w, gGP3.z);
- gLo7 = StpH2(gGN3.z, gGP3.w);
- gLo6 = StpH2(gGN2.w, gGP2.z);
- gLo5 = StpH2(gGN2.z, gGP2.w);
- gLo4 = StpH2(gGN1.w, gGP1.z);
- gLo3 = StpH2(gGN1.z, gGP1.w);
- gLo2 = StpH2(gGN0.w, gGP0.z);
- gLo1 = StpH2(gGN0.z, gGP0.w); }
- //------------------------------------------------------------------------------------------------------------------------------
- // Simulate the bilinear fetch.
- StpH2 gGN3Bi = gGN3.yx * StpH2_(gBi.x) + gGN3.zw * StpH2_(gBi.y);
- StpH2 gGN2Bi = gGN2.yx * StpH2_(gBi.x) + gGN2.zw * StpH2_(gBi.y);
- StpH2 gGN1Bi = gGN1.yx * StpH2_(gBi.x) + gGN1.zw * StpH2_(gBi.y);
- StpH2 gGN0Bi = gGN0.yx * StpH2_(gBi.x) + gGN0.zw * StpH2_(gBi.y);
- StpH2 gGP0Bi = gGP0.yx * StpH2_(gBi.x) + gGP0.zw * StpH2_(gBi.y);
- StpH2 gGP1Bi = gGP1.yx * StpH2_(gBi.x) + gGP1.zw * StpH2_(gBi.y);
- StpH2 gGP2Bi = gGP2.yx * StpH2_(gBi.x) + gGP2.zw * StpH2_(gBi.y);
- StpH2 gGP3Bi = gGP3.yx * StpH2_(gBi.x) + gGP3.zw * StpH2_(gBi.y);
- // Note positive side the {x,y} order is reversed.
- StpH2 gBi8 = StpH2(gGN3Bi.y, gGP3Bi.x);
- StpH2 gBi7 = StpH2(gGN3Bi.x, gGP3Bi.y);
- StpH2 gBi6 = StpH2(gGN2Bi.y, gGP2Bi.x);
- StpH2 gBi5 = StpH2(gGN2Bi.x, gGP2Bi.y);
- StpH2 gBi4 = StpH2(gGN1Bi.y, gGP1Bi.x);
- StpH2 gBi3 = StpH2(gGN1Bi.x, gGP1Bi.y);
- StpH2 gBi2 = StpH2(gGN0Bi.y, gGP0Bi.x);
- StpH2 gBi1 = StpH2(gGN0Bi.x, gGP0Bi.y);
- //------------------------------------------------------------------------------------------------------------------------------
- // Threshold for end of span (X), and base to compare against (Y).
- StpH2 gEndBase;
- // For a (1.0/3.0) pixel shift.
- // The 'gBMinusE = other - self', and want 'self * (2.0/3.0) + other * (1.0/3.0)'.
- gEndBase.y = gBMinusE * StpH1_(1.0/3.0) + gE;
- gEndBase.x = gAbsBMinusE * StpH1_(STP_GEAA_THRESHOLD);
- // Safer version here for reference.
- #if 0
- gEndBase.x = StpRcpH1(max(StpH1_(1.0 / 16384.0), gEndBase.x));
- #else
- gEndBase.x = StpPrxLoRcpH1(gEndBase.x);
- #endif
- //------------------------------------------------------------------------------------------------------------------------------
- // Compute opacity term, {0 := not done, 1 := end of span}.
- #if (STP_GEAA_P > 2)
- StpH2 gUseP8 = StpSatH2(abs(gBi8 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- StpH2 gUseP7 = StpSatH2(abs(gBi7 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- #endif
- #if (STP_GEAA_P > 1)
- StpH2 gUseP6 = StpSatH2(abs(gBi6 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- StpH2 gUseP5 = StpSatH2(abs(gBi5 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- #endif
- #if (STP_GEAA_P > 0)
- StpH2 gUseP4 = StpSatH2(abs(gBi4 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- StpH2 gUseP3 = StpSatH2(abs(gBi3 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- #endif
- StpH2 gUseP2 = StpSatH2(abs(gBi2 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- StpH2 gUseP1 = StpSatH2(abs(gBi1 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- StpH2 gUseP0 = StpSatH2(abs(gBi0 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
- //------------------------------------------------------------------------------------------------------------------------------
- // Work this like painters alpha blending.
- // This analog path is faster and cleaner than binary logic.
- // Distance traveled for {negative, positive} paths.
- // LOGIC
- // =====
- // Note distance factors already have the 0.5 factored in.
- // N := negative search end (1 pixel away, but edge is 0.5 pixel away)
- // P := positive search end (4 pixel away, but edge is 3.5 pixel away)
- // X := the pixel to filter
- // :<->:<------------->:
- // : : :
- // : : +---+---+---+---+
- // : : | : | | | |
- // N +---+---+---+---+-P-+---+---+---+
- // | X | | | | | | | |
- // +---+---+---+---+---+---+---+---+---+---+---+---+
- // | | | | | | | | | | | | |
- // +---+---+---+---+---+---+---+---+---+---+---+---+
- #if (STP_GEAA_P == 3)
- StpH2 gDst2 = StpH2_(9.5);
- #endif
- #if (STP_GEAA_P == 2)
- StpH2 gDst2 = StpH2_(7.5);
- #endif
- #if (STP_GEAA_P == 1)
- StpH2 gDst2 = StpH2_(5.5);
- #endif
- #if (STP_GEAA_P == 0)
- StpH2 gDst2 = StpH2_(3.5);
- #endif
- #if (STP_GEAA_P > 2)
- gDst2 = gDst2 + (StpH2_(8.5) - gDst2) * gUseP8;
- gDst2 = gDst2 + (StpH2_(7.5) - gDst2) * gUseP7;
- #endif
- #if (STP_GEAA_P > 1)
- gDst2 = gDst2 + (StpH2_(6.5) - gDst2) * gUseP6;
- gDst2 = gDst2 + (StpH2_(5.5) - gDst2) * gUseP5;
- #endif
- #if (STP_GEAA_P > 0)
- gDst2 = gDst2 + (StpH2_(4.5) - gDst2) * gUseP4;
- gDst2 = gDst2 + (StpH2_(3.5) - gDst2) * gUseP3;
- #endif
- gDst2 = gDst2 + (StpH2_(2.5) - gDst2) * gUseP2;
- gDst2 = gDst2 + (StpH2_(1.5) - gDst2) * gUseP1;
- gDst2 = gDst2 + (StpH2_(0.5) - gDst2) * gUseP0;
- //------------------------------------------------------------------------------------------------------------------------------
- // Run the variable length low-pass box blur.
- // Need half distance with half pixel removed.
- StpH1 gLoSub = (gDst2.x + gDst2.y) * StpH1_(0.5) - StpH1_(STP_GEAA_SUBPIX);
- // Compute the weights (whether each sample should be included or not).
- StpH2 gLoW01 = StpH2_(1.0) - StpSatH2(StpH2(1.0, 2.0) - StpH2_(gLoSub));
- StpH2 gLoW23 = StpH2_(1.0) - StpSatH2(StpH2(3.0, 4.0) - StpH2_(gLoSub));
- StpH2 gLoW45 = StpH2_(1.0) - StpSatH2(StpH2(5.0, 6.0) - StpH2_(gLoSub));
- StpH2 gLoW67 = StpH2_(1.0) - StpSatH2(StpH2(7.0, 8.0) - StpH2_(gLoSub));
- StpH2 gLoW89 = StpH2_(1.0) - StpSatH2(StpH2(9.0,10.0) - StpH2_(gLoSub)); // Only the .x term is read below.
- // Weighted accumulation of samples.
- StpH2 gLoAcc2 =
- gLo0 * StpH2_(gLoW01.x) +
- gLo1 * StpH2_(gLoW01.y) +
- gLo2 * StpH2_(gLoW23.x) +
- gLo3 * StpH2_(gLoW23.y) +
- gLo4 * StpH2_(gLoW45.x) +
- gLo5 * StpH2_(gLoW45.y) +
- gLo6 * StpH2_(gLoW67.x) +
- gLo7 * StpH2_(gLoW67.y) +
- gLo8 * StpH2_(gLoW89.x);
- StpH1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
- // Weight sum.
- StpH2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
- gLoW2 *= StpH2_(2.0);
- gLoAcc *= StpRcpH1(StpH1_(1.0) + gLoW89.x * StpH1_(2.0) + gLoW2.x + gLoW2.y);
- // Convert to blend between self and high-contrast neighbor.
- // This currently allows full {0.0 to 1.0} blend.
- StpH1 gOff = StpSatH1((gLoAcc - gE) * StpRcpH1(gBH.x - gE));
- // It is important to not exceed 0.5 weight for PIXart scaling.
- gOff = min(gOff, StpH1_(0.5));
- //------------------------------------------------------------------------------------------------------------------------------
- // Save out dilation pixel for {z,motion}.
- gDilate = p + gDecon;
- // Save out filter position.
- gFilter = p + gDecon * StpF2_(gOff);
- gLuma = lerp(gE, gBH.x, gOff); // Same expression as 'gAnti' below.
- //------------------------------------------------------------------------------------------------------------------------------
- // GEAA up to this point creates weights that only help a scalar for aliased edges.
- // This attempts to increase weight to also restore some anti-aliased edges.
- // It does this by increasing weight as much as can be borrowed from the 'E to H' side.
- // An equation for movement towards H,
- // E+(H-E)*T ... Where T must be {0 to 1} ranged, but want {0 to 0.5} ranged (same as 'gOff').
- // Equation for E motion with respect to the B side,
- // A=E+(B-E)*F ... Where A is the anti-aliased output, and F would typically be 'gOff'.
- // Solving that for E,
- // E=(A-F*B)/(1-F)
- // Combining equations,
- // E+(H-E)*T = (A-F*B)/(1-F)
- // Then solving for T when 'F=0.5' (maximum 'gOff' weight),
- // T=(-2*A+B+E)/(E-H)
- // Then limit T inside {0 to 0.5}.
- // And use limited 'T' to recompute a new 'F' which becomes the 'gOff' fixed weight.
- StpH1 gAnti = lerp(gE, gBH.x, gOff);
- // Solve for the movement towards 'H'.
- // This in theory should be limited to {0 to 0.5}, but {0 to 1} seems to work too.
- StpH1 gT = StpSatH1((StpH1_(-2.0) * gAnti + gBH.x + gE) * StpRcpH1(gE - gBH.y));
- StpH1 gFix = gE * (gT - StpH1_(1.0)) - gBH.y * gT; // Equals '-(E+(H-E)*T)', the negated moved E.
- gFix = StpSatH1((gFix + gAnti) * StpRcpH1(gFix + gBH.x)); // New 'F=(A-E2)/(B-E2)' where 'E2=E+(H-E)*T'.
- //------------------------------------------------------------------------------------------------------------------------------
- // Output weight for pixel art scalar.
- // The 'gOff'set goes between {0 := no change, to 0.5 := half to neighbor}.
- // The half to neighbor position would be where the edge crosses between two pixels.
- // The sample size needs to be {0 := at the crossing, to 1 := no change}.
- // Can solve this, the 1D kernel will look like,
- // u = (1-x)*s ... weighting terms
- // v = x *t
- // w = 1/(u+v)
- // o = a*u*w + b*v*w
- // The split is where weights are the same,
- // u*w == v*w ... ((1-x)*s)/(((1-x)*s)+(x*t)) == (x*t)/(((1-x)*s)+(x*t))
- // Can assume s=1.0 (the other sample), thus this reduces to,
- // u*w == v*w ... (1-x)/((1-x)+(x*t)) == (x*t)/((1-x)+(x*t))
- // Then solve for 't' given crossing point 'x'.
- // t=1/x-1
- // Convert to 'x=gOffset+1/2'.
- // Solve for 't=1/x-1', or 't=1/(gOffset+1/2)-1'.
- gW = gFix;
- gW = StpRcpH1(gW + StpH1_(0.5)) - StpH1_(1.0);
- // Send squared (as needed by scalar).
- gW *= gW;
- // Make sure not zero.
- gW = max(gW, StpH1_(1.0/255.0)); }
- #endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
- #endif // STP_UNITY_INCLUDE_GUARD
|