No Description
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Stp.hlsl 267KB


  1. // This is necessary to prevent Unity from deciding that our default config logic is actually an include guard declaration
  2. #ifndef STP_UNITY_INCLUDE_GUARD
  3. #define STP_UNITY_INCLUDE_GUARD
  4. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  5. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  6. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  7. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  8. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  9. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  10. //_____________________________________________________________.._______________________________________________________________
  11. //==============================================================================================================================
  12. //
  13. //
  14. // SPATIAL TEMPORAL POST [STP] v1.0
  15. //
  16. //
  17. //==============================================================================================================================
  18. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  19. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  20. //_____________________________________________________________.._______________________________________________________________
  21. //==============================================================================================================================
  22. // C/C++/GLSL/HLSL PORTABILITY BASED ON AMD's 'ffx_a.h'.
  23. // INCLUDING ASSOCIATED LICENSE BELOW
  24. //------------------------------------------------------------------------------------------------------------------------------
  25. // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
  26. // Permission is hereby granted, free of charge, to any person obtaining a copy
  27. // of this software and associated documentation files(the "Software"), to deal
  28. // in the Software without restriction, including without limitation the rights
  29. // to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  30. // copies of the Software, and to permit persons to whom the Software is
  31. // furnished to do so, subject to the following conditions :
  32. // The above copyright notice and this permission notice shall be included in
  33. // all copies or substantial portions of the Software.
  34. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  35. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  36. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  37. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  38. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  39. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  40. // THE SOFTWARE.
  41. //==============================================================================================================================
  42. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  43. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  44. //_____________________________________________________________.._______________________________________________________________
  45. //==============================================================================================================================
  46. // NOTES
  47. //------------------------------------------------------------------------------------------------------------------------------
  48. // PLATFORM SPECIFIC WORKAROUNDS
  49. // =============================
  50. // - These all default to not enabled {0}, define to {1} to enable.
  51. // - define STP_BUG_ALIAS16 1 .... Define to enable workaround for asuint16()/asfloat16().
  52. // - define STP_BUG_PRX 1 ........ Define to disable approximate transcendentals.
  53. // - define STP_BUG_SAT_INF 1 .... Define to workaround platforms with broken 16-bit saturate +/- INF.
  54. // - define STP_BUG_SAT 1 ........ Define to workaround compiler incorrectly factoring out inner saturate in 16-bit code.
  55. //------------------------------------------------------------------------------------------------------------------------------
  56. // CONFIGURATIONS
  57. // ==============
  58. // - INDEPENDENT OPTIONS
  59. // - define STP_32BIT {0 := disable, 1 := compile the 32-bit version or implicit precision version}
  60. // - define STP_MEDIUM {0 := disable, 1 := enable the implicit medium precision version for 32-bit}
  61. // - define STP_16BIT {0 := disable, 1 := compile the explicit 16-bit version}
  62. // -----
  63. // - define STP_GPU {to include shader code}
  64. // - define STP_GLSL {to include the GLSL version of the code}
  65. // - define STP_HLSL {to include the HLSL version of the code}
  66. // -----
  67. // - define STP_DIL {to include the StpDil<H,F>() entry points}
  68. // - define STP_PAT {to include the StpPat<H,F>() entry points}
  69. // - define STP_SAA {to include the StpSaa<H,F>() entry points}
  70. // - define STP_TAA {to include the StpTaa<H,F>() entry points}
  71. // -----
  72. // - define STP_POSTMAP {running STP, 0 := before, 1 := after, application tonemapping}
  73. //------------------------------------------------------------------------------------------------------------------------------
  74. // IMPORTANT
  75. // =========
  76. // - All callbacks should explicitly sample from MIP level 0.
  77. // - Meaning if used in a pixel shader do not allow implicit LOD calculation.
  78. // - The algorithm is tuned for pre-tonemap operation, post-tonemap wasn't tested yet.
  79. //==============================================================================================================================
  80. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  81. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  82. //_____________________________________________________________.._______________________________________________________________
  83. //==============================================================================================================================
  84. // EXTERNAL OPTIONS
  85. //==============================================================================================================================
  // Global debug switch: {1} turns on debug functionality, default {0} leaves it all off.
  #ifndef STP_BUG
      #define STP_BUG 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Set to {1} to build a pass-through dummy shader that still fetches every resource but runs no logic.
  #ifndef STP_BUG_BW_SOL
      #define STP_BUG_BW_SOL 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Set to {1} to use the max/min sampling permutation for color values.
  #ifndef STP_MAX_MIN_10BIT
      #define STP_MAX_MIN_10BIT 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Set to {1} to use the max/min sampling permutation for UINT32 values.
  #ifndef STP_MAX_MIN_UINT
      #define STP_MAX_MIN_UINT 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Set to {1} to use sampling with offsets.
  #ifndef STP_OFFSETS
      #define STP_OFFSETS 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Tonemap ordering: 0 := STP runs pre-tonemap, 1 := STP runs post-tonemap.
  // STP is currently only tested to run pre-tonemap, as that is what Unity is using.
  #ifndef STP_POSTMAP
      #define STP_POSTMAP 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // STP TAA quality level {0 to 1}.
  #ifndef STP_TAA_Q
      #define STP_TAA_Q 1
  #endif
  121. //==============================================================================================================================
  122. // PLATFORM SPECIFIC BUG WORKAROUNDS
  123. // =================================
  // Set to {1} to disable the transcendental approximations that rely on float/int aliasing.
  #ifndef STP_BUG_PRX
      #define STP_BUG_PRX 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Set to {1} as a workaround when the platform cannot saturate +/- INF correctly.
  #ifndef STP_BUG_SAT_INF
      #define STP_BUG_SAT_INF 0
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Set to {1} as a workaround for a compiler incorrectly factoring out the inner saturate in 16-bit code.
  #ifndef STP_BUG_SAT
      #define STP_BUG_SAT 0
  #endif
  138. //------------------------------------------------------------------------------------------------------------------------------
  // Define to {1} for workarounds for broken asuint16()/asfloat16(). Defaults to {0} (workaround off).
  #ifndef STP_BUG_ALIAS16
      #define STP_BUG_ALIAS16 0
  #endif
  // When the aliasing workaround is ON, also force STP_BUG_PRX to {1} so the transcendental approximations that
  // depend on float/int aliasing are disabled as well.
  // This override previously lived inside the '#ifndef' above, so it triggered on the default (workaround OFF) path
  // and was skipped when the workaround was explicitly requested; it is now keyed on the option's value instead.
  // NOTE(review): this changes the default STP_BUG_PRX from 1 to 0 -- confirm against the including shader code.
  #if STP_BUG_ALIAS16
      #undef STP_BUG_PRX
      #define STP_BUG_PRX 1
  #endif
  145. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  146. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  147. //_____________________________________________________________.._______________________________________________________________
  148. //==============================================================================================================================
  149. // C/C++/GLSL/HLSL PORTABILITY
  150. //==============================================================================================================================
  151. #if defined(STP_CPU)
  // No-alias pointer qualifier; define STP_RESTRICT before including this file to override it.
  #ifndef STP_RESTRICT
      #define STP_RESTRICT __restrict
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Specifier placed in front of every CPU helper function; may also be overridden by the includer.
  #ifndef STP_STATIC
      #define STP_STATIC static
  #endif
  //------------------------------------------------------------------------------------------------------------------------------
  // Scalar type aliases used by the CPU path.
  typedef unsigned char  StpB1;
  typedef unsigned short StpW1;
  typedef float          StpF1;
  typedef uint32_t       StpU1;
  // Value casts to the float / unsigned scalar types.
  #define StpF1_(a) ((StpF1)(a))
  #define StpU1_(a) ((StpU1)(a))
  // Return the raw bit pattern of a float (reinterpreted through a union, no numeric conversion).
  STP_STATIC StpU1 StpU1_F1(StpF1 a) {
      union { StpF1 asFloat; StpU1 asBits; } pun;
      pun.asFloat = a;
      return pun.asBits;
  }
  // Type for output-array parameters: STP_RESTRICT-qualified pointer to StpF1.
  #define StpOutF2 StpF1 *STP_RESTRICT
  // Base-2 exponential, forwarded to exp2f().
  #define StpExp2F1(x) exp2f(x)
  // Larger of two floats; 'b' is returned whenever 'a > b' is false.
  STP_STATIC StpF1 StpMaxF1(StpF1 a, StpF1 b) {
      if (a > b) {
          return a;
      }
      return b;
  }
  170. //------------------------------------------------------------------------------------------------------------------------------
  171. // Convert float to half (in lower 16-bits of output).
  172. // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
  173. // Supports denormals.
  174. // Conversion rules are to make computations possibly "safer" on the GPU,
  175. // -INF & -NaN -> -65504
  176. // +INF & +NaN -> +65504
  177. STP_STATIC StpU1 StpU1_H1_F1(StpF1 f) {
  178. static StpW1 base[512] = {
  179. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  180. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  181. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  182. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  183. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  184. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
  185. 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
  186. 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
  187. 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
  188. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  189. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  190. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  191. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  192. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  193. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  194. 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
  195. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
  196. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
  197. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
  198. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
  199. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
  200. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
  201. 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
  202. 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
  203. 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
  204. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
  205. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
  206. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
  207. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
  208. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
  209. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
  210. 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff };
  211. static StpB1 shift[512] = {
  212. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  213. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  214. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  215. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  216. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  217. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  218. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
  219. 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
  220. 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
  221. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  222. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  223. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  224. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  225. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  226. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  227. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  228. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  229. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  230. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  231. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  232. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  233. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  234. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
  235. 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
  236. 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
  237. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  238. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  239. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  240. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  241. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  242. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
  243. 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18 };
  244. union { StpF1 f; StpU1 u; } bits;
  245. bits.f = f; StpU1 u = bits.u; StpU1 i = u >> 23;
  246. return (StpU1)(base[i]) + ((u & 0x7fffff) >> shift[i]); }
  247. //------------------------------------------------------------------------------------------------------------------------------
  248. STP_STATIC StpU1 StpU1_H2_F2(StpInF2 a) { return StpU1_H1_F1(a[0]) + (StpU1_H1_F1(a[1]) << 16); }
  249. #endif // defined(STP_CPU)
  250. //==============================================================================================================================
  251. #if defined(STP_GPU) && defined(STP_GLSL)
  // Boolean scalar / 2-vector.
  #define StpP1  bool
  #define StpP2  bvec2
  //------------------------------------------------------------------------------------------------------------------------------
  // Float scalar / vectors.
  #define StpF1  float
  #define StpF2  vec2
  #define StpF3  vec3
  #define StpF4  vec4
  //------------------------------------------------------------------------------------------------------------------------------
  // Signed integer 2-vector.
  #define StpI2  ivec2
  //------------------------------------------------------------------------------------------------------------------------------
  // Unsigned integer scalar / vectors.
  #define StpU1  uint
  #define StpU2  uvec2
  #define StpU3  uvec3
  #define StpU4  uvec4
  //------------------------------------------------------------------------------------------------------------------------------
  // Bit reinterpretation: StpF*_U* turns unsigned integer bits into floats, StpU*_F* turns floats into their bits.
  #define StpF1_U1(x)  uintBitsToFloat(StpU1(x))
  #define StpF2_U2(x)  uintBitsToFloat(StpU2(x))
  #define StpF3_U3(x)  uintBitsToFloat(StpU3(x))
  #define StpF4_U4(x)  uintBitsToFloat(StpU4(x))
  #define StpU1_F1(x)  floatBitsToUint(StpF1(x))
  #define StpU2_F2(x)  floatBitsToUint(StpF2(x))
  #define StpU3_F3(x)  floatBitsToUint(StpF3(x))
  #define StpU4_F4(x)  floatBitsToUint(StpF4(x))
  //------------------------------------------------------------------------------------------------------------------------------
  // Two floats <-> two halves packed in one uint, mapped directly onto the GLSL built-ins.
  #define StpU1_H2_F2  packHalf2x16
  #define StpF2_H2_U1  unpackHalf2x16
  278. //------------------------------------------------------------------------------------------------------------------------------
  // Unsigned bitfield extract: forwards to bitfieldExtract() with 'off' as the start bit and 'bits' as the bit count.
  StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) {
      int firstBit = int(off);
      int bitCount = int(bits);
      return bitfieldExtract(src, firstBit, bitCount);
  }
  // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
  // Inserts 'ins' into 'src' through bitfieldInsert() at bit offset 0 with a width of 'bits'.
  StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) {
      int bitCount = int(bits);
      return bitfieldInsert(src, ins, 0, bitCount);
  }
  282. #endif // defined(STP_GPU) && defined(STP_GLSL)
  283. //==============================================================================================================================
  284. #if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
  // Explicit 16-bit float scalar / vectors.
  #define StpH1  float16_t
  #define StpH2  f16vec2
  #define StpH3  f16vec3
  #define StpH4  f16vec4
  //------------------------------------------------------------------------------------------------------------------------------
  // Explicit 16-bit unsigned integer scalar / vectors.
  #define StpW1  uint16_t
  #define StpW2  u16vec2
  #define StpW3  u16vec3
  #define StpW4  u16vec4
  //------------------------------------------------------------------------------------------------------------------------------
  // Unpack one 32-bit uint into a pair of 16-bit integers / a pair of halves.
  #define StpW2_U1(x)  unpackUint2x16(StpU1(x))
  #define StpH2_U1(x)  unpackFloat2x16(StpU1(x))
  //------------------------------------------------------------------------------------------------------------------------------
  // Half -> 16-bit unsigned integer, through halfBitsToUint16().
  #define StpW1_H1(x)  halfBitsToUint16(StpH1(x))
  #define StpW2_H2(x)  halfBitsToUint16(StpH2(x))
  #define StpW3_H3(x)  halfBitsToUint16(StpH3(x))
  #define StpW4_H4(x)  halfBitsToUint16(StpH4(x))
  //------------------------------------------------------------------------------------------------------------------------------
  // 16-bit unsigned integer -> half, through uint16BitsToHalf().
  #define StpH1_W1(x)  uint16BitsToHalf(StpW1(x))
  #define StpH2_W2(x)  uint16BitsToHalf(StpW2(x))
  #define StpH3_W3(x)  uint16BitsToHalf(StpW3(x))
  #define StpH4_W4(x)  uint16BitsToHalf(StpW4(x))
  //------------------------------------------------------------------------------------------------------------------------------
  // Pack a pair of halves into one 32-bit uint.
  #define StpU1_H2(x)  packFloat2x16(StpH2(x))
  309. #endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
  310. //==============================================================================================================================
  311. #if defined(STP_GPU) && defined(STP_HLSL)
  // Boolean scalar / 2-vector.
  #define StpP1  bool
  #define StpP2  bool2
  //------------------------------------------------------------------------------------------------------------------------------
  // Float scalar / vectors.
  #define StpF1  float
  #define StpF2  float2
  #define StpF3  float3
  #define StpF4  float4
  //------------------------------------------------------------------------------------------------------------------------------
  // Signed integer 2-vector.
  #define StpI2  int2
  //------------------------------------------------------------------------------------------------------------------------------
  // Unsigned integer scalar / vectors.
  #define StpU1  uint
  #define StpU2  uint2
  #define StpU3  uint3
  #define StpU4  uint4
  //------------------------------------------------------------------------------------------------------------------------------
  // Bit reinterpretation: StpF*_U* goes through asfloat(), StpU*_F* goes through asuint().
  #define StpF1_U1(x)  asfloat(StpU1(x))
  #define StpF2_U2(x)  asfloat(StpU2(x))
  #define StpF3_U3(x)  asfloat(StpU3(x))
  #define StpF4_U4(x)  asfloat(StpU4(x))
  #define StpU1_F1(x)  asuint(StpF1(x))
  #define StpU2_F2(x)  asuint(StpF2(x))
  #define StpU3_F3(x)  asuint(StpF3(x))
  #define StpU4_F4(x)  asuint(StpF4(x))
  335. //------------------------------------------------------------------------------------------------------------------------------
  // Pack a float2 as two halves in one uint: f32tof16(a.x) in the low 16 bits, f32tof16(a.y) shifted into the high 16 bits.
  StpU1 StpU1_H2_F2_x(StpF2 a) {
      StpU1 lowHalf = f32tof16(a.x);
      StpU1 highHalf = f32tof16(a.y);
      return lowHalf | (highHalf << 16);
  }
  // Public form: converts its argument to StpF2 before packing.
  #define StpU1_H2_F2(a) StpU1_H2_F2_x(StpF2(a))
  338. //------------------------------------------------------------------------------------------------------------------------------
  // Unpack two halves from one uint: the low 16 bits become .x, the high 16 bits become .y.
  StpF2 StpF2_H2_U1_x(StpU1 x) {
      StpF1 fromLow = f16tof32(x & 0xFFFF);
      StpF1 fromHigh = f16tof32(x >> 16);
      return StpF2(fromLow, fromHigh);
  }
  // Public form: converts its argument to StpU1 before unpacking.
  #define StpF2_H2_U1(x) StpF2_H2_U1_x(StpU1(x))
  341. //------------------------------------------------------------------------------------------------------------------------------
  // Unsigned bitfield extract: shift 'src' right by 'off', then keep the low 'bits' bits.
  // NOTE(review): the mask is built as (1u << bits) - 1, which presumably assumes bits < 32 -- confirm no caller passes 32.
  StpU1 StpBfeU1(StpU1 src, StpU1 off, StpU1 bits) {
      StpU1 lowMask = (1u << bits) - 1;
      StpU1 shifted = src >> off;
      return shifted & lowMask;
  }
  // Bitfield insert with mask (1u << bits) - 1: the low 'bits' bits of the result come from 'ins', the rest from 'src'.
  StpU1 StpBfiMskU1(StpU1 src, StpU1 ins, StpU1 bits) {
      StpU1 lowMask = (1u << bits) - 1;
      StpU1 inserted = ins & lowMask;
      StpU1 preserved = src & (~lowMask);
      return inserted | preserved;
  }
  344. #endif // defined(STP_GPU) && defined(STP_HLSL)
  345. //==============================================================================================================================
  346. #if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
  // Medium-precision unsigned integer scalar / vectors (HLSL minimum-precision types).
  #define StpMU1  min16uint
  #define StpMU2  min16uint2
  #define StpMU3  min16uint3
  #define StpMU4  min16uint4
  //------------------------------------------------------------------------------------------------------------------------------
  // Medium-precision float scalar / vectors (HLSL minimum-precision types).
  #define StpMF1  min16float
  #define StpMF2  min16float2
  #define StpMF3  min16float3
  #define StpMF4  min16float4
  356. #endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_MEDIUM)
  357. //==============================================================================================================================
  358. #if defined(STP_GPU) && (!defined(STP_MEDIUM))
  // Without STP_MEDIUM the medium-precision unsigned integer names are plain aliases of the StpU* types.
  #define StpMU1  StpU1
  #define StpMU2  StpU2
  #define StpMU3  StpU3
  #define StpMU4  StpU4
  //------------------------------------------------------------------------------------------------------------------------------
  // Likewise the medium-precision float names alias the StpF* types.
  #define StpMF1  StpF1
  #define StpMF2  StpF2
  #define StpMF3  StpF3
  #define StpMF4  StpF4
  368. #endif // defined(STP_GPU) && (!defined(STP_MEDIUM))
  369. //==============================================================================================================================
  370. #if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
  // Explicit 16-bit float scalar / vectors.
  #define StpH1  float16_t
  #define StpH2  float16_t2
  #define StpH3  float16_t3
  #define StpH4  float16_t4
  //------------------------------------------------------------------------------------------------------------------------------
  // Explicit 16-bit unsigned integer scalar / vectors.
  #define StpW1  uint16_t
  #define StpW2  uint16_t2
  #define StpW3  uint16_t3
  #define StpW4  uint16_t4
  380. //------------------------------------------------------------------------------------------------------------------------------
  // Split one 32-bit uint into two 16-bit unsigned integers: low 16 bits -> .x, high 16 bits -> .y.
  StpW2 StpW2_U1_x(StpU1 x) {
      StpU1 lowWord = x & 0xFFFF;
      StpU1 highWord = x >> 16;
      StpU2 bothWords = StpU2(lowWord, highWord);
      return StpW2(bothWords);
  }
  // Public form: converts its argument to StpU1 first.
  #define StpW2_U1(x) StpW2_U1_x(StpU1(x))
  // Reinterpret one 32-bit uint as two halves via asfloat16(): low 16 bits -> .x, high 16 bits -> .y.
  StpH2 StpH2_U1_x(StpU1 x) {
      StpW1 lowWord = (StpW1)(x & 0xFFFF);
      StpW1 highWord = (StpW1)(x >> 16);
      return asfloat16(StpW2(lowWord, highWord));
  }
  // Public form: converts its argument to StpU1 first.
  #define StpH2_U1(x) StpH2_U1_x(StpU1(x))
  385. //------------------------------------------------------------------------------------------------------------------------------
  // Half -> 16-bit unsigned integer, through asuint16().
  #define StpW1_H1(x)  asuint16(StpH1(x))
  #define StpW2_H2(x)  asuint16(StpH2(x))
  #define StpW3_H3(x)  asuint16(StpH3(x))
  #define StpW4_H4(x)  asuint16(StpH4(x))
  //------------------------------------------------------------------------------------------------------------------------------
  // 16-bit unsigned integer -> half, through asfloat16().
  #define StpH1_W1(x)  asfloat16(StpW1(x))
  #define StpH2_W2(x)  asfloat16(StpW2(x))
  #define StpH3_W3(x)  asfloat16(StpW3(x))
  #define StpH4_W4(x)  asfloat16(StpW4(x))
  395. //------------------------------------------------------------------------------------------------------------------------------
  // Pack two halves into one 32-bit uint via asuint16(): .x lands in the low 16 bits, .y in the high 16 bits.
  StpU1 StpU1_H2_x(StpH2 x) {
      StpW2 words = asuint16(x);
      StpU1 lowPart = (StpU1)words.x;
      StpU1 highPart = ((StpU1)words.y) << 16;
      return (lowPart | highPart);
  }
  // Public form: converts its argument to StpH2 first.
  #define StpU1_H2(x) StpU1_H2_x(StpH2(x))
  398. #endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
  399. //==============================================================================================================================
  400. #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
  // Scalar float maximum, forwarded to the shading language's max().
  StpF1 StpMaxF1(StpF1 a, StpF1 b) {
      StpF1 larger = max(a, b);
      return larger;
  }
  402. //------------------------------------------------------------------------------------------------------------------------------
  // Replicate one boolean into both components of a StpP2.
  StpP2 StpP2_x(StpP1 x) {
      return StpP2(x, x);
  }
  // Public form: converts its argument to StpP1 first.
  #define StpP2_(x) StpP2_x(StpP1(x))
  405. //------------------------------------------------------------------------------------------------------------------------------
  // Float splat helpers: replicate one scalar into every component of the result.
  StpF1 StpF1_x(StpF1 x) {
      return StpF1(x);
  }
  StpF2 StpF2_x(StpF1 x) {
      return StpF2(x, x);
  }
  StpF3 StpF3_x(StpF1 x) {
      return StpF3(x, x, x);
  }
  StpF4 StpF4_x(StpF1 x) {
      return StpF4(x, x, x, x);
  }
  // Public forms: convert the argument to StpF1, then splat.
  #define StpF1_(x)  StpF1_x(StpF1(x))
  #define StpF2_(x)  StpF2_x(StpF1(x))
  #define StpF3_(x)  StpF3_x(StpF1(x))
  #define StpF4_(x)  StpF4_x(StpF1(x))
  414. //------------------------------------------------------------------------------------------------------------------------------
  // Medium-precision float splat helpers: replicate one scalar into every component of the result.
  StpMF1 StpMF1_x(StpMF1 x) {
      return StpMF1(x);
  }
  StpMF2 StpMF2_x(StpMF1 x) {
      return StpMF2(x, x);
  }
  StpMF3 StpMF3_x(StpMF1 x) {
      return StpMF3(x, x, x);
  }
  StpMF4 StpMF4_x(StpMF1 x) {
      return StpMF4(x, x, x, x);
  }
  // Public forms: convert the argument to StpMF1, then splat.
  #define StpMF1_(x)  StpMF1_x(StpMF1(x))
  #define StpMF2_(x)  StpMF2_x(StpMF1(x))
  #define StpMF3_(x)  StpMF3_x(StpMF1(x))
  #define StpMF4_(x)  StpMF4_x(StpMF1(x))
  423. //------------------------------------------------------------------------------------------------------------------------------
  // Medium-precision unsigned integer splat helpers: replicate one scalar into every component of the result.
  StpMU1 StpMU1_x(StpMU1 x) {
      return StpMU1(x);
  }
  StpMU2 StpMU2_x(StpMU1 x) {
      return StpMU2(x, x);
  }
  StpMU3 StpMU3_x(StpMU1 x) {
      return StpMU3(x, x, x);
  }
  StpMU4 StpMU4_x(StpMU1 x) {
      return StpMU4(x, x, x, x);
  }
  // Public forms: convert the argument to StpMU1, then splat.
  #define StpMU1_(x)  StpMU1_x(StpMU1(x))
  #define StpMU2_(x)  StpMU2_x(StpMU1(x))
  #define StpMU3_(x)  StpMU3_x(StpMU1(x))
  #define StpMU4_(x)  StpMU4_x(StpMU1(x))
  432. //------------------------------------------------------------------------------------------------------------------------------
  // Scalar broadcast helpers for the StpU* types: every component of the result is a copy of 'x'.
  StpU1 StpU1_x(StpU1 x) { StpU1 v = StpU1(x); return v; }
  StpU2 StpU2_x(StpU1 x) { StpU2 v = StpU2(x, x); return v; }
  StpU3 StpU3_x(StpU1 x) { StpU3 v = StpU3(x, x, x); return v; }
  StpU4 StpU4_x(StpU1 x) { StpU4 v = StpU4(x, x, x, x); return v; }
  // Macro front-ends: convert the argument to StpU1 first, then broadcast.
  #define StpU1_(x) StpU1_x(StpU1(x))
  #define StpU2_(x) StpU2_x(StpU1(x))
  #define StpU3_(x) StpU3_x(StpU1(x))
  #define StpU4_(x) StpU4_x(StpU1(x))
  441. //------------------------------------------------------------------------------------------------------------------------------
  // StpCpySgnF*: return 'd' with its sign bit replaced by the sign bit of 's' (per component for the vector forms).
  #if 0
  // Reference implementation, disabled (slow if not pattern matched by a compiler).
  // The sign bit of 'd' is cleared before the sign of 's' is OR-ed in, so this path gives the same answer as the
  // BFI path below even when 'd' is negative.
  StpF1 StpCpySgnF1(StpF1 d, StpF1 s) {
      return StpF1_U1((StpU1_F1(d) & StpU1_(0x7fffffffu)) | (StpU1_F1(s) & StpU1_(0x80000000u))); }
  StpF2 StpCpySgnF2(StpF2 d, StpF2 s) {
      return StpF2_U2((StpU2_F2(d) & StpU2_(0x7fffffffu)) | (StpU2_F2(s) & StpU2_(0x80000000u))); }
  StpF3 StpCpySgnF3(StpF3 d, StpF3 s) {
      return StpF3_U3((StpU3_F3(d) & StpU3_(0x7fffffffu)) | (StpU3_F3(s) & StpU3_(0x80000000u))); }
  StpF4 StpCpySgnF4(StpF4 d, StpF4 s) {
      return StpF4_U4((StpU4_F4(d) & StpU4_(0x7fffffffu)) | (StpU4_F4(s) & StpU4_(0x80000000u))); }
  #else
  // Faster implementation (one portable BFI).
  // NOTE(review): relies on StpBfiMskU1(src, ins, bits) keeping the upper bits of 'src' and taking the low 'bits'
  // bits from 'ins'; with bits = 31 that keeps the sign of 's' and the magnitude of 'd' - confirm against its definition.
  StpF1 StpCpySgnF1(StpF1 d, StpF1 s) { return StpF1_U1(StpBfiMskU1(StpU1_F1(s), StpU1_F1(d), StpU1_(31))); }
  // The vector forms apply the scalar version to each component in turn.
  StpF2 StpCpySgnF2(StpF2 d, StpF2 s) { return StpF2(StpCpySgnF1(d.x, s.x), StpCpySgnF1(d.y, s.y)); }
  StpF3 StpCpySgnF3(StpF3 d, StpF3 s) {
      return StpF3(StpCpySgnF1(d.x, s.x), StpCpySgnF1(d.y, s.y), StpCpySgnF1(d.z, s.z)); }
  StpF4 StpCpySgnF4(StpF4 d, StpF4 s) {
      return StpF4(StpCpySgnF1(d.x, s.x), StpCpySgnF1(d.y, s.y), StpCpySgnF1(d.z, s.z), StpCpySgnF1(d.w, s.w)); }
  #endif
  // Three-input max/min for the StpF* and StpU* types, built from two nested two-input operations.
  // The inner operation combines 'y' and 'z'; the outer one combines 'x' with that result.
  StpF1 StpMax3F1(StpF1 x, StpF1 y, StpF1 z) { StpF1 yz = max(y, z); return max(x, yz); }
  StpF2 StpMax3F2(StpF2 x, StpF2 y, StpF2 z) { StpF2 yz = max(y, z); return max(x, yz); }
  StpF3 StpMax3F3(StpF3 x, StpF3 y, StpF3 z) { StpF3 yz = max(y, z); return max(x, yz); }
  StpF4 StpMax3F4(StpF4 x, StpF4 y, StpF4 z) { StpF4 yz = max(y, z); return max(x, yz); }
  StpF1 StpMin3F1(StpF1 x, StpF1 y, StpF1 z) { StpF1 yz = min(y, z); return min(x, yz); }
  StpF2 StpMin3F2(StpF2 x, StpF2 y, StpF2 z) { StpF2 yz = min(y, z); return min(x, yz); }
  StpF3 StpMin3F3(StpF3 x, StpF3 y, StpF3 z) { StpF3 yz = min(y, z); return min(x, yz); }
  StpF4 StpMin3F4(StpF4 x, StpF4 y, StpF4 z) { StpF4 yz = min(y, z); return min(x, yz); }
  StpU1 StpMax3U1(StpU1 x, StpU1 y, StpU1 z) { StpU1 yz = max(y, z); return max(x, yz); }
  StpU1 StpMin3U1(StpU1 x, StpU1 y, StpU1 z) { StpU1 yz = min(y, z); return min(x, yz); }
  StpU4 StpMin3U4(StpU4 x, StpU4 y, StpU4 z) { StpU4 yz = min(y, z); return min(x, yz); }
  468. //------------------------------------------------------------------------------------------------------------------------------
  // Three-input max/min for the StpMF* types, built from two nested two-input operations.
  StpMF1 StpMax3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { StpMF1 yz = max(y, z); return max(x, yz); }
  StpMF2 StpMax3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { StpMF2 yz = max(y, z); return max(x, yz); }
  StpMF3 StpMax3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { StpMF3 yz = max(y, z); return max(x, yz); }
  StpMF4 StpMax3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { StpMF4 yz = max(y, z); return max(x, yz); }
  StpMF1 StpMin3MF1(StpMF1 x, StpMF1 y, StpMF1 z) { StpMF1 yz = min(y, z); return min(x, yz); }
  StpMF2 StpMin3MF2(StpMF2 x, StpMF2 y, StpMF2 z) { StpMF2 yz = min(y, z); return min(x, yz); }
  StpMF3 StpMin3MF3(StpMF3 x, StpMF3 y, StpMF3 z) { StpMF3 yz = min(y, z); return min(x, yz); }
  StpMF4 StpMin3MF4(StpMF4 x, StpMF4 y, StpMF4 z) { StpMF4 yz = min(y, z); return min(x, yz); }
  477. //------------------------------------------------------------------------------------------------------------------------------
  // Make {<+0 := -1.0, >=+0 := 1.0}.
  // Combines the bits of 'x' with 0x3f800000 (the 32-bit float pattern for 1.0) through a 31-bit StpBfiMskU1.
  // NOTE(review): assumes StpBfiMskU1 keeps the top bit of its first argument and takes the low 31 from the second.
  StpF1 StpSgnOneF1(StpF1 x) {
      StpU1 signSrc = StpU1_F1(x);
      StpU1 oneBits = StpU1_(0x3f800000);
      return StpF1_U1(StpBfiMskU1(signSrc, oneBits, StpU1_(31))); }
  480. #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
  481. //==============================================================================================================================
  482. #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
  // Scalar broadcast helpers for the 16-bit StpH* types: every component of the result is a copy of 'x'.
  StpH1 StpH1_x(StpH1 x) { StpH1 v = StpH1(x); return v; }
  StpH2 StpH2_x(StpH1 x) { StpH2 v = StpH2(x, x); return v; }
  StpH3 StpH3_x(StpH1 x) { StpH3 v = StpH3(x, x, x); return v; }
  StpH4 StpH4_x(StpH1 x) { StpH4 v = StpH4(x, x, x, x); return v; }
  // Macro front-ends: convert the argument to StpH1 first, then broadcast.
  #define StpH1_(x) StpH1_x(StpH1(x))
  #define StpH2_(x) StpH2_x(StpH1(x))
  #define StpH3_(x) StpH3_x(StpH1(x))
  #define StpH4_(x) StpH4_x(StpH1(x))
  491. //------------------------------------------------------------------------------------------------------------------------------
  // Scalar broadcast helpers for the 16-bit StpW* types: every component of the result is a copy of 'x'.
  StpW1 StpW1_x(StpW1 x) { StpW1 v = StpW1(x); return v; }
  StpW2 StpW2_x(StpW1 x) { StpW2 v = StpW2(x, x); return v; }
  StpW3 StpW3_x(StpW1 x) { StpW3 v = StpW3(x, x, x); return v; }
  StpW4 StpW4_x(StpW1 x) { StpW4 v = StpW4(x, x, x, x); return v; }
  // Macro front-ends: convert the argument to StpW1 first, then broadcast.
  #define StpW1_(x) StpW1_x(StpW1(x))
  #define StpW2_(x) StpW2_x(StpW1(x))
  #define StpW3_(x) StpW3_x(StpW1(x))
  #define StpW4_(x) StpW4_x(StpW1(x))
  500. //------------------------------------------------------------------------------------------------------------------------------
  // Three-input max/min for the 16-bit StpH* and StpW* types, built from two nested two-input operations.
  StpH1 StpMax3H1(StpH1 x, StpH1 y, StpH1 z) { StpH1 yz = max(y, z); return max(x, yz); }
  StpH2 StpMax3H2(StpH2 x, StpH2 y, StpH2 z) { StpH2 yz = max(y, z); return max(x, yz); }
  StpH3 StpMax3H3(StpH3 x, StpH3 y, StpH3 z) { StpH3 yz = max(y, z); return max(x, yz); }
  StpH4 StpMax3H4(StpH4 x, StpH4 y, StpH4 z) { StpH4 yz = max(y, z); return max(x, yz); }
  StpH1 StpMin3H1(StpH1 x, StpH1 y, StpH1 z) { StpH1 yz = min(y, z); return min(x, yz); }
  StpH2 StpMin3H2(StpH2 x, StpH2 y, StpH2 z) { StpH2 yz = min(y, z); return min(x, yz); }
  StpH3 StpMin3H3(StpH3 x, StpH3 y, StpH3 z) { StpH3 yz = min(y, z); return min(x, yz); }
  StpH4 StpMin3H4(StpH4 x, StpH4 y, StpH4 z) { StpH4 yz = min(y, z); return min(x, yz); }
  StpW1 StpMax3W1(StpW1 x, StpW1 y, StpW1 z) { StpW1 yz = max(y, z); return max(x, yz); }
  StpW1 StpMin3W1(StpW1 x, StpW1 y, StpW1 z) { StpW1 yz = min(y, z); return min(x, yz); }
  511. #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
  512. //==============================================================================================================================
  513. #if defined(STP_GPU) && defined(STP_GLSL)
  // GLSL implementations of the portable StpF* helpers.
  // Fract: fract().  Lerp: mix().  Rcp: 1.0 / x.  Rsq: inversesqrt().  Sat: clamp() to [0.0, 1.0].
  StpF1 StpFractF1(StpF1 x) { StpF1 f = fract(x); return f; }
  StpF2 StpFractF2(StpF2 x) { StpF2 f = fract(x); return f; }
  StpF3 StpFractF3(StpF3 x) { StpF3 f = fract(x); return f; }
  StpF4 StpFractF4(StpF4 x) { StpF4 f = fract(x); return f; }
  StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { StpF1 blend = mix(x, y, z); return blend; }
  StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { StpF2 blend = mix(x, y, z); return blend; }
  StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { StpF3 blend = mix(x, y, z); return blend; }
  StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { StpF4 blend = mix(x, y, z); return blend; }
  StpF1 StpRcpF1(StpF1 x) { StpF1 one = StpF1_(1.0); return one / x; }
  StpF2 StpRcpF2(StpF2 x) { StpF2 one = StpF2_(1.0); return one / x; }
  StpF3 StpRcpF3(StpF3 x) { StpF3 one = StpF3_(1.0); return one / x; }
  StpF4 StpRcpF4(StpF4 x) { StpF4 one = StpF4_(1.0); return one / x; }
  StpF1 StpRsqF1(StpF1 x) { StpF1 r = inversesqrt(x); return r; }
  StpF2 StpRsqF2(StpF2 x) { StpF2 r = inversesqrt(x); return r; }
  StpF3 StpRsqF3(StpF3 x) { StpF3 r = inversesqrt(x); return r; }
  StpF4 StpRsqF4(StpF4 x) { StpF4 r = inversesqrt(x); return r; }
  StpF1 StpSatF1(StpF1 x) { StpF1 lo = StpF1_(0.0); StpF1 hi = StpF1_(1.0); return clamp(x, lo, hi); }
  StpF2 StpSatF2(StpF2 x) { StpF2 lo = StpF2_(0.0); StpF2 hi = StpF2_(1.0); return clamp(x, lo, hi); }
  StpF3 StpSatF3(StpF3 x) { StpF3 lo = StpF3_(0.0); StpF3 hi = StpF3_(1.0); return clamp(x, lo, hi); }
  StpF4 StpSatF4(StpF4 x) { StpF4 lo = StpF4_(0.0); StpF4 hi = StpF4_(1.0); return clamp(x, lo, hi); }
  534. //------------------------------------------------------------------------------------------------------------------------------
  // GLSL implementations of the portable StpMF* helpers.
  // Fract: fract().  Lerp: mix().  Rcp: 1.0 / x.  Rsq: inversesqrt().  Sat: clamp() to [0.0, 1.0].
  StpMF1 StpFractMF1(StpMF1 x) { StpMF1 f = fract(x); return f; }
  StpMF2 StpFractMF2(StpMF2 x) { StpMF2 f = fract(x); return f; }
  StpMF3 StpFractMF3(StpMF3 x) { StpMF3 f = fract(x); return f; }
  StpMF4 StpFractMF4(StpMF4 x) { StpMF4 f = fract(x); return f; }
  StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { StpMF1 blend = mix(x, y, z); return blend; }
  StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { StpMF2 blend = mix(x, y, z); return blend; }
  StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { StpMF3 blend = mix(x, y, z); return blend; }
  StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { StpMF4 blend = mix(x, y, z); return blend; }
  StpMF1 StpRcpMF1(StpMF1 x) { StpMF1 one = StpMF1_(1.0); return one / x; }
  StpMF2 StpRcpMF2(StpMF2 x) { StpMF2 one = StpMF2_(1.0); return one / x; }
  StpMF3 StpRcpMF3(StpMF3 x) { StpMF3 one = StpMF3_(1.0); return one / x; }
  StpMF4 StpRcpMF4(StpMF4 x) { StpMF4 one = StpMF4_(1.0); return one / x; }
  StpMF1 StpRsqMF1(StpMF1 x) { StpMF1 r = inversesqrt(x); return r; }
  StpMF2 StpRsqMF2(StpMF2 x) { StpMF2 r = inversesqrt(x); return r; }
  StpMF3 StpRsqMF3(StpMF3 x) { StpMF3 r = inversesqrt(x); return r; }
  StpMF4 StpRsqMF4(StpMF4 x) { StpMF4 r = inversesqrt(x); return r; }
  StpMF1 StpSatMF1(StpMF1 x) { StpMF1 lo = StpMF1_(0.0); StpMF1 hi = StpMF1_(1.0); return clamp(x, lo, hi); }
  StpMF2 StpSatMF2(StpMF2 x) { StpMF2 lo = StpMF2_(0.0); StpMF2 hi = StpMF2_(1.0); return clamp(x, lo, hi); }
  StpMF3 StpSatMF3(StpMF3 x) { StpMF3 lo = StpMF3_(0.0); StpMF3 hi = StpMF3_(1.0); return clamp(x, lo, hi); }
  StpMF4 StpSatMF4(StpMF4 x) { StpMF4 lo = StpMF4_(0.0); StpMF4 hi = StpMF4_(1.0); return clamp(x, lo, hi); }
  555. #endif // defined(STP_GPU) && defined(STP_GLSL)
  556. //==============================================================================================================================
  557. #if defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
  // GLSL implementations of the portable 16-bit StpH* helpers.
  // Fract: fract().  Lerp: mix().  Rcp: 1.0 / x.  Rsq: inversesqrt().  Sat: clamp() to [0.0, 1.0].
  StpH1 StpFractH1(StpH1 x) { StpH1 f = fract(x); return f; }
  StpH2 StpFractH2(StpH2 x) { StpH2 f = fract(x); return f; }
  StpH3 StpFractH3(StpH3 x) { StpH3 f = fract(x); return f; }
  StpH4 StpFractH4(StpH4 x) { StpH4 f = fract(x); return f; }
  StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { StpH1 blend = mix(x, y, z); return blend; }
  StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { StpH2 blend = mix(x, y, z); return blend; }
  StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { StpH3 blend = mix(x, y, z); return blend; }
  StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { StpH4 blend = mix(x, y, z); return blend; }
  StpH1 StpRcpH1(StpH1 x) { StpH1 one = StpH1_(1.0); return one / x; }
  StpH2 StpRcpH2(StpH2 x) { StpH2 one = StpH2_(1.0); return one / x; }
  StpH3 StpRcpH3(StpH3 x) { StpH3 one = StpH3_(1.0); return one / x; }
  StpH4 StpRcpH4(StpH4 x) { StpH4 one = StpH4_(1.0); return one / x; }
  StpH1 StpRsqH1(StpH1 x) { StpH1 r = inversesqrt(x); return r; }
  StpH2 StpRsqH2(StpH2 x) { StpH2 r = inversesqrt(x); return r; }
  StpH3 StpRsqH3(StpH3 x) { StpH3 r = inversesqrt(x); return r; }
  StpH4 StpRsqH4(StpH4 x) { StpH4 r = inversesqrt(x); return r; }
  StpH1 StpSatH1(StpH1 x) { StpH1 lo = StpH1_(0.0); StpH1 hi = StpH1_(1.0); return clamp(x, lo, hi); }
  StpH2 StpSatH2(StpH2 x) { StpH2 lo = StpH2_(0.0); StpH2 hi = StpH2_(1.0); return clamp(x, lo, hi); }
  StpH3 StpSatH3(StpH3 x) { StpH3 lo = StpH3_(0.0); StpH3 hi = StpH3_(1.0); return clamp(x, lo, hi); }
  StpH4 StpSatH4(StpH4 x) { StpH4 lo = StpH4_(0.0); StpH4 hi = StpH4_(1.0); return clamp(x, lo, hi); }
  578. #endif // defined(STP_GPU) && defined(STP_GLSL) && defined(STP_16BIT)
  579. //==============================================================================================================================
  580. #if defined(STP_GPU) && defined(STP_HLSL)
  // HLSL implementations of the portable StpF* helpers.
  // Fract: x - floor(x).  Lerp: lerp().  Rcp: rcp().  Rsq: rsqrt().  Sat: saturate().
  StpF1 StpFractF1(StpF1 x) { StpF1 whole = floor(x); return x - whole; }
  StpF2 StpFractF2(StpF2 x) { StpF2 whole = floor(x); return x - whole; }
  StpF3 StpFractF3(StpF3 x) { StpF3 whole = floor(x); return x - whole; }
  StpF4 StpFractF4(StpF4 x) { StpF4 whole = floor(x); return x - whole; }
  StpF1 StpLerpF1(StpF1 x, StpF1 y, StpF1 z) { StpF1 blend = lerp(x, y, z); return blend; }
  StpF2 StpLerpF2(StpF2 x, StpF2 y, StpF2 z) { StpF2 blend = lerp(x, y, z); return blend; }
  StpF3 StpLerpF3(StpF3 x, StpF3 y, StpF3 z) { StpF3 blend = lerp(x, y, z); return blend; }
  StpF4 StpLerpF4(StpF4 x, StpF4 y, StpF4 z) { StpF4 blend = lerp(x, y, z); return blend; }
  StpF1 StpRcpF1(StpF1 x) { StpF1 r = rcp(x); return r; }
  StpF2 StpRcpF2(StpF2 x) { StpF2 r = rcp(x); return r; }
  StpF3 StpRcpF3(StpF3 x) { StpF3 r = rcp(x); return r; }
  StpF4 StpRcpF4(StpF4 x) { StpF4 r = rcp(x); return r; }
  StpF1 StpRsqF1(StpF1 x) { StpF1 r = rsqrt(x); return r; }
  StpF2 StpRsqF2(StpF2 x) { StpF2 r = rsqrt(x); return r; }
  StpF3 StpRsqF3(StpF3 x) { StpF3 r = rsqrt(x); return r; }
  StpF4 StpRsqF4(StpF4 x) { StpF4 r = rsqrt(x); return r; }
  StpF1 StpSatF1(StpF1 x) { StpF1 s = saturate(x); return s; }
  StpF2 StpSatF2(StpF2 x) { StpF2 s = saturate(x); return s; }
  StpF3 StpSatF3(StpF3 x) { StpF3 s = saturate(x); return s; }
  StpF4 StpSatF4(StpF4 x) { StpF4 s = saturate(x); return s; }
  601. //------------------------------------------------------------------------------------------------------------------------------
  // HLSL implementations of the portable StpMF* helpers.
  // Fract: x - floor(x).  Lerp: lerp().  Rcp: rcp().  Rsq: rsqrt().  Sat: saturate().
  StpMF1 StpFractMF1(StpMF1 x) { StpMF1 whole = floor(x); return x - whole; }
  StpMF2 StpFractMF2(StpMF2 x) { StpMF2 whole = floor(x); return x - whole; }
  StpMF3 StpFractMF3(StpMF3 x) { StpMF3 whole = floor(x); return x - whole; }
  StpMF4 StpFractMF4(StpMF4 x) { StpMF4 whole = floor(x); return x - whole; }
  StpMF1 StpLerpMF1(StpMF1 x, StpMF1 y, StpMF1 z) { StpMF1 blend = lerp(x, y, z); return blend; }
  StpMF2 StpLerpMF2(StpMF2 x, StpMF2 y, StpMF2 z) { StpMF2 blend = lerp(x, y, z); return blend; }
  StpMF3 StpLerpMF3(StpMF3 x, StpMF3 y, StpMF3 z) { StpMF3 blend = lerp(x, y, z); return blend; }
  StpMF4 StpLerpMF4(StpMF4 x, StpMF4 y, StpMF4 z) { StpMF4 blend = lerp(x, y, z); return blend; }
  StpMF1 StpRcpMF1(StpMF1 x) { StpMF1 r = rcp(x); return r; }
  StpMF2 StpRcpMF2(StpMF2 x) { StpMF2 r = rcp(x); return r; }
  StpMF3 StpRcpMF3(StpMF3 x) { StpMF3 r = rcp(x); return r; }
  StpMF4 StpRcpMF4(StpMF4 x) { StpMF4 r = rcp(x); return r; }
  StpMF1 StpRsqMF1(StpMF1 x) { StpMF1 r = rsqrt(x); return r; }
  StpMF2 StpRsqMF2(StpMF2 x) { StpMF2 r = rsqrt(x); return r; }
  StpMF3 StpRsqMF3(StpMF3 x) { StpMF3 r = rsqrt(x); return r; }
  StpMF4 StpRsqMF4(StpMF4 x) { StpMF4 r = rsqrt(x); return r; }
  StpMF1 StpSatMF1(StpMF1 x) { StpMF1 s = saturate(x); return s; }
  StpMF2 StpSatMF2(StpMF2 x) { StpMF2 s = saturate(x); return s; }
  StpMF3 StpSatMF3(StpMF3 x) { StpMF3 s = saturate(x); return s; }
  StpMF4 StpSatMF4(StpMF4 x) { StpMF4 s = saturate(x); return s; }
  622. #endif // defined(STP_GPU) && defined(STP_HLSL)
  623. //==============================================================================================================================
  624. #if defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
  // HLSL implementations of the portable 16-bit StpH* helpers.
  // Fract: x - floor(x).  Lerp: lerp().  Rcp: rcp().  Rsq: rsqrt().  Sat: saturate().
  StpH1 StpFractH1(StpH1 x) { StpH1 whole = floor(x); return x - whole; }
  StpH2 StpFractH2(StpH2 x) { StpH2 whole = floor(x); return x - whole; }
  StpH3 StpFractH3(StpH3 x) { StpH3 whole = floor(x); return x - whole; }
  StpH4 StpFractH4(StpH4 x) { StpH4 whole = floor(x); return x - whole; }
  StpH1 StpLerpH1(StpH1 x, StpH1 y, StpH1 z) { StpH1 blend = lerp(x, y, z); return blend; }
  StpH2 StpLerpH2(StpH2 x, StpH2 y, StpH2 z) { StpH2 blend = lerp(x, y, z); return blend; }
  StpH3 StpLerpH3(StpH3 x, StpH3 y, StpH3 z) { StpH3 blend = lerp(x, y, z); return blend; }
  StpH4 StpLerpH4(StpH4 x, StpH4 y, StpH4 z) { StpH4 blend = lerp(x, y, z); return blend; }
  StpH1 StpRcpH1(StpH1 x) { StpH1 r = rcp(x); return r; }
  StpH2 StpRcpH2(StpH2 x) { StpH2 r = rcp(x); return r; }
  StpH3 StpRcpH3(StpH3 x) { StpH3 r = rcp(x); return r; }
  StpH4 StpRcpH4(StpH4 x) { StpH4 r = rcp(x); return r; }
  StpH1 StpRsqH1(StpH1 x) { StpH1 r = rsqrt(x); return r; }
  StpH2 StpRsqH2(StpH2 x) { StpH2 r = rsqrt(x); return r; }
  StpH3 StpRsqH3(StpH3 x) { StpH3 r = rsqrt(x); return r; }
  StpH4 StpRsqH4(StpH4 x) { StpH4 r = rsqrt(x); return r; }
  StpH1 StpSatH1(StpH1 x) { StpH1 s = saturate(x); return s; }
  StpH2 StpSatH2(StpH2 x) { StpH2 s = saturate(x); return s; }
  StpH3 StpSatH3(StpH3 x) { StpH3 s = saturate(x); return s; }
  StpH4 StpSatH4(StpH4 x) { StpH4 s = saturate(x); return s; }
  645. #endif // defined(STP_GPU) && defined(STP_HLSL) && defined(STP_16BIT)
  646. //==============================================================================================================================
  647. #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
  // Base-2 exponential and logarithm wrappers for StpF1.
  StpF1 StpExp2F1(StpF1 x) { StpF1 r = exp2(x); return r; }
  StpF1 StpLog2F1(StpF1 x) { StpF1 r = log2(x); return r; }
  //------------------------------------------------------------------------------------------------------------------------------
  // Base-2 exponential and logarithm wrappers for StpMF1.
  StpMF1 StpExp2MF1(StpMF1 x) { StpMF1 r = exp2(x); return r; }
  StpMF1 StpLog2MF1(StpMF1 x) { StpMF1 r = log2(x); return r; }
  653. //------------------------------------------------------------------------------------------------------------------------------
  // 32-bit float bit patterns for -INF and +INF, reinterpreted through StpF1_U1.
  #define STP_INFN_F StpF1_U1(0xff800000u)
  #define STP_INFP_F StpF1_U1(0x7f800000u)
  // StpGtZeroF*: per component, 1.0 where x > 0.0 and 0.0 otherwise (as spelled out by the compare path below).
  // StpSignedF*: per component, 1.0 where x < 0.0 and 0.0 otherwise.
  // The 2-component StpGtZeroF2 is provided so the whole 1..4 family exists, matching StpSignedF* and the
  // StpGtZeroMF2 alias that forwards to it when medium precision is not enabled.
  #if STP_BUG_SAT_INF
  // Defined if unable to use the fast path because of problem related to saturating +/- INF.
  StpF1 StpGtZeroF1(StpF1 x) { return (x > StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
  StpF2 StpGtZeroF2(StpF2 x) { return StpF2(StpGtZeroF1(x.r), StpGtZeroF1(x.g)); }
  StpF3 StpGtZeroF3(StpF3 x) { return StpF3(StpGtZeroF1(x.r), StpGtZeroF1(x.g), StpGtZeroF1(x.b)); }
  StpF4 StpGtZeroF4(StpF4 x) { return StpF4(StpGtZeroF1(x.r), StpGtZeroF1(x.g),
                                            StpGtZeroF1(x.b), StpGtZeroF1(x.a)); }
  StpF1 StpSignedF1(StpF1 x) { return (x < StpF1_(0.0)) ? StpF1_(1.0) : StpF1_(0.0); }
  StpF2 StpSignedF2(StpF2 x) { return StpF2(StpSignedF1(x.r), StpSignedF1(x.g)); }
  StpF3 StpSignedF3(StpF3 x) { return StpF3(StpSignedF1(x.r), StpSignedF1(x.g), StpSignedF1(x.b)); }
  StpF4 StpSignedF4(StpF4 x) { return StpF4(StpSignedF1(x.r), StpSignedF1(x.g),
                                            StpSignedF1(x.b), StpSignedF1(x.a)); }
  #else
  // Fast path: scale by +INF (GtZero) or -INF (Signed), then saturate the product.
  StpF1 StpGtZeroF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFP_F)); }
  StpF2 StpGtZeroF2(StpF2 x) { return StpSatF2(x * StpF2_(STP_INFP_F)); }
  StpF3 StpGtZeroF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFP_F)); }
  StpF4 StpGtZeroF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFP_F)); }
  StpF1 StpSignedF1(StpF1 x) { return StpSatF1(x * StpF1_(STP_INFN_F)); }
  StpF2 StpSignedF2(StpF2 x) { return StpSatF2(x * StpF2_(STP_INFN_F)); }
  StpF3 StpSignedF3(StpF3 x) { return StpSatF3(x * StpF3_(STP_INFN_F)); }
  StpF4 StpSignedF4(StpF4 x) { return StpSatF4(x * StpF4_(STP_INFN_F)); }
  #endif // STP_BUG_SAT_INF
  676. //------------------------------------------------------------------------------------------------------------------------------
  // StpPrxLoSqrtF*: low precision square root approximation.
  #if STP_BUG_PRX
  // Approximation disabled: use the real sqrt().
  StpF1 StpPrxLoSqrtF1(StpF1 a) { StpF1 r = sqrt(a); return r; }
  StpF3 StpPrxLoSqrtF3(StpF3 a) { StpF3 r = sqrt(a); return r; }
  StpF4 StpPrxLoSqrtF4(StpF4 a) { StpF4 r = sqrt(a); return r; }
  #else
  // Integer approximation on the float bit pattern: shift right by one, then add the constant 0x1fbc4639.
  StpF1 StpPrxLoSqrtF1(StpF1 a) {
      StpU1 bits = StpU1_F1(a);
      bits = (bits >> StpU1_(1)) + StpU1_(0x1fbc4639);
      return StpF1_U1(bits); }
  StpF3 StpPrxLoSqrtF3(StpF3 a) {
      StpU3 bits = StpU3_F3(a);
      bits = (bits >> StpU3_(1)) + StpU3_(0x1fbc4639);
      return StpF3_U3(bits); }
  StpF4 StpPrxLoSqrtF4(StpF4 a) {
      StpU4 bits = StpU4_F4(a);
      bits = (bits >> StpU4_(1)) + StpU4_(0x1fbc4639);
      return StpF4_U4(bits); }
  #endif // STP_BUG_PRX
  686. //------------------------------------------------------------------------------------------------------------------------------
  // StpPrxLoRcpF* / StpPrxMedRcpF*: low and medium precision reciprocal approximations.
  #if STP_BUG_PRX
  // Approximations disabled: both precisions forward to the regular reciprocal.
  StpF1 StpPrxLoRcpF1(StpF1 a) { StpF1 r = StpRcpF1(a); return r; }
  StpF2 StpPrxLoRcpF2(StpF2 a) { StpF2 r = StpRcpF2(a); return r; }
  StpF3 StpPrxLoRcpF3(StpF3 a) { StpF3 r = StpRcpF3(a); return r; }
  StpF4 StpPrxLoRcpF4(StpF4 a) { StpF4 r = StpRcpF4(a); return r; }
  StpF1 StpPrxMedRcpF1(StpF1 a) { StpF1 r = StpRcpF1(a); return r; }
  StpF3 StpPrxMedRcpF3(StpF3 a) { StpF3 r = StpRcpF3(a); return r; }
  #else
  // Low precision: subtract the float bit pattern from the constant 0x7ef07ebb.
  StpF1 StpPrxLoRcpF1(StpF1 a) { StpU1 bits = StpU1_(0x7ef07ebb) - StpU1_F1(a); return StpF1_U1(bits); }
  StpF2 StpPrxLoRcpF2(StpF2 a) { StpU2 bits = StpU2_(0x7ef07ebb) - StpU2_F2(a); return StpF2_U2(bits); }
  StpF3 StpPrxLoRcpF3(StpF3 a) { StpU3 bits = StpU3_(0x7ef07ebb) - StpU3_F3(a); return StpF3_U3(bits); }
  StpF4 StpPrxLoRcpF4(StpF4 a) { StpU4 bits = StpU4_(0x7ef07ebb) - StpU4_F4(a); return StpF4_U4(bits); }
  // Medium precision: integer estimate from the constant 0x7ef19fff, refined once as est * (2 - est * a).
  StpF1 StpPrxMedRcpF1(StpF1 a) {
      StpF1 est = StpF1_U1(StpU1_(0x7ef19fff) - StpU1_F1(a));
      StpF1 corr = -est * a + StpF1_(2.0);
      return est * corr; }
  StpF3 StpPrxMedRcpF3(StpF3 a) {
      StpF3 est = StpF3_U3(StpU3_(0x7ef19fff) - StpU3_F3(a));
      StpF3 corr = -est * a + StpF3_(2.0);
      return est * corr; }
  #endif // STP_BUG_PRX
  704. //------------------------------------------------------------------------------------------------------------------------------
  // Qualifier spellings used in function signatures and declarations for the GLSL/HLSL build.
  // STP_STATIC expands to nothing but an empty comment here.
  #define STP_STATIC  /* */
  #define StpInF2     in StpF2
  #define StpInF4     in StpF4
  #define StpInOutU4  inout StpU4
  #define StpOutF2    out StpF2
  #define StpVarF2    StpF2
  711. #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL))
  712. //==============================================================================================================================
  713. #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
  // +/- INF typecast down to medium precision.
  // Defined once ahead of the conditional so the constants exist in every configuration, as the STP_INF*_F and
  // STP_INF*_H constants do.
  #define STP_INFN_MF StpMF1(StpF1_U1(0xff800000u))
  #define STP_INFP_MF StpMF1(StpF1_U1(0x7f800000u))
  // StpGtZeroMF*: per component, 1.0 where x > 0.0 and 0.0 otherwise (as spelled out by the compare path below).
  // StpSignedMF*: per component, 1.0 where x < 0.0 and 0.0 otherwise.
  // StpGtZeroMF2 is provided so this build exposes the same names as the build without medium precision, where
  // StpGtZeroMF2 exists as an alias macro.
  #if STP_BUG_SAT_INF
  // Defined if unable to use the fast path because of problem related to saturating +/- INF.
  StpMF1 StpGtZeroMF1(StpMF1 x) { return (x > StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
  StpMF2 StpGtZeroMF2(StpMF2 x) { return StpMF2(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g)); }
  StpMF3 StpGtZeroMF3(StpMF3 x) { return StpMF3(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g), StpGtZeroMF1(x.b)); }
  StpMF4 StpGtZeroMF4(StpMF4 x) { return StpMF4(StpGtZeroMF1(x.r), StpGtZeroMF1(x.g),
                                                StpGtZeroMF1(x.b), StpGtZeroMF1(x.a)); }
  StpMF1 StpSignedMF1(StpMF1 x) { return (x < StpMF1_(0.0)) ? StpMF1_(1.0) : StpMF1_(0.0); }
  StpMF2 StpSignedMF2(StpMF2 x) { return StpMF2(StpSignedMF1(x.r), StpSignedMF1(x.g)); }
  StpMF3 StpSignedMF3(StpMF3 x) { return StpMF3(StpSignedMF1(x.r), StpSignedMF1(x.g), StpSignedMF1(x.b)); }
  StpMF4 StpSignedMF4(StpMF4 x) { return StpMF4(StpSignedMF1(x.r), StpSignedMF1(x.g),
                                                StpSignedMF1(x.b), StpSignedMF1(x.a)); }
  #elif STP_BUG_SAT
  // Defined if compiler factors out saturation incorrectly: the clamp is written as explicit min/max.
  StpMF1 StpGtZeroMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFP_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
  StpMF2 StpGtZeroMF2(StpMF2 x) { return max(min(x * StpMF2_(STP_INFP_MF), StpMF2_(1.0)), StpMF2_(0.0)); }
  StpMF3 StpGtZeroMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFP_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
  StpMF4 StpGtZeroMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFP_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
  StpMF1 StpSignedMF1(StpMF1 x) { return max(min(x * StpMF1_(STP_INFN_MF), StpMF1_(1.0)), StpMF1_(0.0)); }
  StpMF2 StpSignedMF2(StpMF2 x) { return max(min(x * StpMF2_(STP_INFN_MF), StpMF2_(1.0)), StpMF2_(0.0)); }
  StpMF3 StpSignedMF3(StpMF3 x) { return max(min(x * StpMF3_(STP_INFN_MF), StpMF3_(1.0)), StpMF3_(0.0)); }
  StpMF4 StpSignedMF4(StpMF4 x) { return max(min(x * StpMF4_(STP_INFN_MF), StpMF4_(1.0)), StpMF4_(0.0)); }
  #else
  // Fast path: scale by +INF (GtZero) or -INF (Signed), then saturate the product.
  StpMF1 StpGtZeroMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFP_MF)); }
  StpMF2 StpGtZeroMF2(StpMF2 x) { return StpSatMF2(x * StpMF2_(STP_INFP_MF)); }
  StpMF3 StpGtZeroMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFP_MF)); }
  StpMF4 StpGtZeroMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFP_MF)); }
  StpMF1 StpSignedMF1(StpMF1 x) { return StpSatMF1(x * StpMF1_(STP_INFN_MF)); }
  StpMF2 StpSignedMF2(StpMF2 x) { return StpSatMF2(x * StpMF2_(STP_INFN_MF)); }
  StpMF3 StpSignedMF3(StpMF3 x) { return StpSatMF3(x * StpMF3_(STP_INFN_MF)); }
  StpMF4 StpSignedMF4(StpMF4 x) { return StpSatMF4(x * StpMF4_(STP_INFN_MF)); }
  #endif // STP_BUG_SAT_INF
  748. //------------------------------------------------------------------------------------------------------------------------------
  // The bit-level approximation cannot be used here because the actual width of the medium precision type is not
  // known, so these forward to the real sqrt().
  StpMF1 StpPrxLoSqrtMF1(StpMF1 a) { StpMF1 r = sqrt(a); return r; }
  StpMF3 StpPrxLoSqrtMF3(StpMF3 a) { StpMF3 r = sqrt(a); return r; }
  StpMF4 StpPrxLoSqrtMF4(StpMF4 a) { StpMF4 r = sqrt(a); return r; }
  753. //------------------------------------------------------------------------------------------------------------------------------
  // Medium precision reciprocal "approximations": both the low and medium variants forward to StpRcpMF*.
  StpMF1 StpPrxLoRcpMF1(StpMF1 a) { StpMF1 r = StpRcpMF1(a); return r; }
  StpMF2 StpPrxLoRcpMF2(StpMF2 a) { StpMF2 r = StpRcpMF2(a); return r; }
  StpMF3 StpPrxLoRcpMF3(StpMF3 a) { StpMF3 r = StpRcpMF3(a); return r; }
  StpMF4 StpPrxLoRcpMF4(StpMF4 a) { StpMF4 r = StpRcpMF4(a); return r; }
  StpMF1 StpPrxMedRcpMF1(StpMF1 a) { StpMF1 r = StpRcpMF1(a); return r; }
  StpMF3 StpPrxMedRcpMF3(StpMF3 a) { StpMF3 r = StpRcpMF3(a); return r; }
  760. #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_MEDIUM)
  761. //==============================================================================================================================
  762. #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
  // Without STP_MEDIUM the medium precision types are the same as the full precision ones, so every MF name
  // below is a plain alias for its F counterpart.
  #define StpGtZeroMF1(a)     StpGtZeroF1(a)
  #define StpGtZeroMF2(a)     StpGtZeroF2(a)
  #define StpGtZeroMF3(a)     StpGtZeroF3(a)
  #define StpGtZeroMF4(a)     StpGtZeroF4(a)
  #define StpSignedMF1(a)     StpSignedF1(a)
  #define StpSignedMF2(a)     StpSignedF2(a)
  #define StpSignedMF3(a)     StpSignedF3(a)
  #define StpSignedMF4(a)     StpSignedF4(a)
  //------------------------------------------------------------------------------------------------------------------------------
  // Square root approximation aliases (full precision approximations are valid because the types match).
  #define StpPrxLoSqrtMF1(a)  StpPrxLoSqrtF1(a)
  #define StpPrxLoSqrtMF3(a)  StpPrxLoSqrtF3(a)
  #define StpPrxLoSqrtMF4(a)  StpPrxLoSqrtF4(a)
  //------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal approximation aliases.
  #define StpPrxLoRcpMF1(a)   StpPrxLoRcpF1(a)
  #define StpPrxLoRcpMF2(a)   StpPrxLoRcpF2(a)
  #define StpPrxLoRcpMF3(a)   StpPrxLoRcpF3(a)
  #define StpPrxLoRcpMF4(a)   StpPrxLoRcpF4(a)
  #define StpPrxMedRcpMF1(a)  StpPrxMedRcpF1(a)
  #define StpPrxMedRcpMF3(a)  StpPrxMedRcpF3(a)
  784. #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && (!defined(STP_MEDIUM))
  785. //==============================================================================================================================
  786. #if defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
  // Base-2 exponential and logarithm wrappers for the 16-bit StpH1 type.
  StpH1 StpExp2H1(StpH1 x) { StpH1 r = exp2(x); return r; }
  StpH1 StpLog2H1(StpH1 x) { StpH1 r = log2(x); return r; }
  789. //------------------------------------------------------------------------------------------------------------------------------
  // 16-bit -INF / +INF constants.
  #if STP_BUG_ALIAS16
  // Use 32-bit aliasing to build the +/-INF, then typecast to 16-bit.
  #define STP_INFN_H StpH1(StpF1_U1(0xff800000u))
  #define STP_INFP_H StpH1(StpF1_U1(0x7f800000u))
  #else
  // Build the constants directly from their 16-bit patterns (0xfc00 and 0x7c00).
  #define STP_INFN_H StpH1_W1(StpW1_(0xfc00))
  #define STP_INFP_H StpH1_W1(StpW1_(0x7c00))
  #endif // STP_BUG_ALIAS16
  // StpGtZeroH*: per component, 1.0 where x > 0.0 and 0.0 otherwise (as spelled out by the compare path below).
  // StpSignedH*: per component, 1.0 where x < 0.0 and 0.0 otherwise.
  #if STP_BUG_SAT_INF
  // Explicit compare path, used when saturating +/- INF cannot be relied on.
  StpH1 StpGtZeroH1(StpH1 x) { if (x > StpH1_(0.0)) { return StpH1_(1.0); } return StpH1_(0.0); }
  StpH2 StpGtZeroH2(StpH2 x) { return StpH2(StpGtZeroH1(x.x), StpGtZeroH1(x.y)); }
  StpH3 StpGtZeroH3(StpH3 x) { return StpH3(StpGtZeroH1(x.x), StpGtZeroH1(x.y), StpGtZeroH1(x.z)); }
  StpH4 StpGtZeroH4(StpH4 x) {
      return StpH4(StpGtZeroH1(x.x), StpGtZeroH1(x.y), StpGtZeroH1(x.z), StpGtZeroH1(x.w)); }
  StpH1 StpSignedH1(StpH1 x) { if (x < StpH1_(0.0)) { return StpH1_(1.0); } return StpH1_(0.0); }
  StpH2 StpSignedH2(StpH2 x) { return StpH2(StpSignedH1(x.x), StpSignedH1(x.y)); }
  StpH3 StpSignedH3(StpH3 x) { return StpH3(StpSignedH1(x.x), StpSignedH1(x.y), StpSignedH1(x.z)); }
  StpH4 StpSignedH4(StpH4 x) {
      return StpH4(StpSignedH1(x.x), StpSignedH1(x.y), StpSignedH1(x.z), StpSignedH1(x.w)); }
  #elif STP_BUG_SAT
  // Same scale-by-INF idea, with the clamp to [0.0, 1.0] written as explicit min/max instead of a saturate.
  StpH1 StpGtZeroH1(StpH1 x) { StpH1 hi = min(x * StpH1_(STP_INFP_H), StpH1_(1.0)); return max(hi, StpH1_(0.0)); }
  StpH2 StpGtZeroH2(StpH2 x) { StpH2 hi = min(x * StpH2_(STP_INFP_H), StpH2_(1.0)); return max(hi, StpH2_(0.0)); }
  StpH3 StpGtZeroH3(StpH3 x) { StpH3 hi = min(x * StpH3_(STP_INFP_H), StpH3_(1.0)); return max(hi, StpH3_(0.0)); }
  StpH4 StpGtZeroH4(StpH4 x) { StpH4 hi = min(x * StpH4_(STP_INFP_H), StpH4_(1.0)); return max(hi, StpH4_(0.0)); }
  StpH1 StpSignedH1(StpH1 x) { StpH1 hi = min(x * StpH1_(STP_INFN_H), StpH1_(1.0)); return max(hi, StpH1_(0.0)); }
  StpH2 StpSignedH2(StpH2 x) { StpH2 hi = min(x * StpH2_(STP_INFN_H), StpH2_(1.0)); return max(hi, StpH2_(0.0)); }
  StpH3 StpSignedH3(StpH3 x) { StpH3 hi = min(x * StpH3_(STP_INFN_H), StpH3_(1.0)); return max(hi, StpH3_(0.0)); }
  StpH4 StpSignedH4(StpH4 x) { StpH4 hi = min(x * StpH4_(STP_INFN_H), StpH4_(1.0)); return max(hi, StpH4_(0.0)); }
  #else
  // Fast path: scale by +INF (GtZero) or -INF (Signed), then saturate the product.
  StpH1 StpGtZeroH1(StpH1 x) { StpH1 scaled = x * StpH1_(STP_INFP_H); return StpSatH1(scaled); }
  StpH2 StpGtZeroH2(StpH2 x) { StpH2 scaled = x * StpH2_(STP_INFP_H); return StpSatH2(scaled); }
  StpH3 StpGtZeroH3(StpH3 x) { StpH3 scaled = x * StpH3_(STP_INFP_H); return StpSatH3(scaled); }
  StpH4 StpGtZeroH4(StpH4 x) { StpH4 scaled = x * StpH4_(STP_INFP_H); return StpSatH4(scaled); }
  StpH1 StpSignedH1(StpH1 x) { StpH1 scaled = x * StpH1_(STP_INFN_H); return StpSatH1(scaled); }
  StpH2 StpSignedH2(StpH2 x) { StpH2 scaled = x * StpH2_(STP_INFN_H); return StpSatH2(scaled); }
  StpH3 StpSignedH3(StpH3 x) { StpH3 scaled = x * StpH3_(STP_INFN_H); return StpSatH3(scaled); }
  StpH4 StpSignedH4(StpH4 x) { StpH4 scaled = x * StpH4_(STP_INFN_H); return StpSatH4(scaled); }
  #endif // STP_BUG_SAT_INF
  828. //------------------------------------------------------------------------------------------------------------------------------
  // StpPrxLoSqrtH*: low precision square root approximation for the 16-bit types.
  #if STP_BUG_PRX
  // Approximation disabled: use the real sqrt().
  StpH1 StpPrxLoSqrtH1(StpH1 a) { StpH1 r = sqrt(a); return r; }
  StpH3 StpPrxLoSqrtH3(StpH3 a) { StpH3 r = sqrt(a); return r; }
  StpH4 StpPrxLoSqrtH4(StpH4 a) { StpH4 r = sqrt(a); return r; }
  #else
  // Integer approximation on the 16-bit pattern: shift right by one, then add the constant 0x1de2.
  StpH1 StpPrxLoSqrtH1(StpH1 a) {
      StpW1 bits = StpW1_H1(a);
      bits = (bits >> StpW1_(1)) + StpW1_(0x1de2);
      return StpH1_W1(bits); }
  StpH3 StpPrxLoSqrtH3(StpH3 a) {
      StpW3 bits = StpW3_H3(a);
      bits = (bits >> StpW3_(1)) + StpW3_(0x1de2);
      return StpH3_W3(bits); }
  StpH4 StpPrxLoSqrtH4(StpH4 a) {
      StpW4 bits = StpW4_H4(a);
      bits = (bits >> StpW4_(1)) + StpW4_(0x1de2);
      return StpH4_W4(bits); }
  #endif // STP_BUG_PRX
  838. //------------------------------------------------------------------------------------------------------------------------------
  // StpPrxLoRcpH* / StpPrxMedRcpH*: low and medium precision reciprocal approximations for the 16-bit types.
  #if STP_BUG_PRX
  // Approximations disabled: both precisions forward to the regular reciprocal.
  StpH1 StpPrxLoRcpH1(StpH1 a) { StpH1 r = StpRcpH1(a); return r; }
  StpH2 StpPrxLoRcpH2(StpH2 a) { StpH2 r = StpRcpH2(a); return r; }
  StpH3 StpPrxLoRcpH3(StpH3 a) { StpH3 r = StpRcpH3(a); return r; }
  StpH4 StpPrxLoRcpH4(StpH4 a) { StpH4 r = StpRcpH4(a); return r; }
  StpH1 StpPrxMedRcpH1(StpH1 a) { StpH1 r = StpRcpH1(a); return r; }
  StpH3 StpPrxMedRcpH3(StpH3 a) { StpH3 r = StpRcpH3(a); return r; }
  #else
  // Low precision: subtract the 16-bit pattern of the input from the constant 0x7784.
  // Note this will create denormals.
  // MAPPING (input value, input bit pattern -> result)
  // ---------------------------------------------------
  // +INF  (7c00) -> -61568
  // 65504 (7bff) -> -61600
  // 30800 (7785) -> NaN
  // 30784 (7784) -> 0 ........ (any input larger than 30784 will break)
  // 1     (3c00) -> 0.9395 ... (so not energy preserving for 1.0)
  // 0     (0000) -> 30784
  StpH1 StpPrxLoRcpH1(StpH1 a) { StpW1 bits = StpW1_(0x7784) - StpW1_H1(a); return StpH1_W1(bits); }
  StpH2 StpPrxLoRcpH2(StpH2 a) { StpW2 bits = StpW2_(0x7784) - StpW2_H2(a); return StpH2_W2(bits); }
  StpH3 StpPrxLoRcpH3(StpH3 a) { StpW3 bits = StpW3_(0x7784) - StpW3_H3(a); return StpH3_W3(bits); }
  StpH4 StpPrxLoRcpH4(StpH4 a) { StpW4 bits = StpW4_(0x7784) - StpW4_H4(a); return StpH4_W4(bits); }
  // Medium precision: integer estimate from the constant 0x778d, refined once as est * (2 - est * a).
  // Anything larger than 30928 will break in this function.
  StpH1 StpPrxMedRcpH1(StpH1 a) {
      StpH1 est = StpH1_W1(StpW1_(0x778d) - StpW1_H1(a));
      StpH1 corr = -est * a + StpH1_(2.0);
      return est * corr; }
  StpH3 StpPrxMedRcpH3(StpH3 a) {
      StpH3 est = StpH3_W3(StpW3_(0x778d) - StpW3_H3(a));
      StpH3 corr = -est * a + StpH3_(2.0);
      return est * corr; }
  #endif // STP_BUG_PRX
  866. #endif // defined(STP_GPU) && (defined(STP_GLSL) || defined(STP_HLSL)) && defined(STP_16BIT)
  867. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  868. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  869. //_____________________________________________________________.._______________________________________________________________
  870. //==============================================================================================================================
  871. // LANE REMAPPING
  872. //==============================================================================================================================
  873. #if defined(STP_GPU)
  874. // More complex remap which is safe for both portability (different wave sizes up to 128) and for 2D wave reductions.
  875. // 6543210
  876. // =======
  877. // ..xx..x
  878. // yy..yy.
  879. // Details,
  880. // LANE TO 8x16 MAPPING
  881. // ====================
  882. // 00 01 08 09 10 11 18 19
  883. // 02 03 0a 0b 12 13 1a 1b
  884. // 04 05 0c 0d 14 15 1c 1d
  885. // 06 07 0e 0f 16 17 1e 1f
  886. // 20 21 28 29 30 31 38 39
  887. // 22 23 2a 2b 32 33 3a 3b
  888. // 24 25 2c 2d 34 35 3c 3d
  889. // 26 27 2e 2f 36 37 3e 3f
  890. // .......................
  891. // ... repeat the 8x8 ....
  892. // .... pattern, but .....
  893. // .... for 40 to 7f .....
  894. // .......................
  // Remap the index 'a' to a 2D coordinate, assembling each component from two separate bit fields of 'a'.
  // NOTE(review): StpBfeU1 / StpBfiMskU1 are defined elsewhere; their exact argument meaning should be confirmed there.
  StpU2 StpRmp8x16U2(StpU1 a) {
      // Note the BFIs used for MSBs have "strange offsets" due to leaving space for the LSB bits replaced in the BFI.
      StpU1 xMsb = StpBfeU1(a, 2u, 3u);
      StpU1 xOut = StpBfiMskU1(xMsb, a, 1u);
      StpU1 yMsb = StpBfeU1(a, 3u, 4u);
      StpU1 yLsb = StpBfeU1(a, 1u, 2u);
      StpU1 yOut = StpBfiMskU1(yMsb, yLsb, 2u);
      return StpU2(xOut, yOut); }
  899. #endif // defined(STP_GPU)
  900. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  901. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  902. //_____________________________________________________________.._______________________________________________________________
  903. //==============================================================================================================================
  904. // PRESETS (DON'T CHANGE)
  905. //==============================================================================================================================
  // Preset values keyed on STP_TAA_Q. The two presets differ only in STP_GEAA_P and the two STP_TAA_PRX_LANCZOS* values.
  #if (STP_TAA_Q == 0)
  // STP_TAA_Q 0: high-end mobile.
  #define STP_GEAA_P                 1
  #define STP_GEAA_SUBPIX            (2.0 / 16.0)
  #define STP_TAA_PEN_F1             (1.0 / 4.0)
  #define STP_TAA_PEN_F0             (1.0 / 2.0)
  #define STP_TAA_PEN_W              (1.0 / 2.0)
  #define STP_TAA_PRX_LANCZOS        1
  #define STP_TAA_PRX_LANCZOS_DERING 0
  //------------------------------------------------------------------------------------------------------------------------------
  #elif (STP_TAA_Q == 1)
  // STP_TAA_Q 1: desktop.
  #define STP_GEAA_P                 3
  #define STP_GEAA_SUBPIX            (2.0 / 16.0)
  #define STP_TAA_PEN_F1             (1.0 / 4.0)
  #define STP_TAA_PEN_F0             (1.0 / 2.0)
  #define STP_TAA_PEN_W              (1.0 / 2.0)
  #define STP_TAA_PRX_LANCZOS        2
  #define STP_TAA_PRX_LANCZOS_DERING 1
  #endif // (STP_TAA_Q == 0) / (STP_TAA_Q == 1)
  927. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  928. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  929. //_____________________________________________________________.._______________________________________________________________
  930. //==============================================================================================================================
  931. // INTERNAL TUNING (DON'T CHANGE)
  932. //==============================================================================================================================
  // Clamp range for anti-flicker weighting, tuned around the range and precision challenges of FP16.
  #define STP_ANTI_MAX           8192.0
  // A minimum of '1/8192' causes known problems on some platforms that are 16-bit precision challenged.
  #define STP_ANTI_MIN           (1.0 / 4096.0)
  //------------------------------------------------------------------------------------------------------------------------------
  // Dither switches, tested with '#if' by the depth packing and motion vector packing code.
  #define STP_DITHER_DEPTH       1
  #define STP_DITHER_MOTION      1
  //------------------------------------------------------------------------------------------------------------------------------
  // BT.709 luma ratios, applied in a gamma space.
  #define STP_LUMA_R             0.2126
  #define STP_LUMA_G             0.7152
  #define STP_LUMA_B             0.0722
  // The three ratios as one comma separated list.
  #define STP_LUMA               STP_LUMA_R, STP_LUMA_G, STP_LUMA_B
  //------------------------------------------------------------------------------------------------------------------------------
  // Maximum frames of feedback.
  #define STP_FRAME_MAX          32.0
  //------------------------------------------------------------------------------------------------------------------------------
  // Motion match window in units of pixels: MIN is a motion match, MAX is no motion match.
  //  - {max=1.0} won't work for 8x area scaling (trailing edge smears).
  //  - Too tight a window leaves no slop for motion matching (the match fails easily, leading to loss of detail).
  //  - If STP_PAT_MOT_MAX is too big, edges look like they expand (or float) during change of motion.
  #define STP_PAT_MOT_MIN        (1.0 / 16.0)
  #define STP_PAT_MOT_MAX        (1.0 / 8.0)
  // Constants computed from the window: ADD is MIN squared, AMP is the reciprocal of (MAX squared minus ADD).
  #define STP_PAT_MOT_ADD        (STP_PAT_MOT_MIN * STP_PAT_MOT_MIN)
  #define STP_PAT_MOT_AMP        (1.0 / (STP_PAT_MOT_MAX * STP_PAT_MOT_MAX - STP_PAT_MOT_ADD))
  //------------------------------------------------------------------------------------------------------------------------------
  // Larger numbers ghost more, smaller numbers flicker more.
  #define STP_PAT_DEMOIRE        64.0
  // Increase for less ghosting, decrease for more ghosting.
  #define STP_PAT_SENSITIVITY    (2.0 / 16.0)
  // Amount to scale up sensitivity on responsive. Lower numbers ghost more, higher numbers flicker more.
  #define STP_PAT_RESPONSIVE     16.0
  // Minimum neighborhood (defaults to 1/32 of the maximum value of the neighborhood to allow some noise).
  #define STP_PAT_NE_MIN         (1.0 / 32.0)
  //------------------------------------------------------------------------------------------------------------------------------
  // Dilation level.
  //  {0} = default lowest dilation (higher chance of slight trailing ghost, but less overall flicker)
  //  {1} = expand a little (higher cost)
  //  {2} = expand by too much (a lot more cost, more flicker, perhaps less trailing ghost)
  // In practice dilation together with the motion match threshold (PAT_MOT) sets the final {flicker, ghost} tradeoff.
  #define STP_SAFE_DILATE        1
  //------------------------------------------------------------------------------------------------------------------------------
  // Adjusts the point at which spatial-only weights blend up and anti-flicker fully takes over.
  #define STP_TAA_SAA            (1.0 / 2.0)
  // De-weight pixel contribution for chopped corner.
  #define STP_TAA_TRI_MASK_AVOID (1.0 / 8192.0)
  979. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  980. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  981. //_____________________________________________________________.._______________________________________________________________
  982. //==============================================================================================================================
  983. // JITTER LOCATIONS
  984. //------------------------------------------------------------------------------------------------------------------------------
  985. // STP is now using Halton(2,3).
  986. //==============================================================================================================================
  // Jitter amount for a given frame index, written to 'p'.
  // Currently a stub: 'frame' is ignored and both components of 'p' are set to zero.
  STP_STATIC void StpJit(StpOutF2 p, StpU1 frame) {
      // TODO: This function isn't used inside Unity; if it is ever needed, the real implementation should be added here.
      StpF1 none = StpF1_(0.0);
      p[0] = none;
      p[1] = none; }
  992. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  993. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  994. //_____________________________________________________________.._______________________________________________________________
  995. //==============================================================================================================================
  996. // PARABOLIC {SIN,COS}
  997. //==============================================================================================================================
  998. #if defined(STP_GPU)
  // Parabolic sine approximation, evaluated per component as 'p * |p| - p'.
  // Input {-1 to 1} stands for {0 to 2 pi}; output {-1/4 to 1/4} stands for {-1 to 1}.
  void StpPSinF2(inout StpF2 p) {
      StpF2 mag = abs(p);
      p = p * mag - p; }
  // Parabolic {sin, cos} pair: on return 'p.x' holds the sine term and 'p.y' the cosine term.
  // Used to dither the position of the gather4 fetch for nearest motion vector to remove nearest artifacts when scaling.
  // Input 'p.x' is {0 to 1} representing {0 to 2 pi} ('p.y' is overwritten); output is {-1/4 to 1/4} representing {-1 to 1}.
  void StpPSinCosF(inout StpF2 p) {
      // Cosine phase is the sine phase advanced by a quarter turn, wrapped back into {0 to 1}.
      p.y = StpFractF1(p.x + StpF1_(0.25));
      // Move both phases from {0 to 1} to the {-1 to 1} range StpPSinF2() expects.
      p = p * StpF2_(2.0) - StpF2_(1.0);
      StpPSinF2(p); }
  1004. //------------------------------------------------------------------------------------------------------------------------------
  // StpMF2 form of the parabolic sine: per component 'p * |p| - p'.
  void StpPSinMF2(inout StpMF2 p) {
      StpMF2 mag = abs(p);
      p = p * mag - p; }
  // StpMF2 form of the parabolic {sin, cos} pair: 'p.x' is the input phase, 'p.y' is overwritten.
  void StpPSinCosMF(inout StpMF2 p) {
      // Cosine phase is the sine phase advanced by a quarter turn, wrapped by the fract.
      p.y = StpFractMF1(p.x + StpMF1_(0.25));
      // Rescale both phases with 'p * 2 - 1' before the parabolic sine.
      p = p * StpMF2_(2.0) - StpMF2_(1.0);
      StpPSinMF2(p); }
  1009. #endif // defined(STP_GPU)
  1010. //==============================================================================================================================
  1011. #if defined(STP_GPU) && defined(STP_16BIT)
  // StpH2 form of the parabolic sine (STP_16BIT builds only): per component 'p * |p| - p'.
  void StpPSinH2(inout StpH2 p) {
      StpH2 mag = abs(p);
      p = p * mag - p; }
  // StpH2 form of the parabolic {sin, cos} pair (STP_16BIT builds only): 'p.x' is the input phase, 'p.y' is overwritten.
  void StpPSinCosH(inout StpH2 p) {
      // Cosine phase is the sine phase advanced by a quarter turn, wrapped by the fract.
      p.y = StpFractH1(p.x + StpH1_(0.25));
      // Rescale both phases with 'p * 2 - 1' before the parabolic sine.
      p = p * StpH2_(2.0) - StpH2_(1.0);
      StpPSinH2(p); }
  1014. #endif // defined(STP_GPU) && defined(STP_16BIT)
  1015. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1016. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1017. //_____________________________________________________________.._______________________________________________________________
  1018. //==============================================================================================================================
  1019. // DEPTH ENCODING
  1020. //------------------------------------------------------------------------------------------------------------------------------
  1021. // Using a log2() based encoding, takes {0 to inf} to {0 to 1}.
  1022. // log2(k.x*z)*k.y
  1023. // Where
  1024. // k.x = 1/near ............ (so that k0*z is 1 when z=near)
  1025. // k.y = 1/log2(k.x*far) ... (so that output is {0 to 1} ranged)
  1026. //------------------------------------------------------------------------------------------------------------------------------
  1027. // And the inverse
  1028. // exp2(x*k.x)*k.y
  1029. // Where
  1030. // k.x = log2(far/near)
  1031. // k.y = near
  1032. //==============================================================================================================================
  1033. #if defined(STP_GPU)
  // Builds the depth encoding constants from the near and far planes.
  //  result.x = 1/near
  //  result.y = 1/log2(far/near)
  // The 'far' is where anything more distant clamps to 1.0.
  StpF2 StpZCon(StpF1 near, StpF1 far) {
      StpF1 rcpNear = StpRcpF1(near);
      StpF2 con;
      con.x = rcpNear;
      con.y = StpRcpF1(log2(rcpNear * far));
      return con; }
  1041. //------------------------------------------------------------------------------------------------------------------------------
  // Encodes depth 'z' as 'saturate(log2(k.x * z) * k.y)'.
  //  z ..... depth to encode
  //  k ..... constants generated by StpZCon()
  //  dit ... dither value, only read when STP_DITHER_DEPTH is 1
  // STP_DITHER_DEPTH selects the variant. Any value other than 0 or 1 is rejected at compile time, so the function can
  // never be left without a return statement.
  StpF1 StpZPack(StpF1 z, StpF2 k, StpF1 dit) {
  #if (STP_DITHER_DEPTH == 0)
      return StpSatF1(log2(k.x * z) * k.y);
  #elif (STP_DITHER_DEPTH == 1)
      // Fast linearly incorrect dither for 10-bit: 'dit' scaled to one 1/1024 step, then offset by half a step.
      return StpSatF1(log2(k.x * z) * k.y + dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0));
  #else
      #error STP_DITHER_DEPTH must be 0 or 1
  #endif // (STP_DITHER_DEPTH)
  }
  1052. //==============================================================================================================================
  // Builds the depth decoding constants from the near and far planes.
  //  result.x = log2(far/near)
  //  result.y = near
  // The 'far' is where anything more distant clamps to 1.0.
  StpF2 StpZUnCon(StpF1 near, StpF1 far) {
      StpF2 con;
      con.y = near;
      con.x = log2(far * StpRcpF1(near));
      return con; }
  1060. //------------------------------------------------------------------------------------------------------------------------------
  // Decodes an encoded depth 'x' as 'exp2(x * k.x) * k.y', where 'k' is generated by StpZUnCon().
  StpF1 StpZUnpack(StpF1 x, StpF2 k) {
      StpF1 ratio = exp2(x * k.x);
      return ratio * k.y; }
  1063. #endif // defined(STP_GPU)
  1064. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1065. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1066. //_____________________________________________________________.._______________________________________________________________
  1067. //==============================================================================================================================
  1068. // STATIC GEOMETRY MOTION FORWARD PROJECTION
  1069. //==============================================================================================================================
  1070. // This is a separate section simply for documentation.
  1071. // This logic must be computed in 32-bit precision (in theory).
  1072. //------------------------------------------------------------------------------------------------------------------------------
  1073. // MOTION MATCH NOTES
  1074. // ==================
  1075. // - The 'position - motion' is the reprojected position.
  1076. // - Where {0 to 1} is no motion to a screen in motion.
  1077. // - Motion check works with a differential vector '((motionPrior - motionCurrent) * kC)'.
  1078. // - For static forward projection it will be '((motionPrior*0.5 - motionCurrent) * kC)'.
  1079. // - Due to motionPrior being in {-1 to 1} NDC instead of {0 to 1} for screen.
  1080. // - Working with motion vector differences to avoid complexity with jitter.
  1081. //------------------------------------------------------------------------------------------------------------------------------
  1082. // MOTION VECTOR NOTES
  1083. // ===================
  1084. // - 'reprojection = position - motion'
  1085. // - 'reprojection + motion = position'
  1086. // - 'motion = position - reprojection'
  1087. // - So motion points forward.
  1088. //------------------------------------------------------------------------------------------------------------------------------
  1089. // FORWARD PROJECTION LOGIC
  1090. // ========================
  1091. // HAVE INPUT {0 TO 1} SCREEN POSITION
  1092. // xy
  1093. // GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END]
  1094. // x=x*2-1
  1095. // y=y*2-1
  1096. // HAVE INPUT {0 TO INF} DEPTH
  1097. // z
  1098. // GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA]
  1099. // xx=x*((z*g+h)/a) ... xx=x*(z*(g/a)+(h/a)) ... xx=x*(z*k0+k1)
  1100. // yy=y*((z*g+h)/b) ... yy=y*(z*(g/b)+(h/b)) ... yy=y*(z*k2+k3)
  1101. // TRANSFORM TO NEW VIEW
  1102. // xxx=xx*i+yy*j+z*k+l
  1103. // yyy=xx*m+yy*n+z*o+p
  1104. // zzz=xx*q+yy*r+z*s+t
  1105. // PROJECTION [9 FMA]
  1106. // xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*k4+yy*k5+z*k6+k7
  1107. // yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*k8+yy*k9+z*kA+kB
  1108. // wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kC+yy*kD+z*kE+kF
  1109. // PERSPECTIVE DIVIDE [1 RCP]
  1110. // xxxxx=xxxx/wwww
  1111. // yyyyy=yyyy/wwww
  1112. // SUBTRACT TO GET 2X MOTION [2 FMA]
  1113. // u=xxxxx-x ... u=xxxx*(1/wwww)-x
  1114. // v=yyyyy-y ... v=yyyy*(1/wwww)-y
  1115. // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
  1116. // k0=g/a ... Constants {a,b,c,d,g,h} for prior projection
  1117. // k1=h/a
  1118. // k2=g/b
  1119. // k3=h/b
  1120. // k4=i*a ... Constants {a,b,c,d,g,h} for next projection
  1121. // k5=j*a
  1122. // k6=k*a
  1123. // k7=l*a
  1124. // k8=m*b
  1125. // k9=n*b
  1126. // kA=o*b
  1127. // kB=p*b
  1128. // kC=q*g
  1129. // kD=r*g
  1130. // kE=s*g
  1131. // kF=t*g+h
  1132. //------------------------------------------------------------------------------------------------------------------------------
  1133. // BACKWARD PROJECTION LOGIC
  1134. // =========================
  1135. // This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants.
  1136. // TRANSFORM TO NEW VIEW
  1137. // xxx=xx*i+yy*j+z*k+l
  1138. // yyy=xx*m+yy*n+z*o+p
  1139. // zzz=xx*q+yy*r+z*s+t
  1140. // PROJECTION [9 FMA]
  1141. // xxxx=xxx*a ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) ..... xxxx=xx*kG+yy*kH+z*kI+kJ
  1142. // yyyy=yyy*b ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) ..... yyyy=xx*kK+yy*kL+z*kM+kN
  1143. // wwww=zzz*g+h ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h) ... wwww=xx*kO+yy*kP+z*kQ+kR
  1144. // PERSPECTIVE DIVIDE [1 RCP]
  1145. // xxxxx=xxxx/wwww
  1146. // yyyyy=yyyy/wwww
  1147. // SUBTRACT TO GET 2X MOTION [2 FMA]
  1148. // u=xxxxx-x ... u=xxxx*(1/wwww)-x
  1149. // v=yyyyy-y ... v=yyyy*(1/wwww)-y
  1150. // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
  1151. // kG=i*a ... Constants {a,b,c,d,g,h} for previous prior projection, and {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
  1152. // kH=j*a
  1153. // kI=k*a
  1154. // kJ=l*a
  1155. // kK=m*b
  1156. // kL=n*b
  1157. // kM=o*b
  1158. // kN=p*b
  1159. // kO=q*g
  1160. // kP=r*g
  1161. // kQ=s*g
  1162. // kR=t*g+h
  1163. //==============================================================================================================================
  1164. // GET FROM {0 TO 1} TO {-1 TO 1}
  1165. // ==============================
  1166. // - Get to NDC for {x,y}
  1167. // X:=x*2-1
  1168. // Y:=y*2-1
  1169. //------------------------------------------------------------------------------------------------------------------------------
  1170. // FORWARD VIEW
  1171. // ============
  1172. // - Using 12 values
  1173. // X:=x*i+y*j+z*k+l
  1174. // Y:=x*m+y*n+z*o+p
  1175. // Z:=x*q+y*r+z*s+t
  1176. // W:=1
  1177. // i j k l
  1178. // m n o p
  1179. // q r s t
  1180. // 0 0 0 1
  1181. //------------------------------------------------------------------------------------------------------------------------------
  1182. // PROJECTIONS
  1183. // ===========
  1184. // - INPUTS
  1185. // n ... near plane z
  1186. // f ... far plane z
  1187. // - DX ORTHO PROJECTION
  1188. // c:=1/(f-n)
  1189. // d:=-n/(f-n)
  1190. // X:=x*a
  1191. // Y:=y*b
  1192. // Z:=z*c+d ... (w=1 on input)
  1193. // W:=1
  1194. // a 0 0 0
  1195. // 0 b 0 0
  1196. // 0 0 c d
  1197. // 0 0 0 1
  1198. // - DX PERSPECTIVE PROJECTION (LEFT HANDED)
  1199. // c:=f/(f-n)
  1200. // d:=-(f*n)/(f-n)
  1201. // X:=x*a
  1202. // Y:=y*b
  1203. // Z:=z*c+d ... (w=1 on input)
  1204. // W:=z
  1205. // a 0 0 0
  1206. // 0 b 0 0
  1207. // 0 0 c d
  1208. // 0 0 1 0 ... (note DX allows the 1 to be non-one)
  1209. // - DX PERSPECTIVE PROJECTION REVERSED FOR BETTER PRECISION (LEFT HANDED)
  1210. // c:=-n/(f-n)
  1211. // d:=(f*n)/(f-n)
  1212. // X:=x*a
  1213. // Y:=y*b
  1214. // Z:=z*c+d ... (w=1 on input)
  1215. // W:=z
  1216. // a 0 0 0
  1217. // 0 b 0 0
  1218. // 0 0 c d
  1219. // 0 0 1 0
  1220. // - DX PERSPECTIVE PROJECTION REVERSED WITH INF FAR (LEFT HANDED)
  1221. // X:=x*a
  1222. // Y:=y*b
  1223. // Z:=n ... (w=1 on input)
  1224. // W:=z
  1225. // a 0 0 0
  1226. // 0 b 0 0
  1227. // 0 0 0 n
  1228. // 0 0 1 0
  1229. // - GL PERSPECTIVE PROJECTION
  1230. // c:=-(f+n)/(f-n)
  1231. // d:=-(2fn)/(f-n)
  1232. // X:=x*a
  1233. // Y:=y*b
  1234. // Z:=z*c+d ... (w=1 on input)
  1235. // W:=z
  1236. // a 0 0 0
  1237. // 0 b 0 0
  1238. // 0 0 c d
  1239. // 0 0 -1 0
  1240. // - GENERALIZED (WILL DO ANYTHING)
  1241. // X:=x*a
  1242. // Y:=y*b
  1243. // Z:=z*c+d ... (w=1 on input)
  1244. // W:=z*g+h
  1245. // a 0 0 0
  1246. // 0 b 0 0
  1247. // 0 0 c d
  1248. // 0 0 g h
  1249. //------------------------------------------------------------------------------------------------------------------------------
  1250. // PROJECTED TO NDC
  1251. // ================
  1252. // - Ignoring viewport transform
  1253. // X:=x/w
  1254. // Y:=y/w
  1255. // Z:=z/w
  1256. // W:=1/w
  1257. // - Inverse
  1258. // x=X*w
  1259. // y=Y*w
  1260. //==============================================================================================================================
  1261. // MODIFICATIONS FOR COMPLEX PROJECTIONS
  1262. //------------------------------------------------------------------------------------------------------------------------------
  1263. // Since this worked out to just 2 more FMAs and 2 more constants, decided not to create a shader permutation.
  1264. //==============================================================================================================================
  1265. // COMPLEX PROJECTION
  1266. // ==================
  1267. // - GL PERSPECTIVE PROJECTION - WITH Z BASED {X,Y} MODIFICATIONS
  1268. // c:=-(F+N)/(F-N)
  1269. // d:=-(2FN)/(F-N)
  1270. // X:=x*a + z*e
  1271. // Y:=y*b + z*f
  1272. // Z:=z*c+d ... (w=1 on input)
  1273. // W:=z
  1274. // a 0 e 0
  1275. // 0 b f 0
  1276. // 0 0 c d
  1277. // 0 0 -1 0
  1278. // - GENERALIZED (WILL DO ANYTHING) - WITH Z BASED {X,Y} MODIFICATIONS
  1279. // X:=x*a + z*e
  1280. // Y:=y*b + z*f
  1281. // Z:=z*c+d ... (w=1 on input)
  1282. // W:=z*g+h
  1283. // a 0 e 0
  1284. // 0 b f 0
  1285. // 0 0 c d
  1286. // 0 0 g h
  1287. // - INVERSE GIVEN 'z'
  1288. // X:=x*a + z*e
  1289. // Y:=y*b + z*f
  1290. // X - z*e:=x*a
  1291. // Y - z*f:=y*b
  1292. // X/a - z*e/a:=x
  1293. // Y/b - z*f/b:=y
  1294. //------------------------------------------------------------------------------------------------------------------------------
  1295. // FORWARD PROJECTION LOGIC
  1296. // ========================
  1297. // HAVE INPUT {0 TO 1} SCREEN POSITION
  1298. // xy
  1299. // GET XY INTO {-1 TO 1} NDC [2 FMA, CANNOT FACTOR, NEED AT END]
  1300. // x=x*2-1
  1301. // y=y*2-1
  1302. // HAVE INPUT {0 TO INF} DEPTH
  1303. // z
  1304. // GET FROM {XY NDC, DEPTH} TO 3D VIEW POSITION [4 FMA]
  1305. // ... have {X,Y,z}
  1306. // ... xx=(x*(z*g+h))*(1/a) + z*(e/a)
  1307. // ... yy=(y*(z*g+h))*(1/b) + z*(f/b)
  1308. // ... xx=x*((z*g+h)/a) + z*(e/a)
  1309. // ... yy=y*((z*g+h)/b) + z*(f/b)
  1310. // ... xx=x*(z*(g/a)+(h/a)) + z*(e/a)
  1311. // ... yy=y*(z*(g/b)+(h/b)) + z*(f/b)
  1312. // xx=x*(z*k0+k1)+z*k2
  1313. // yy=y*(z*k3+k4)+z*k5
  1314. // TRANSFORM TO NEW VIEW
  1315. // xxx=xx*i+yy*j+z*k+l
  1316. // yyy=xx*m+yy*n+z*o+p
  1317. // zzz=xx*q+yy*r+z*s+t
  1318. // PROJECTION [9 FMA]
  1319. // xxxx=xxx*a+zzz*e
  1320. // ... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e)
  1321. // ... xxxx=xx*k6+yy*k7+z*k8+k9
  1322. // yyyy=yyy*b+zzz*f
  1323. // ... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f)
  1324. // ... yyyy=xx*kA+yy*kB+z*kC+kD
  1325. // wwww=zzz*g+h
  1326. // ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h)
  1327. // ... wwww=xx*kE+yy*kF+z*kG+kH
  1328. // PERSPECTIVE DIVIDE [1 RCP]
  1329. // xxxxx=xxxx/wwww
  1330. // yyyyy=yyyy/wwww
  1331. // SUBTRACT TO GET 2X MOTION [2 FMA]
  1332. // u=xxxxx-x ... u=xxxx*(1/wwww)-x
  1333. // v=yyyyy-y ... v=yyyy*(1/wwww)-y
  1334. // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
  1335. // k0=g/a ... Constants {a,b,c,d,e,f,g,h} for prior projection
  1336. // k1=h/a
  1337. // k2=e/a
  1338. // k3=g/b
  1339. // k4=h/b
  1340. // k5=f/b
  1341. // k6=(i*a)+(q*e) ... Constants {a,b,c,d,e,f,g,h} for next projection
  1342. // k7=(j*a)+(r*e)
  1343. // k8=(k*a)+(s*e)
  1344. // k9=(l*a)+(t*e)
  1345. // kA=(m*b)+(q*f)
  1346. // kB=(n*b)+(r*f)
  1347. // kC=(o*b)+(s*f)
  1348. // kD=(p*b)+(t*f)
  1349. // kE=q*g
  1350. // kF=r*g
  1351. // kG=s*g
  1352. // kH=t*g+h
  1353. //------------------------------------------------------------------------------------------------------------------------------
  1354. // BACKWARD PROJECTION LOGIC
  1355. // =========================
  1356. // This starts from '3D VIEW POSITION' of 'FORWARD PROJECTION LOGIC', but with different constants.
  1357. // TRANSFORM TO NEW VIEW
  1358. // xxx=xx*i+yy*j+z*k+l
  1359. // yyy=xx*m+yy*n+z*o+p
  1360. // zzz=xx*q+yy*r+z*s+t
  1361. // PROJECTION [9 FMA]
  1362. // xxxx=xxx*a+zzz*e
  1363. // ..... xxxx=xx*(i*a)+yy*(j*a)+z*(k*a)+(l*a) + xx*(q*e)+yy*(r*e)+z*(s*e)+(t*e)
  // ..... xxxx=xx*kI+yy*kJ+z*kK+kL
  1365. // yyyy=yyy*b+zzz*f
  1366. // ..... yyyy=xx*(m*b)+yy*(n*b)+z*(o*b)+(p*b) + xx*(q*f)+yy*(r*f)+z*(s*f)+(t*f)
  1367. // ..... yyyy=xx*kM+yy*kN+z*kO+kP
  1368. // wwww=zzz*g+h
  1369. // ... wwww=xx*(q*g)+yy*(r*g)+z*(s*g)+(t*g+h)
  1370. // ... wwww=xx*kQ+yy*kR+z*kS+kT
  1371. // PERSPECTIVE DIVIDE [1 RCP]
  1372. // xxxxx=xxxx/wwww
  1373. // yyyyy=yyyy/wwww
  1374. // SUBTRACT TO GET 2X MOTION [2 FMA]
  1375. // u=xxxxx-x ... u=xxxx*(1/wwww)-x
  1376. // v=yyyyy-y ... v=yyyy*(1/wwww)-y
  1377. // CONSTANTS (SEE BELOW FOR MEANING OF VARIABLES)
  1378. // ... Constants {a,b,c,d,e,f,g,h} for previous prior projection
  1379. // ... Constants {i,j,k,l,m,n,o,p,q,r,s,t} for prior back projection
  1380. // kI=(i*a)+(q*e)
  1381. // kJ=(j*a)+(r*e)
  1382. // kK=(k*a)+(s*e)
  1383. // kL=(l*a)+(t*e)
  1384. // kM=(m*b)+(q*f)
  1385. // kN=(n*b)+(r*f)
  1386. // kO=(o*b)+(s*f)
  1387. // kP=(p*b)+(t*f)
  1388. // kQ=q*g
  1389. // kR=r*g
  1390. // kS=s*g
  1391. // kT=t*g+h
  1392. //==============================================================================================================================
  1393. #if defined(STP_GPU)
  // Generates the {-1 to 1} NDC forward projection vector for a pixel taken from the prior frame.
  //  p .............. {0 to 1} screen position
  //  z .............. {0 to INF} depth
  //  m .............. {0 to 1} prior motion vector
  //  kMotionMatch ... scale applied to the dynamic motion estimate
  //  k0123 to kST ... packed constants k0 through kT, four per vector (two in kST), in the order used by the
  //                   complex projection notes above (k0-k5 view position, k6-kH forward, kI-kT backward)
  //  bugF ........... receives the static geometry forward estimate
  //  bugD ........... receives the dynamic motion estimate
  // Returns 'bugF + bugD'.
  // The result is approximately corrected for dynamic motion using
  //  'dynamicMotion = priorMotionVector - priorStaticGeometryBackprojection'
  // which is then added to the static geometry forward projection vector.
  StpF2 StpFor(StpF2 p, StpF1 z, StpF2 m, StpF1 kMotionMatch,
  StpF4 k0123, StpF4 k4567, StpF4 k89AB, StpF4 kCDEF, StpF4 kGHIJ, StpF4 kKLMN, StpF4 kOPQR, StpF2 kST,
  out StpF2 bugF, out StpF2 bugD){
      // Screen {0 to 1} to NDC {-1 to 1}.
      StpF2 ndc = p * StpF2_(2.0) - StpF2_(1.0);
      // {NDC, depth} to 3D view position (k0 to k5).
      StpF2 view;
      view.x = ndc.x * (z * k0123.x + k0123.y) + (z * k0123.z);
      view.y = ndc.y * (z * k0123.w + k4567.x) + (z * k4567.y);
      // Forward transform and projection (k6 to kH), keeping the reciprocal of W for the perspective divide.
      StpF2 fwd;
      fwd.x = view.x * k4567.z + view.y * k4567.w + z * k89AB.x + k89AB.y;
      fwd.y = view.x * k89AB.z + view.y * k89AB.w + z * kCDEF.x + kCDEF.y;
      StpF1 fwdRcpW = StpRcpF1(view.x * kCDEF.z + view.y * kCDEF.w + z * kGHIJ.x + kGHIJ.y);
      // Backward transform and projection (kI to kT), same shape with the second constant set.
      StpF2 back;
      back.x = view.x * kGHIJ.z + view.y * kGHIJ.w + z * kKLMN.x + kKLMN.y;
      back.y = view.x * kKLMN.z + view.y * kKLMN.w + z * kOPQR.x + kOPQR.y;
      StpF1 backRcpW = StpRcpF1(view.x * kOPQR.z + view.y * kOPQR.w + z * kST.x + kST.y);
      // Motion vector points forward (to the estimated position in the next frame).
      // Negative motion vector points back to where the pixel was in the prior frame.
      // Motion vector is {0 to 1} for one screen, but this logic is {-1 to 1} based (hence the 2x scaling of 'm').
      bugF = (fwd * StpF2_(fwdRcpW) - ndc); // Static forward estimate.
      bugD = ((StpF2_(2.0) * m) - (ndc - back * StpF2_(backRcpW))) * StpF2_(kMotionMatch); // Dynamic estimate.
      return bugF + bugD; }
  1425. #endif // defined(STP_GPU)
  1426. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1427. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1428. //_____________________________________________________________.._______________________________________________________________
  1429. //==============================================================================================================================
  1430. // MOTION VECTOR ENCODING
  1431. //------------------------------------------------------------------------------------------------------------------------------
  1432. // {MSB 10-bit depth, LSB {11,11}-bit motion with sqrt() encoding}
  // Motion is encoded in sqrt() space.
  1434. //------------------------------------------------------------------------------------------------------------------------------
  1435. // 11111111111111110000000000000000
  1436. // fedcba9876543210fedcba9876543210
  1437. // ================================
  1438. // zzzzzzzzzz...................... 10-bit encoded z
  1439. // ..........yyyyyyyyyyy........... 11-bit {-1 to <1} y encoded in gamma 2.0 (sqrt)
  1440. // .....................xxxxxxxxxxx 11-bit {-1 to <1} x encoded in gamma 2.0 (sqrt)
  1441. //------------------------------------------------------------------------------------------------------------------------------
  1442. // The 32-bit path is 8 ops to decode {x,y}.
  1443. //------------------------------------------------------------------------------------------------------------------------------
  1444. // There once was a 16-bit path which takes 6 ops to decode (bit extra because ABS isn't free).
  1445. // hhhhhhhhhhhhhhhhllllllllllllllll
  1446. // ================================
  1447. // zzzzzzzzzzyyyyyyyyyyyxxxxxxxxxxx input
  1448. // zzzzzyyyyyyyyyyyxxxxxxxxxxx00000 << 5
  1449. // 00000yyyyyyyyyyyxxxxxxxxxxx00000 & 0x7FFFFFF
  1450. // 00000yyyyyyyyyyy00000xxxxxxxxxxx >> 5 (for 16-bit LSB only)
  1451. // This gets 11-bit integers which perfectly alias lowest non-denormal and denormals of FP16.
  1452. // Can scale by '16384' and subtract 1 to decompress without a CVT.
  1453. //==============================================================================================================================
  1454. #if defined(STP_GPU)
  // Packs depth and motion into one 32-bit word: {10-bit z, 11-bit y, 11-bit x}, with z in the top bits.
  //  z ..... depth, expected in {0 to 1}
  //  v ..... motion, must range inside and including {-1 to 1}
  //  dit ... dither value, only read when STP_DITHER_MOTION is enabled
  StpU1 StpMvPack(StpF1 z, StpF2 v, StpF1 dit) {
      // Magnitude from linear to gamma 2.0.
      StpF2 mag = sqrt(abs(v));
  #if STP_DITHER_MOTION
      // Dither by 'dit' scaled to one 1/1024 step, offset by half a step, then saturate.
      mag = StpSatF2(mag + StpF2_(dit * StpF1_(1.0 / 1024.0) - StpF1_(0.5 / 1024.0)));
  #endif
      // Restore the sign of the input, giving gamma 2.0 {-1 to 1}.
      StpF2 enc = StpCpySgnF2(mag, v);
      // Limit to {-1024/1024 to 1023/1024}.
      enc = min(enc, StpF2_(1023.0/1024.0));
      // Encode to 11-bit with zero at center of one step.
      enc = enc * StpF2_(1024.0) + StpF2_(1024.0);
      // Pack the three fields.
      StpU1 zBits = StpU1(z * StpF1(1023.0)) << StpU1(22);
      StpU1 yBits = StpU1(enc.y) << StpU1(11);
      StpU1 xBits = StpU1(enc.x);
      return zBits + yBits + xBits; }
  1470. //------------------------------------------------------------------------------------------------------------------------------
  // Unpacks both depth and motion from a word produced by StpMvPack().
  //  z ... receives the 10-bit depth field scaled by 1/1023
  //  v ... receives the motion, decoded from the two 11-bit fields and returned to linear space
  //  i ... packed input
  void StpMvUnpack(out StpF1 z, out StpF2 v, StpU1 i) {
      // Field extraction. All offsets and widths are unsigned literals so no call relies on an int to uint conversion.
      StpU1 iz = StpBfeU1(i, 22u, 10u);
      StpU1 iy = StpBfeU1(i, 11u, 11u);
      StpU1 ix = StpBfeU1(i, 0u, 11u);
      z = StpF1(iz) * StpF1_(1.0 / 1023.0);
      // 11-bit integer to gamma 2.0 {-1 to <1}.
      v.y = StpF1(iy) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
      v.x = StpF1(ix) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
      // Undo the sqrt() encoding while keeping the sign: 'v * |v|'.
      v *= abs(v); }
  1480. //------------------------------------------------------------------------------------------------------------------------------
  // Unpacks only the motion from a word produced by StpMvPack(); the depth field is ignored.
  //  v ... receives the motion, decoded from the two 11-bit fields and returned to linear space
  //  i ... packed input
  void StpMvUnpackV(out StpF2 v, StpU1 i) {
      // Field extraction. All offsets and widths are unsigned literals so no call relies on an int to uint conversion.
      StpU1 iy = StpBfeU1(i, 11u, 11u);
      StpU1 ix = StpBfeU1(i, 0u, 11u);
      // 11-bit integer to gamma 2.0 {-1 to <1}.
      v.y = StpF1(iy) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
      v.x = StpF1(ix) * StpF1_(1.0 / 1024.0) + StpF1_(-1.0);
      // Undo the sqrt() encoding while keeping the sign: 'v * |v|'.
      v *= abs(v); }
  1488. #endif // defined(STP_GPU)
  1489. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1490. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1491. //_____________________________________________________________.._______________________________________________________________
  1492. //==============================================================================================================================
  1493. // COLOR CONVERSION
  1494. //==============================================================================================================================
  1495. #if defined(STP_GPU)
  1496. // Scaling in the reversible tonemapper (should be >= 1).
  1497. // Getting too close to 1.0 will result in luma inversions in highly saturated content in the oldest algorithm.
  1498. // Using 4.0 or ideally 8.0 is recommended.
  // The StpTone*() functions in this file add STP_SAT to the input (or to the peak channel) before taking the
  // reciprocal, and the StpToneInv*() functions use 1.0 / STP_SAT, so both directions must see the same value.
  1499. #define STP_SAT 4.0
  1500. #endif // defined(STP_GPU)
  1501. //==============================================================================================================================
  1502. #if defined(STP_GPU) && defined(STP_32BIT)
  // Scalar tonemap: x becomes saturate(x / (STP_SAT + x)).
  void StpToneF1(inout StpF1 x) {
    StpF1 rcpDen = StpRcpF1(StpF1_(STP_SAT) + x);
    x = StpSatF1(x * StpF1_(rcpDen)); }
  1504. //------------------------------------------------------------------------------------------------------------------------------
  // Reversible tonemapper: every channel is divided by (STP_SAT + largest channel), then saturated.
  void StpToneF3(inout StpF3 x) {
    StpF1 peak = StpMax3F1(x.r, x.g, x.b);
    StpF1 rcpDen = StpRcpF1(StpF1_(STP_SAT) + peak);
    x = StpSatF3(x * StpF3_(rcpDen)); }
  1509. //------------------------------------------------------------------------------------------------------------------------------
  // Inverse of StpToneF3(): every channel is divided by (1 - largest channel) / STP_SAT.
  void StpToneInvF3(inout StpF3 x) {
    StpF1 peak = StpMax3F1(x.r, x.g, x.b);
    StpF1 den = StpSatF1(StpF1_(1.0 / STP_SAT) - peak * StpF1_(1.0 / STP_SAT));
    // The divisor is floored at 1/16384, so the output scale factor never exceeds 16384.
    // Using 32768.0 causes problems in Unity with bloom on at least some platforms.
    StpF1 gain = StpRcpF1(max(StpF1_(1.0 / 16384.0), den));
    x *= StpF3_(gain); }
  1516. //------------------------------------------------------------------------------------------------------------------------------
  // Currently unused, kept for reference.
  // Converts LDR RGB to gamma 2.0 RGB {0 to 1} for 8-bit storage, with temporal dither selecting between the
  // two neighbouring 8-bit steps.
  // Reference (unoptimized) form:
  //   StpF3 n = sqrt(c);
  //   n = floor(n * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
  //   StpF3 a = n * n;
  //   StpF3 b = n + StpF3_(1.0 / 255.0); b = b * b;
  //   // Ratio of 'a' to 'b' required to produce 'c'.
  //   StpF3 r = (c - b) * StpRcpF3(a - b);
  //   // Use the ratio as a cutoff to choose 'a' or 'b'.
  //   c = StpSatF3(n + StpGtZeroF3(StpF3_(dit) - r) * StpF3_(1.0 / 255.0));
  // The version below removes the divide (optimized from 57 to 42 clks on GCN).
  StpF3 StpRgbGamDit8F3(StpF3 c, StpF1 dit) {
    // Lower 8-bit step in gamma space.
    StpF3 lo = floor(sqrt(c) * StpF3_(255.0)) * StpF3_(1.0 / 255.0);
    // Linear values of the lower and upper steps.
    StpF3 loLin = lo * lo;
    StpF3 hi = lo + StpF3_(1.0 / 255.0);
    StpF3 hiLin = hi * hi;
    // Step up where dit * (hiLin - loLin) exceeds (hiLin - c).
    StpF3 pick = StpGtZeroF3(StpF3_(dit) * (hiLin - loLin) - (hiLin - c));
    c = StpSatF3(lo + pick * StpF3_(1.0 / 255.0));
    return c; }
  1537. //------------------------------------------------------------------------------------------------------------------------------
  // Currently unused, kept for reference.
  // 10-bit variant of the dithered gamma 2.0 conversion, intended for feedback.
  StpF3 StpRgbGamDit10F3(StpF3 c, StpF1 dit) {
    // Lower 10-bit step in gamma space.
    StpF3 lo = floor(sqrt(c) * StpF3_(1023.0)) * StpF3_(1.0 / 1023.0);
    // Linear values of the lower and upper steps.
    StpF3 loLin = lo * lo;
    StpF3 hi = lo + StpF3_(1.0 / 1023.0);
    StpF3 hiLin = hi * hi;
    // Step up where dit * (hiLin - loLin) exceeds (hiLin - c).
    StpF3 pick = StpGtZeroF3(StpF3_(dit) * (hiLin - loLin) - (hiLin - c));
    c = StpSatF3(lo + pick * StpF3_(1.0 / 1023.0));
    return c; }
  1546. //------------------------------------------------------------------------------------------------------------------------------
  // Convert a feedback value back to color: square it (gamma 2.0 to linear), and when STP_POSTMAP is 0
  // also run the inverse tonemapper.
  void StpFeed2ClrF(inout StpF3 c) {
    c = c * c;
  #if (STP_POSTMAP == 0)
    StpToneInvF3(c.rgb);
  #endif
  }
  1554. #endif // defined(STP_GPU) && defined(STP_32BIT)
  1555. //==============================================================================================================================
  1556. #if defined(STP_GPU) && defined(STP_32BIT)
  // Scalar tonemap (StpMF1 precision): x becomes saturate(x / (STP_SAT + x)).
  void StpToneMF1(inout StpMF1 x) {
    StpMF1 rcpDen = StpRcpMF1(StpMF1_(STP_SAT) + x);
    x = StpSatMF1(x * StpMF1_(rcpDen)); }
  1558. //------------------------------------------------------------------------------------------------------------------------------
  // Reversible tonemapper (StpMF3 precision): every channel is divided by (STP_SAT + largest channel), then saturated.
  void StpToneMF3(inout StpMF3 x) {
    StpMF1 peak = StpMax3MF1(x.r, x.g, x.b);
    StpMF1 rcpDen = StpRcpMF1(StpMF1_(STP_SAT) + peak);
    x = StpSatMF3(x * StpMF3_(rcpDen)); }
  1562. //------------------------------------------------------------------------------------------------------------------------------
  // Inverse of StpToneMF3(): every channel is divided by (1 - largest channel) / STP_SAT.
  void StpToneInvMF3(inout StpMF3 x) {
    StpMF1 peak = StpMax3MF1(x.r, x.g, x.b);
    StpMF1 den = StpSatMF1(StpMF1_(1.0 / STP_SAT) - peak * StpMF1_(1.0 / STP_SAT));
    // The divisor is floored at 1/16384, so the output scale factor never exceeds 16384.
    StpMF1 gain = StpRcpMF1(max(StpMF1_(1.0 / 16384.0), den));
    x *= StpMF3_(gain); }
  1568. //------------------------------------------------------------------------------------------------------------------------------
  // Dithered conversion of LDR RGB to gamma 2.0 RGB {0 to 1} for 8-bit storage (StpMF3 precision).
  StpMF3 StpRgbGamDit8MF3(StpMF3 c, StpMF1 dit) {
    // Lower 8-bit step in gamma space.
    StpMF3 lo = floor(sqrt(c) * StpMF3_(255.0)) * StpMF3_(1.0 / 255.0);
    // Linear values of the lower and upper steps.
    StpMF3 loLin = lo * lo;
    StpMF3 hi = lo + StpMF3_(1.0 / 255.0);
    StpMF3 hiLin = hi * hi;
    // Step up where dit * (hiLin - loLin) exceeds (hiLin - c).
    StpMF3 pick = StpGtZeroMF3(StpMF3_(dit) * (hiLin - loLin) - (hiLin - c));
    c = StpSatMF3(lo + pick * StpMF3_(1.0 / 255.0));
    return c; }
  1575. //------------------------------------------------------------------------------------------------------------------------------
  // 10-bit variant of the dithered gamma 2.0 conversion (StpMF3 precision).
  StpMF3 StpRgbGamDit10MF3(StpMF3 c, StpMF1 dit) {
    // Lower 10-bit step in gamma space.
    StpMF3 lo = floor(sqrt(c) * StpMF3_(1023.0)) * StpMF3_(1.0 / 1023.0);
    // Linear values of the lower and upper steps.
    StpMF3 loLin = lo * lo;
    StpMF3 hi = lo + StpMF3_(1.0 / 1023.0);
    StpMF3 hiLin = hi * hi;
    // Step up where dit * (hiLin - loLin) exceeds (hiLin - c).
    StpMF3 pick = StpGtZeroMF3(StpMF3_(dit) * (hiLin - loLin) - (hiLin - c));
    c = StpSatMF3(lo + pick * StpMF3_(1.0 / 1023.0));
    return c; }
  1582. //------------------------------------------------------------------------------------------------------------------------------
  // Convert a feedback value back to color (StpMF3 precision): square it, and when STP_POSTMAP is 0
  // also run the inverse tonemapper.
  void StpFeed2ClrMF(inout StpMF3 c) {
    c = c * c;
  #if (STP_POSTMAP == 0)
    StpToneInvMF3(c.rgb);
  #endif
  }
  1589. #endif // defined(STP_GPU) && defined(STP_32BIT)
  1590. //==============================================================================================================================
  1591. #if defined(STP_GPU) && defined(STP_16BIT)
  // Scalar tonemap (16-bit path): x becomes saturate(x / (STP_SAT + x)).
  void StpToneH1(inout StpH1 x) {
    StpH1 rcpDen = StpRcpH1(StpH1_(STP_SAT) + x);
    x = StpSatH1(x * StpH1_(rcpDen)); }
  1593. //------------------------------------------------------------------------------------------------------------------------------
  // Reversible tonemapper (16-bit path): every channel is divided by (STP_SAT + largest channel), then saturated.
  void StpToneH3(inout StpH3 x) {
    StpH1 peak = StpMax3H1(x.r, x.g, x.b);
    StpH1 rcpDen = StpRcpH1(StpH1_(STP_SAT) + peak);
    x = StpSatH3(x * StpH3_(rcpDen)); }
  1597. //------------------------------------------------------------------------------------------------------------------------------
  // Inverse of StpToneH3(): every channel is divided by (1 - largest channel) / STP_SAT.
  void StpToneInvH3(inout StpH3 x) {
    StpH1 peak = StpMax3H1(x.r, x.g, x.b);
    StpH1 den = StpSatH1(StpH1_(1.0 / STP_SAT) - peak * StpH1_(1.0 / STP_SAT));
    // The divisor is floored at 1/16384, so the output scale factor never exceeds 16384.
    StpH1 gain = StpRcpH1(max(StpH1_(1.0 / 16384.0), den));
    x *= StpH3_(gain); }
  1602. //------------------------------------------------------------------------------------------------------------------------------
  // Dithered conversion of LDR RGB to gamma 2.0 RGB {0 to 1} for 8-bit storage (16-bit path).
  StpH3 StpRgbGamDit8H3(StpH3 c, StpH1 dit) {
    // Lower 8-bit step in gamma space.
    StpH3 lo = floor(sqrt(c) * StpH3_(255.0)) * StpH3_(1.0 / 255.0);
    // Linear values of the lower and upper steps.
    StpH3 loLin = lo * lo;
    StpH3 hi = lo + StpH3_(1.0 / 255.0);
    StpH3 hiLin = hi * hi;
    // Step up where dit * (hiLin - loLin) exceeds (hiLin - c).
    StpH3 pick = StpGtZeroH3(StpH3_(dit) * (hiLin - loLin) - (hiLin - c));
    c = StpSatH3(lo + pick * StpH3_(1.0 / 255.0));
    return c; }
  1609. //------------------------------------------------------------------------------------------------------------------------------
  // 10-bit variant of the dithered gamma 2.0 conversion (16-bit path).
  StpH3 StpRgbGamDit10H3(StpH3 c, StpH1 dit) {
    // Lower 10-bit step in gamma space.
    StpH3 lo = floor(sqrt(c) * StpH3_(1023.0)) * StpH3_(1.0 / 1023.0);
    // Linear values of the lower and upper steps.
    StpH3 loLin = lo * lo;
    StpH3 hi = lo + StpH3_(1.0 / 1023.0);
    StpH3 hiLin = hi * hi;
    // Step up where dit * (hiLin - loLin) exceeds (hiLin - c).
    StpH3 pick = StpGtZeroH3(StpH3_(dit) * (hiLin - loLin) - (hiLin - c));
    c = StpSatH3(lo + pick * StpH3_(1.0 / 1023.0));
    return c; }
  1616. //------------------------------------------------------------------------------------------------------------------------------
  // Convert a feedback value back to color (16-bit path): square it, and when STP_POSTMAP is 0
  // also run the inverse tonemapper.
  void StpFeed2ClrH(inout StpH3 c) {
    c = c * c;
  #if (STP_POSTMAP == 0)
    StpToneInvH3(c.rgb);
  #endif
  }
  1623. #endif // defined(STP_GPU) && defined(STP_16BIT)
  1624. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1625. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1626. //_____________________________________________________________.._______________________________________________________________
  1627. //==============================================================================================================================
  1628. // COLOR CONVERSION TOOLS
  1629. //------------------------------------------------------------------------------------------------------------------------------
  1630. // Some platforms do not have a hardware sRGB image store (requires manual conversion).
  1631. //==============================================================================================================================
  1632. #if defined(STP_GPU) && defined(STP_32BIT)
  // Linear to sRGB, per channel, without branching.
  // The result is the median of three values: the threshold (0.0031308 * 12.92), the linear segment (c * 12.92)
  // and the gamma segment (1.055 * c^(1/2.4) - 0.055). Below the threshold the median is the linear segment,
  // above it the gamma segment.
  // The previous form wrote this as clamp(threshold, linear, gamma), which passes min > max for most inputs.
  // That is undefined in GLSL, and a min(max(x, lo), hi) lowering returns the gamma segment for dark values.
  // The median is therefore built explicitly from min/max so every target produces the same curve.
  StpF3 StpLinearToSrgbF3(StpF3 c) {
    StpF3 j = StpF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpF2 k = StpF2(1.055, -0.055);
    StpF3 lin = c * j.yyy;
    StpF3 gam = pow(c, j.zzz) * k.xxx + k.yyy;
    // median(a, b, c) = max(min(a, b), min(max(a, b), c)).
    return max(min(j.xxx, lin), min(max(j.xxx, lin), gam)); }
  1636. //------------------------------------------------------------------------------------------------------------------------------
  // Linear to sRGB, per channel, without branching (StpMF3 precision).
  // The result is the median of three values: the threshold (0.0031308 * 12.92), the linear segment (c * 12.92)
  // and the gamma segment (1.055 * c^(1/2.4) - 0.055). Below the threshold the median is the linear segment,
  // above it the gamma segment.
  // The previous form wrote this as clamp(threshold, linear, gamma), which passes min > max for most inputs.
  // That is undefined in GLSL, and a min(max(x, lo), hi) lowering returns the gamma segment for dark values.
  // The median is therefore built explicitly from min/max so every target produces the same curve.
  StpMF3 StpLinearToSrgbMF3(StpMF3 c) {
    StpMF3 j = StpMF3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpMF2 k = StpMF2(1.055, -0.055);
    StpMF3 lin = c * j.yyy;
    StpMF3 gam = pow(c, j.zzz) * k.xxx + k.yyy;
    // median(a, b, c) = max(min(a, b), min(max(a, b), c)).
    return max(min(j.xxx, lin), min(max(j.xxx, lin), gam)); }
  1640. #endif // defined(STP_GPU) && defined(STP_32BIT)
  1641. //==============================================================================================================================
  1642. #if defined(STP_GPU) && defined(STP_16BIT)
  // Linear to sRGB, per channel, without branching (16-bit path).
  // The result is the median of three values: the threshold (0.0031308 * 12.92), the linear segment (c * 12.92)
  // and the gamma segment (1.055 * c^(1/2.4) - 0.055). Below the threshold the median is the linear segment,
  // above it the gamma segment.
  // The previous form wrote this as clamp(threshold, linear, gamma), which passes min > max for most inputs.
  // That is undefined in GLSL, and a min(max(x, lo), hi) lowering returns the gamma segment for dark values.
  // The median is therefore built explicitly from min/max so every target produces the same curve.
  StpH3 StpLinearToSrgbH3(StpH3 c) {
    StpH3 j = StpH3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); StpH2 k = StpH2(1.055, -0.055);
    StpH3 lin = c * j.yyy;
    StpH3 gam = pow(c, j.zzz) * k.xxx + k.yyy;
    // median(a, b, c) = max(min(a, b), min(max(a, b), c)).
    return max(min(j.xxx, lin), min(max(j.xxx, lin), gam)); }
  1646. #endif // defined(STP_GPU) && defined(STP_16BIT)
  1647. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1648. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1649. //_____________________________________________________________.._______________________________________________________________
  1650. //==============================================================================================================================
  1651. // DEBUG COMMON
  1652. //==============================================================================================================================
  1653. #if defined(STP_GPU) && STP_BUG
  // Debug hook, declared only when STP_BUG is non-zero. This is a forward declaration; no body appears in this
  // section, so it is presumably supplied by the including shader (TODO confirm).
  // 'p' is a three-component unsigned position and 'c' a four-component float value; their exact meaning is not
  // visible here.
  1654. void StpBugF(StpU3 p, StpF4 c);
  1655. #endif // defined(STP_GPU) && STP_BUG
  1656. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1657. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1658. //_____________________________________________________________.._______________________________________________________________
  1659. //==============================================================================================================================
  1660. // CONSTANT GENERATION
  1661. //==============================================================================================================================
  // Fills 'con0' for the dilation pass from the current image size.
  //   con0.xy = kRcpR (float stored through StpU1_F1): 4 / size of the current input image in pixels.
  //   con0.zw = kR (unsigned): size of the current input image in pixels, shifted right by 2 (size / 4).
  STP_STATIC void StpDilCon(
    // Generated constants.
    StpInOutU4 con0,
    // Current image resolution in pixels.
    StpInF2 imgC) {
    StpF1 sizeX = imgC[0];
    StpF1 sizeY = imgC[1];
    // StpU2 kR := size/4 of the current input image in pixels.
    // Used for pass merging (DIL and SAA), since convergence is 1/16 area of input, must check position.
    con0[2] = StpU1_(StpU1_(sizeX) >> StpU1_(2));
    con0[3] = StpU1_(StpU1_(sizeY) >> StpU1_(2));
    // StpF2 kRcpR := 4/size of current input image in pixels.
    con0[0] = StpU1_F1(StpF1_(4.0) / sizeX);
    con0[1] = StpU1_F1(StpF1_(4.0) / sizeY); }
  1674. //==============================================================================================================================
  // Generates the constant vectors con0..conC for the pattern pass (StpPatF).
  // Every float result is written through StpU1_F1() into an unsigned slot; StpPatF reads the slots back with
  // StpF*_U*() (presumably a bit-preserving reinterpretation — TODO confirm against the macro definitions).
  // Slot summary, as written below:
  //   con0 = {kRcpC.xy, kHalfRcpC.xy}          con1 = {kJitCRcpCUnjitPRcpP.xy, kJitCRcpC.xy}
  //   con2 = {kF.xy, kDepth.xy}                con3 = kOS
  //   con4 = {kUnDepth.xy, kMotionMatch, 0}    con5 = {kC.xy, kST.xy}
  //   con6..conC = k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR
  // NOTE(review): StpPatF in this file reads kST from conD.xy, while this function has no conD parameter and
  // stores kST in con5.zw — confirm how the caller bridges the two layouts.
  1675. STP_STATIC void StpPatCon(
  1676. // Generated constants.
  1677. StpInOutU4 con0,
  1678. StpInOutU4 con1,
  1679. StpInOutU4 con2,
  1680. StpInOutU4 con3,
  1681. StpInOutU4 con4,
  1682. StpInOutU4 con5,
  1683. StpInOutU4 con6,
  1684. StpInOutU4 con7,
  1685. StpInOutU4 con8,
  1686. StpInOutU4 con9,
  1687. StpInOutU4 conA,
  1688. StpInOutU4 conB,
  1689. StpInOutU4 conC,
  1690. // Linear depth near plane for log2 depth encoding.
  1691. StpF1 near,
  1692. // Linear depth far plane for log2 depth encoding.
  1693. StpF1 far,
  1694. // Frame count for current frame (sets jitter).
  1695. StpU1 frame,
  1696. // Current image resolution in pixels.
  1697. StpInF2 imgC,
  1698. // Prior image resolution in pixels.
  1699. StpInF2 imgP,
  1700. // Feedback (aka output) resolution in pixels.
  1701. StpInF2 imgF,
  1702. // Ratio of 'currentFrameTime/priorFrameTime'.
  1703. StpF1 motionMatch,
  1704. // Projection matrix data {a,b,c,d,e,f,g,h}.
  1705. // This is used to do static geometry forward projection.
  1706. // a 0 e 0
  1707. // 0 b f 0
  1708. // 0 0 c d
  1709. // 0 0 g h
  1710. // For reference, a DX ortho projection would be,
  1711. // a 0 e 0
  1712. // 0 b f 0
  1713. // 0 0 c d
  1714. // 0 0 0 1
  1715. // And a DX, left handed perspective projection would be,
  1716. // a 0 e 0
  1717. // 0 b f 0
  1718. // 0 0 c d ... c := F/(F-N), d := -(F*N)/(F-N)
  1719. // 0 0 1 0
  // Each projection is passed as two 4-vectors: ABEF = {a,b,e,f} and CDGH = {c,d,g,h}.
  1720. // Previous prior projection.
  1721. StpInF4 prjPrvABEF,
  1722. StpInF4 prjPrvCDGH,
  1723. // Prior projection.
  1724. StpInF4 prjPriABEF,
  1725. StpInF4 prjPriCDGH,
  1726. // Current projection (the difference enables changing zoom).
  1727. StpInF4 prjCurABEF,
  1728. StpInF4 prjCurCDGH,
  1729. // Forward viewspace transform.
  1730. // Transform prior 3D view position into current 3D view position.
  1731. // This is used to do static geometry forward projection.
  1732. // X := x*i + y*j +z*k +l
  1733. // Y := x*m + y*n +z*o +p
  1734. // Z := x*q + y*r +z*s +t
  1735. // W := 1
  1736. // i j k l
  1737. // m n o p
  1738. // q r s t
  1739. // 0 0 0 1
  1740. StpInF4 forIJKL,
  1741. StpInF4 forMNOP,
  1742. StpInF4 forQRST,
  1743. // Prior frame backward viewspace transform.
  1744. // Transform prior 3D view position into previous-prior 3D view position.
  1745. // This is used to 'fix' static geometry forward projection for dynamic motion.
  1746. // X := x*i + y*j +z*k +l
  1747. // Y := x*m + y*n +z*o +p
  1748. // Z := x*q + y*r +z*s +t
  1749. // W := 1
  1750. // i j k l
  1751. // m n o p
  1752. // q r s t
  1753. // 0 0 0 1
  1754. StpInF4 bckIJKL,
  1755. StpInF4 bckMNOP,
  1756. StpInF4 bckQRST) {
  1757. //------------------------------------------------------------------------------------------------------------------------------
  1758. // StpF2 kRcpC := 1.0 / size of current input image in pixels.
  1759. con0[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
  1760. con0[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
  1761. // StpF2 kHalfRcpC := 0.5 / size of current input image in pixels.
  1762. con0[2] = StpU1_F1(StpF1_(0.5) / imgC[0]);
  1763. con0[3] = StpU1_F1(StpF1_(0.5) / imgC[1]);
  1764. //------------------------------------------------------------------------------------------------------------------------------
  1765. // Grab jitter for current and prior frames.
  1766. StpVarF2 jitP;
  1767. StpVarF2 jitC;
  // NOTE(review): 'frame' is StpU1, so frame == 0 gives a wrapped value for the prior-frame index here —
  // confirm StpJit() tolerates that.
  1768. StpJit(jitP, frame - StpU1_(1));
  1769. StpJit(jitC, frame);
  1770. // StpF2 kJitCRcpCUnjitPRcpP := Map current into prior frame.
  1771. con1[0] = StpU1_F1(jitC[0] / imgC[0] - jitP[0] / imgP[0]);
  1772. con1[1] = StpU1_F1(jitC[1] / imgC[1] - jitP[1] / imgP[1]);
  1773. // StpF2 kJitCRcpC := Take {0 to 1} position in current image, and map back to {0 to 1} position in feedback (removes jitter).
  1774. con1[2] = StpU1_F1(jitC[0] / imgC[0]);
  1775. con1[3] = StpU1_F1(jitC[1] / imgC[1]);
  1776. //------------------------------------------------------------------------------------------------------------------------------
  1777. // StpF2 kF := size of feedback (aka output) in pixels.
  1778. con2[0] = StpU1_F1(imgF[0]);
  1779. con2[1] = StpU1_F1(imgF[1]);
  1780. // StpF2 kDepth := Copied logic from StpZCon().
  // k0 = 1/near, k1 = 1/log2(far/near).
  1781. StpF1 k0 = StpRcpF1(near);
  1782. StpF1 k1 = StpRcpF1(StpLog2F1(k0 * far));
  1783. con2[2] = StpU1_F1(k0);
  1784. con2[3] = StpU1_F1(k1);
  1785. //------------------------------------------------------------------------------------------------------------------------------
  1786. // StpF4 kOS := Scale and bias to check for out of bounds (and kill feedback).
  1787. // Scaled and biased output needs to {-1 out of bounds, >-1 in bounds, <1 in bounds, 1 out of bounds}.
  1788. StpVarF2 s;
  1789. // Undo 'pM' scaling, and multiply by 2 (as this needs to be -1 to 1 at edge of acceptable reprojection).
  1790. s[0] = StpF1_(2.0);
  1791. s[1] = StpF1_(2.0);
  1792. // Scaling to push outside safe reprojection over 1.
  1793. s[0] *= imgP[0] / (imgP[0] + StpF1_(4.0));
  1794. s[1] *= imgP[1] / (imgP[1] + StpF1_(4.0));
  1795. con3[0] = StpU1_F1(s[0]);
  1796. con3[1] = StpU1_F1(s[1]);
  1797. // Factor out subtracting off the mid point scaled by the multiply term.
  1798. con3[2] = StpU1_F1(StpF1_(-0.5) * s[0]);
  1799. con3[3] = StpU1_F1(StpF1_(-0.5) * s[1]);
  1800. //------------------------------------------------------------------------------------------------------------------------------
  1801. // StpF2 kUnDepth := Copied logic from StpZUnCon().
  1802. con4[0] = StpU1_F1(StpLog2F1(far * StpRcpF1(near)));
  1803. con4[1] = StpU1_F1(near);
  1804. // kMotionMatch
  1805. con4[2] = StpU1_F1(motionMatch);
  1806. // Unused for now.
  1807. con4[3] = StpU1_(0);
  1808. //------------------------------------------------------------------------------------------------------------------------------
  1809. // StpF2 kC := Size of current input image in pixels.
  1810. con5[0] = StpU1_F1(imgC[0]);
  1811. con5[1] = StpU1_F1(imgC[1]);
  1812. // kST
  // Backward transform row {q,r,s,t} combined with the previous-prior projection's {g,h} terms
  // (CDGH.z = g, CDGH.w = h): kS = s*g, kT = t*g + h.
  1813. con5[2] = StpU1_F1(bckQRST.z * prjPrvCDGH.z);
  1814. con5[3] = StpU1_F1(bckQRST.w * prjPrvCDGH.z + prjPrvCDGH.w);
  1815. //------------------------------------------------------------------------------------------------------------------------------
  1816. // See header docs in "STATIC GEOMETRY MOTION FORWARD PROJECTION".
  // k0..k5 use only the prior projection; k6..kJ combine the forward transform with the current projection;
  // kI..kR combine the backward transform with the previous-prior projection.
  1817. // k0123
  1818. con6[0] = StpU1_F1(prjPriCDGH.z / prjPriABEF.x);
  1819. con6[1] = StpU1_F1(prjPriCDGH.w / prjPriABEF.x);
  1820. con6[2] = StpU1_F1(prjPriABEF.z / prjPriABEF.x);
  1821. con6[3] = StpU1_F1(prjPriCDGH.z / prjPriABEF.y);
  1822. // k4567
  1823. con7[0] = StpU1_F1(prjPriCDGH.w / prjPriABEF.y);
  1824. con7[1] = StpU1_F1(prjPriABEF.w / prjPriABEF.y);
  1825. con7[2] = StpU1_F1(forIJKL.x * prjCurABEF.x + forQRST.x * prjCurABEF.z);
  1826. con7[3] = StpU1_F1(forIJKL.y * prjCurABEF.x + forQRST.y * prjCurABEF.z);
  1827. // k89AB
  1828. con8[0] = StpU1_F1(forIJKL.z * prjCurABEF.x + forQRST.z * prjCurABEF.z);
  1829. con8[1] = StpU1_F1(forIJKL.w * prjCurABEF.x + forQRST.w * prjCurABEF.z);
  1830. con8[2] = StpU1_F1(forMNOP.x * prjCurABEF.y + forQRST.x * prjCurABEF.w);
  1831. con8[3] = StpU1_F1(forMNOP.y * prjCurABEF.y + forQRST.y * prjCurABEF.w);
  1832. // kCDEF
  1833. con9[0] = StpU1_F1(forMNOP.z * prjCurABEF.y + forQRST.z * prjCurABEF.w);
  1834. con9[1] = StpU1_F1(forMNOP.w * prjCurABEF.y + forQRST.w * prjCurABEF.w);
  1835. con9[2] = StpU1_F1(forQRST.x * prjCurCDGH.z);
  1836. con9[3] = StpU1_F1(forQRST.y * prjCurCDGH.z);
  1837. // kGHIJ
  1838. conA[0] = StpU1_F1(forQRST.z * prjCurCDGH.z);
  1839. conA[1] = StpU1_F1(forQRST.w * prjCurCDGH.z + prjCurCDGH.w);
  1840. conA[2] = StpU1_F1(bckIJKL.x * prjPrvABEF.x + bckQRST.x * prjPrvABEF.z);
  1841. conA[3] = StpU1_F1(bckIJKL.y * prjPrvABEF.x + bckQRST.y * prjPrvABEF.z);
  1842. // kKLMN
  1843. conB[0] = StpU1_F1(bckIJKL.z * prjPrvABEF.x + bckQRST.z * prjPrvABEF.z);
  1844. conB[1] = StpU1_F1(bckIJKL.w * prjPrvABEF.x + bckQRST.w * prjPrvABEF.z);
  1845. conB[2] = StpU1_F1(bckMNOP.x * prjPrvABEF.y + bckQRST.x * prjPrvABEF.w);
  1846. conB[3] = StpU1_F1(bckMNOP.y * prjPrvABEF.y + bckQRST.y * prjPrvABEF.w);
  1847. // kOPQR
  1848. conC[0] = StpU1_F1(bckMNOP.z * prjPrvABEF.y + bckQRST.z * prjPrvABEF.w);
  1849. conC[1] = StpU1_F1(bckMNOP.w * prjPrvABEF.y + bckQRST.w * prjPrvABEF.w);
  1850. conC[2] = StpU1_F1(bckQRST.x * prjPrvCDGH.z);
  1851. conC[3] = StpU1_F1(bckQRST.y * prjPrvCDGH.z);}
  1852. //==============================================================================================================================
  // Generates the constant vectors con0..con3 for the TAA pass. Every value is a float written through StpU1_F1().
  // Slot summary, as written below:
  //   con0 = {kCRcpF.xy, kHalfCRcpFUnjitC.xy}   con1 = {kRcpC.xy, kRcpF.xy}
  //   con2 = {kHalfRcpF.xy, kJitCRcpC0.xy}      con3 = {kHalfRcpC.xy, kF.xy}
  // NOTE(review): the 'grain' parameter is not read anywhere in this function body.
  1853. STP_STATIC void StpTaaCon(
  1854. // Generated constants.
  1855. StpInOutU4 con0,
  1856. StpInOutU4 con1,
  1857. StpInOutU4 con2,
  1858. StpInOutU4 con3,
  1859. // Amount of grain {0 = maximum, >0 is amount of stops less of grain}.
  1860. StpF1 grain,
  1861. // Frame count for current frame (sets jitter).
  1862. StpU1 frame,
  1863. // Current image resolution in pixels.
  1864. StpInF2 imgC,
  1865. // Feedback (aka output) resolution in pixels.
  1866. StpInF2 imgF) {
  1867. //------------------------------------------------------------------------------------------------------------------------------
  1868. // Grab jitter for current frame.
  1869. StpVarF2 jitC;
  1870. StpJit(jitC, frame);
  1871. //------------------------------------------------------------------------------------------------------------------------------
  1872. // Conversion from integer pix position to center pix float pixel position in image for current input.
  1873. // xy := multiply term (M) --- Scale by 1/imgF to get to {0 to 1}.
  1874. // zw := addition term (A) --- Add 0.5*M to get to center of pixel, then subtract jitC to undo jitter.
  // NOTE(review): the multiply term stored below is imgC/imgF (a feedback-pixel to input-pixel scale), not
  // 1/imgF as the comment above says — confirm which description is intended.
  1875. // StpF2 kCRcpF.
  1876. con0[0] = StpU1_F1(imgC[0] / imgF[0]);
  1877. con0[1] = StpU1_F1(imgC[1] / imgF[1]);
  1878. // StpF2 kHalfCRcpFUnjitC.
  1879. con0[2] = StpU1_F1(StpF1_(0.5) * imgC[0] / imgF[0] - jitC[0]);
  1880. con0[3] = StpU1_F1(StpF1_(0.5) * imgC[1] / imgF[1] - jitC[1]);
  1881. //------------------------------------------------------------------------------------------------------------------------------
  1882. // StpF2 kRcpC := 1/size of current input image in pixels.
  1883. con1[0] = StpU1_F1(StpF1_(1.0) / imgC[0]);
  1884. con1[1] = StpU1_F1(StpF1_(1.0) / imgC[1]);
  1885. //------------------------------------------------------------------------------------------------------------------------------
  1886. // StpF2 kRcpF := 1/size of feedback image (aka output) in pixels.
  1887. con1[2] = StpU1_F1(StpF1_(1.0) / imgF[0]);
  1888. con1[3] = StpU1_F1(StpF1_(1.0) / imgF[1]);
  1889. //------------------------------------------------------------------------------------------------------------------------------
  1890. // StpF2 kHalfRcpF := 0.5/size of feedback image (aka output) in pixels.
  1891. con2[0] = StpU1_F1(StpF1_(0.5) / imgF[0]);
  1892. con2[1] = StpU1_F1(StpF1_(0.5) / imgF[1]);
  1893. //------------------------------------------------------------------------------------------------------------------------------
  1894. // Conversion from a {0 to 1} position in current input to feedback.
  1895. // StpH3 kJitCRcpC0 := jitC / image size in pixels + {-0.5/size, +0.5/size} of current input image in pixels.
  // The half-pixel offset is subtracted for x and added for y, matching the {-0.5, +0.5} pair above.
  1896. con2[2] = StpU1_F1(jitC[0] / imgC[0] - StpF1_(0.5) / imgC[0]);
  1897. con2[3] = StpU1_F1(jitC[1] / imgC[1] + StpF1_(0.5) / imgC[1]);
  1898. //------------------------------------------------------------------------------------------------------------------------------
  1899. // StpF2 kHalfRcpC := 0.5/size of current input image in pixels.
  1900. con3[0] = StpU1_F1(StpF1_(0.5) / imgC[0]);
  1901. con3[1] = StpU1_F1(StpF1_(0.5) / imgC[1]);
  1902. //------------------------------------------------------------------------------------------------------------------------------
  1903. // StpF2 kF := size of feedback image in pixels.
  1904. con3[2] = StpU1_F1(imgF[0]);
  1905. con3[3] = StpU1_F1(imgF[1]); }
  1906. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1907. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1908. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1909. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  1910. //_____________________________________________________________.._______________________________________________________________
  1911. //==============================================================================================================================
  1912. //
  1913. // PATTERN ENTRY POINT
  1914. //
  1915. //==============================================================================================================================
  1916. // See the packed 16-bit version for comments.
  1917. #if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
  // Forward declarations used by the 32-bit pattern pass (StpPatF). No bodies appear in this section, so they are
  // presumably provided by the including shader (TODO confirm).
  // Naming as seen from the call sites in StpPatF: 'Dat' functions take the integer pixel position, 'Pri'
  // functions take a float {0 to 1} position, and 'St' functions take a position plus the value to write.
  // 4x4 max / sum helpers taking a lane index.
  1918. void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b);
  1919. void StpPat4x4SumF4(StpMU1 i, inout StpF4 a);
  1920. //------------------------------------------------------------------------------------------------------------------------------
  // Result is stored in 'cnvPrev' by StpPatF.
  1921. StpMF1 StpPatPriConF(StpF2 p);
  1922. //------------------------------------------------------------------------------------------------------------------------------
  // Per-pixel inputs: motion, color, depth (plus a fix-up), and an unsigned 'R' value (plus a fix-up).
  1923. StpF2 StpPatDatMotF(StpMU2 o);
  1924. StpMF3 StpPatDatColF(StpMU2 o);
  1925. StpF1 StpPatDatZF(StpMU2 o);
  1926. StpF1 StpPatFixZF(StpF1 z);
  1927. StpU1 StpPatDatRF(StpMU2 o);
  1928. StpMF1 StpPatFixRF(StpU1 v);
  1929. //------------------------------------------------------------------------------------------------------------------------------
  // Per-pixel dither value.
  1930. StpMF1 StpPatDitF(StpMU2 o);
  1931. //------------------------------------------------------------------------------------------------------------------------------
  // Feedback fetches: one four-component result, and R4/G4/B4 variants.
  1932. StpMF4 StpPatPriFedF(StpF2 p);
  1933. StpMF4 StpPatPriFedR4F(StpF2 p);
  1934. StpMF4 StpPatPriFedG4F(StpF2 p);
  1935. StpMF4 StpPatPriFedB4F(StpF2 p);
  1936. //------------------------------------------------------------------------------------------------------------------------------
  // Result is stored in 'lum2' by StpPatF.
  1937. StpMF2 StpPatPriLumF(StpF2 p);
  1938. //------------------------------------------------------------------------------------------------------------------------------
  // Packed motion fetches. The 'Min' variants exist only when STP_MAX_MIN_UINT is set and the offset ('O')
  // variants only when STP_OFFSETS is set.
  1939. StpU4 StpPatPriMot4F(StpF2 p);
  1940. #if STP_MAX_MIN_UINT
  1941. StpU1 StpPatPriMotMinF(StpF2 p);
  1942. #endif // STP_MAX_MIN_UINT
  1943. #if STP_OFFSETS
  1944. StpU4 StpPatPriMot4OF(StpF2 p, StpI2 o);
  1945. #if STP_MAX_MIN_UINT
  1946. StpU1 StpPatPriMotMinOF(StpF2 p, StpI2 o);
  1947. #endif // STP_MAX_MIN_UINT
  1948. #endif // STP_OFFSETS
  1949. //------------------------------------------------------------------------------------------------------------------------------
  // Output stores: packed motion, color, luma pair, and convergence.
  1950. void StpPatStMotF(StpMU2 p, StpU1 v);
  1951. void StpPatStColF(StpMU2 p, StpMF4 v);
  1952. void StpPatStLumF(StpMU2 p, StpMF2 v);
  1953. void StpPatStCnvF(StpMU2 p, StpMF1 v);
  1954. //==============================================================================================================================
  1955. void StpPatF(
  1956. StpMU1 lane,
  1957. StpMU2 pp,
  1958. StpU4 con0,
  1959. StpU4 con1,
  1960. StpU4 con2,
  1961. StpU4 con3,
  1962. StpU4 con4,
  1963. StpU4 con5,
  1964. StpU4 con6,
  1965. StpU4 con7,
  1966. StpU4 con8,
  1967. StpU4 con9,
  1968. StpU4 conA,
  1969. StpU4 conB,
  1970. StpU4 conC,
  1971. StpU4 conD) {
  1972. //------------------------------------------------------------------------------------------------------------------------------
  1973. StpMF4 rC;
  1974. StpU1 rM;
  1975. StpMF2 rL;
  1976. StpMF1 rCnv;
  1977. //------------------------------------------------------------------------------------------------------------------------------
  1978. StpF2 kRcpC = StpF2_U2(con0.xy);
  1979. StpF2 kHalfRcpC = StpF2_U2(con0.zw);
  1980. StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
  1981. StpF2 kJitCRcpC = StpF2_U2(con1.zw);
  1982. StpF2 kF = StpF2_U2(con2.xy);
  1983. StpF4 kOS = StpF4_U4(con3);
  1984. StpF2 kDepth = StpF2_U2(con2.zw);
  1985. StpF2 kUnDepth = StpF2_U2(con4.xy);
  1986. StpF1 kMotionMatch = StpF1_U1(con4.z);
  1987. StpF2 kC = StpF2_U2(con5.xy);
  1988. StpF4 k0123 = StpF4_U4(con6);
  1989. StpF4 k4567 = StpF4_U4(con7);
  1990. StpF4 k89AB = StpF4_U4(con8);
  1991. StpF4 kCDEF = StpF4_U4(con9);
  1992. StpF4 kGHIJ = StpF4_U4(conA);
  1993. StpF4 kKLMN = StpF4_U4(conB);
  1994. StpF4 kOPQR = StpF4_U4(conC);
  1995. StpF2 kST = StpF2_U2(conD.xy);
  1996. //------------------------------------------------------------------------------------------------------------------------------
  1997. StpF2 m = StpPatDatMotF(pp);
  1998. StpMF1 d = StpPatDitF(pp);
  1999. StpF1 zPre = StpPatDatZF(pp);
  2000. StpMF3 c = StpPatDatColF(pp);
  2001. //==============================================================================================================================
  2002. // DEPENDENT INLINE INPUT MOTION
  2003. //==============================================================================================================================
  2004. StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
  2005. //------------------------------------------------------------------------------------------------------------------------------
  2006. // Check the streaming bandwidth limit.
  2007. #if STP_BUG_BW_SOL
  2008. { StpMF2 lum2 = StpPatPriLumF(p);
  2009. StpMF1 cnvPrev = StpPatPriConF(p);
  2010. StpU4 mZVP4 = StpPatPriMot4F(p);
  2011. StpU1 rPre = StpPatDatRF(p);
  2012. StpMF3 f = StpPatPriFedF(p).rgb;
  2013. StpF1 z = StpPatFixZF(zPre);
  2014. StpMF1 r = StpPatFixRF(rPre);
  2015. rC.rgb = StpMF3_(m.x) + StpMF3_(d.x) + c + StpMF3_(lum2.x) + StpMF3_(cnvPrev) + StpMF3(mZVP4.xyz) + f + StpMF3_(z+r);
  2016. rC.a = StpMF1_(0.0);
  2017. rL = rC.rg;
  2018. rM = StpU1_(rC.r);
  2019. rCnv = rC.r;
  2020. StpPatStMotF(pp, rM);
  2021. StpPatStLumF(pp, rL);
  2022. StpPatStColF(pp, rC);
  2023. StpPatStCnvF(pp, rCnv);
  2024. return; }
  2025. #endif // STP_BUG_BW_SOL
  2026. //------------------------------------------------------------------------------------------------------------------------------
  2027. StpF2 pM = (p - m);
  2028. StpF2 pF = pM + kJitCRcpC;
  2029. pM = pM + kJitCRcpCUnjitPRcpP;
  2030. //------------------------------------------------------------------------------------------------------------------------------
  2031. StpMF2 lum2 = StpPatPriLumF(pM);
  2032. //------------------------------------------------------------------------------------------------------------------------------
  2033. StpMF1 cnvPrev = StpPatPriConF(pM);
  2034. //------------------------------------------------------------------------------------------------------------------------------
  2035. #if (STP_SAFE_DILATE == 2)
  2036. #if STP_MAX_MIN_UINT
  2037. StpU4 mZVP4;
  2038. #if STP_OFFSETS
  2039. mZVP4.x = StpPatPriMotMinOF(pM, StpI2(-1, -1));
  2040. mZVP4.y = StpPatPriMotMinOF(pM, StpI2( 1, -1));
  2041. mZVP4.z = StpPatPriMotMinOF(pM, StpI2(-1, 1));
  2042. mZVP4.w = StpPatPriMotMinOF(pM, StpI2( 1, 1));
  2043. #else // STP_OFFSETS
  2044. mZVP4.x = StpPatPriMotMinF(pM + StpF2(-kRcpC.x, -kRcpC.y));
  2045. mZVP4.y = StpPatPriMotMinF(pM + StpF2( kRcpC.x, -kRcpC.y));
  2046. mZVP4.z = StpPatPriMotMinF(pM + StpF2(-kRcpC.x, kRcpC.y));
  2047. mZVP4.w = StpPatPriMotMinF(pM + StpF2( kRcpC.x, kRcpC.y));
  2048. #endif // STP_OFFSETS
  2049. #else // STP_MAX_MIN_UINT
  2050. #if STP_OFFSETS
  2051. StpU4 mZVP4_0 = StpPatPriMot4OF(pM, StpI2(-1, -1));
  2052. StpU4 mZVP4_1 = StpPatPriMot4OF(pM, StpI2( 1, -1));
  2053. StpU4 mZVP4_2 = StpPatPriMot4OF(pM, StpI2(-1, 1));
  2054. StpU4 mZVP4_3 = StpPatPriMot4OF(pM, StpI2( 1, 1));
  2055. #else // STP_OFFSETS
  2056. StpU4 mZVP4_0 = StpPatPriMot4F(pM + StpF2(-kRcpC.x, -kRcpC.y));
  2057. StpU4 mZVP4_1 = StpPatPriMot4F(pM + StpF2( kRcpC.x, -kRcpC.y));
  2058. StpU4 mZVP4_2 = StpPatPriMot4F(pM + StpF2(-kRcpC.x, kRcpC.y));
  2059. StpU4 mZVP4_3 = StpPatPriMot4F(pM + StpF2( kRcpC.x, kRcpC.y));
  2060. #endif // STP_OFFSETS
  2061. #endif // STP_MAX_MIN_UINT
  2062. #else // (STP_SAFE_DILATE == 2)
  2063. StpU1 mZVPN;
  2064. StpU4 mZVP2a = StpPatPriMot4F(pM - kHalfRcpC);
  2065. StpU4 mZVP2b = StpPatPriMot4F(pM + kHalfRcpC);
  2066. #if STP_MAX_MIN_UINT
  2067. mZVPN = StpPatPriMotMinF(pM);
  2068. #else // STP_MAX_MIN_UINT
  2069. StpU4 mZVP4 = StpPatPriMot4F(pM);
  2070. #endif // STP_MAX_MIN_UINT
  2071. #endif // (STP_SAFE_DILATE == 2)
  2072. //------------------------------------------------------------------------------------------------------------------------------
  2073. StpU1 rPre = StpPatDatRF(pp);
  2074. //------------------------------------------------------------------------------------------------------------------------------
  2075. StpMF4 f4R = StpPatPriFedR4F(pF);
  2076. StpMF4 f4G = StpPatPriFedG4F(pF);
  2077. StpMF4 f4B = StpPatPriFedB4F(pF);
  2078. StpMF3 f = StpPatPriFedF(pF).rgb;
  2079. //==============================================================================================================================
  2080. // DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
  2081. //==============================================================================================================================
  2082. StpF1 dd = StpF1_(d);
  2083. StpF1 z = StpPatFixZF(zPre);
  2084. z = StpZPack(z, kDepth, dd);
  2085. rM = StpMvPack(z, m, dd);
  2086. StpPatStMotF(pp, rM);
  2087. //------------------------------------------------------------------------------------------------------------------------------
  2088. #if STP_BUG
  2089. // Pattern/Clipped Input Color
  2090. { StpF4 bug = StpF4_(0.0);
  2091. bug.rgb = sqrt(StpF3(c.rgb));
  2092. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2093. StpBugF(StpU3(pp, 0), bug); }
  2094. //------------------------------------------------------------------------------------------------------------------------------
  2095. // Pattern/Log Input Depth
  2096. { StpF4 bug = StpF4_(0.0);
  2097. bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2098. StpBugF(StpU3(pp, 1), bug); }
  2099. #endif // STP_BUG
  2100. //------------------------------------------------------------------------------------------------------------------------------
  2101. #if (STP_POSTMAP == 0)
  2102. StpToneMF3(c);
  2103. #endif // (STP_POSTMAP == 0)
  2104. //------------------------------------------------------------------------------------------------------------------------------
  2105. #if STP_BUG
  2106. // Pattern/Reversible Tonemapped Input Color
  2107. { StpF4 bug = StpF4_(0.0);
  2108. bug.rgb = sqrt(StpF3(c.rgb));
  2109. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2110. StpBugF(StpU3(pp, 2), bug); }
  2111. #endif // STP_BUG
  2112. //------------------------------------------------------------------------------------------------------------------------------
  2113. c = sqrt(c);
  2114. rC.rgb = StpSatMF3(c + StpMF3_(d * StpMF1(1.0 / 1023.0) + StpMF1(-0.5 / 1023.0)));
  2115. //------------------------------------------------------------------------------------------------------------------------------
  2116. rL.x = dot(c, StpMF3(STP_LUMA));
  2117. rL.y = lum2.x;
  2118. StpPatStLumF(pp, rL);
  2119. //------------------------------------------------------------------------------------------------------------------------------
  2120. #if STP_BUG
  2121. // Pattern/Shaped Absolute Input Motion
  2122. { StpF4 bug = StpF4_(0.0);
  2123. bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
  2124. bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
  2125. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2126. StpBugF(StpU3(pp, 3), bug); }
  2127. #endif // STP_BUG
  2128. //------------------------------------------------------------------------------------------------------------------------------
  2129. StpMF1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
  2130. moire *= StpMF1_(STP_PAT_DEMOIRE);
  2131. //------------------------------------------------------------------------------------------------------------------------------
  2132. StpMF4 xnyRG = StpMF4(c.r, -c.r, c.g, -c.g);
  2133. StpMF4 xnyBC = StpMF4(c.b, -c.b, -cnvPrev, -cnvPrev);
  2134. #if defined(STP_16BIT)
  2135. #else // defined(STP_16BIT)
  2136. // We convert to full precision floats here since the reductions work on 32-bit values.
  2137. StpF4 xnyRGF = StpF4(xnyRG);
  2138. StpF4 xnyBCF = StpF4(xnyBC);
  2139. StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
  2140. xnyRG = StpMF4(xnyRGF);
  2141. xnyBC = StpMF4(xnyBCF);
  2142. #endif // defined(STP_16BIT)
  2143. cnvPrev = -xnyBC.z;
  2144. StpMF3 ne = max(StpMF3_(STP_PAT_NE_MIN) * StpMF3(xnyRG.x, xnyRG.z, xnyBC.x),
  2145. StpMF3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
  2146. StpMF1 ne1 = dot(ne, StpMF3(STP_LUMA));
  2147. //------------------------------------------------------------------------------------------------------------------------------
  2148. cnvPrev = StpSatMF1(cnvPrev + StpMF1_(1.0 / STP_FRAME_MAX));
  2149. //------------------------------------------------------------------------------------------------------------------------------
  2150. StpF2 onXY = StpF2(pM.xy);
  2151. onXY = onXY * kOS.xy + kOS.zw;
  2152. StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
  2153. //------------------------------------------------------------------------------------------------------------------------------
  2154. #if STP_BUG
  2155. // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
  2156. { StpF4 bug = StpF4_(0.0);
  2157. bug.g = StpF1_(abs(rL.x - lum2.x));
  2158. bug.r = StpF1_(abs(lum2.x - lum2.y));
  2159. bug.b = StpF1_(1.0) - StpF1_(onS);
  2160. bug.rg = sqrt(bug.rg);
  2161. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2162. StpBugF(StpU3(pp, 4), bug); }
  2163. #endif // STP_BUG
  2164. //==============================================================================================================================
  2165. // DEPENDENT ON PRIOR {Z, MOTION}
  2166. //==============================================================================================================================
  2167. #if (STP_SAFE_DILATE == 2)
  2168. #if (STP_MAX_MIN_UINT == 0)
  2169. StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
  2170. #endif // (STP_MAX_MIN_UINT == 0)
  2171. StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
  2172. #else // (STP_SAFE_DILATE == 2)
  2173. #if (STP_MAX_MIN_UINT == 0)
  2174. mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
  2175. #endif // (STP_MAX_MIN_UINT == 0)
  2176. #if STP_SAFE_DILATE
  2177. mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
  2178. #endif // STP_SAFE_DILATE
  2179. #endif // (STP_SAFE_DILATE == 2)
  2180. //------------------------------------------------------------------------------------------------------------------------------
  2181. StpF2 mPN;
  2182. StpF1 mZPN;
  2183. StpMvUnpack(mZPN, mPN, mZVPN);
  2184. //------------------------------------------------------------------------------------------------------------------------------
  2185. StpF2 mE;
  2186. mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
  2187. mE = mE * mE - abs(m);
  2188. //------------------------------------------------------------------------------------------------------------------------------
  2189. StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
  2190. StpF2 bugF; StpF2 bugD;
  2191. StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
  2192. sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
  2193. StpMF1 sgD = StpMF1(dot(sgM, sgM));
  2194. //------------------------------------------------------------------------------------------------------------------------------
  2195. StpMF1 match = StpMF1_(1.0) - StpSatMF1(sgD * StpMF1_(STP_PAT_MOT_AMP) - StpMF1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
  2196. match *= StpMF1_(onS);
  2197. rC.a = match;
  2198. StpPatStColF(pp, rC);
  2199. //------------------------------------------------------------------------------------------------------------------------------
  2200. moire = moire * match + StpMF1_(1.0 / 8192.0);
  2201. moire = min(StpMF1_(1.0), ne1 * StpRcpMF1(moire));
  2202. //------------------------------------------------------------------------------------------------------------------------------
  2203. StpMF1 tS = moire;
  2204. StpMF1 r = StpPatFixRF(rPre);
  2205. tS = tS * (StpMF1_(STP_PAT_RESPONSIVE) - r * StpMF1_(STP_PAT_RESPONSIVE)) + tS;
  2206. //------------------------------------------------------------------------------------------------------------------------------
  2207. #if STP_BUG
  2208. // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
  2209. { StpF4 bug = StpF4_(0.0);
  2210. bug.g = StpF1_(1.0) - StpF1(match);
  2211. bug.r = StpF1_(1.0) - StpF1(r);
  2212. bug.b = StpF1_(rL.x);
  2213. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2214. StpBugF(StpU3(pp, 5), bug); }
  2215. #endif // STP_BUG
  2216. //==============================================================================================================================
  2217. // DEPENDENT ON FEEDBACK
  2218. //==============================================================================================================================
  2219. StpMF4 t;
  2220. t.rgb = c - f;
  2221. t.a = dot(abs(t.rgb), StpMF3(STP_LUMA));
  2222. StpMF4 t4R = f4R - StpMF4_(c.r);
  2223. StpMF4 t4G = f4G - StpMF4_(c.g);
  2224. StpMF4 t4B = f4B - StpMF4_(c.b);
  2225. StpMF4 t4A = abs(t4R) * StpMF4_(STP_LUMA_R) + abs(t4G) * StpMF4_(STP_LUMA_G) + abs(t4B) * StpMF4_(STP_LUMA_B);
  2226. t.a = StpMin3MF1(t.a, t4A.x, StpMin3MF1(t4A.y, t4A.z, t4A.w));
  2227. if(t.a == t4A.x) t.rgb = StpMF3(t4R.x, t4G.x, t4B.x);
  2228. if(t.a == t4A.y) t.rgb = StpMF3(t4R.y, t4G.y, t4B.y);
  2229. if(t.a == t4A.z) t.rgb = StpMF3(t4R.z, t4G.z, t4B.z);
  2230. if(t.a == t4A.w) t.rgb = StpMF3(t4R.w, t4G.w, t4B.w);
  2231. //------------------------------------------------------------------------------------------------------------------------------
  2232. t.rgb *= StpMF3_(tS);
  2233. //------------------------------------------------------------------------------------------------------------------------------
  2234. #if defined(STP_16BIT)
  2235. StpPat4x4SumH4(lane, t);
  2236. #else // defined(STP_16BIT)
  2237. // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
  2238. StpF4 tF = StpF4(t);
  2239. StpPat4x4SumF4(lane, tF);
  2240. t = StpMF4(tF);
  2241. #endif // defined(STP_16BIT)
  2242. t.rgb *= StpMF3_(STP_PAT_SENSITIVITY);
  2243. //------------------------------------------------------------------------------------------------------------------------------
  2244. StpMF3 bln3 = StpSatMF3(ne * StpRcpMF3(abs(t.rgb)));
  2245. StpMF1 bln = StpMin3MF1(bln3.r, bln3.g, bln3.b);
  2246. //------------------------------------------------------------------------------------------------------------------------------
  2247. StpMF1 cnv = StpSatMF1(bln * StpRcpMF1(StpMF1_(STP_FRAME_MAX) - StpMF1_(STP_FRAME_MAX) * bln));
  2248. //------------------------------------------------------------------------------------------------------------------------------
  2249. cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
  2250. rCnv = min(cnv, cnvPrev);
  2251. StpPatStCnvF(pp, rCnv); }
  2252. #endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_PAT)
  2253. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2254. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2255. //_____________________________________________________________.._______________________________________________________________
  2256. //==============================================================================================================================
  2257. // 16-BIT PATH
  2258. //==============================================================================================================================
  2259. // This 16-bit version carries the explanatory comments; the 32-bit path above mirrors it without them.
  2260. #if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
  2261. // Callback prototypes used by StpPatH() below (declarations only in this section). 4x4 wave op: 8 component maximum; 'a' and 'b' are 'inout' and StpPatH() reads them back after the call, passing its 'lane' argument as 'i'.
  2262. void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b);
  2263. // 4x4 wave op: 4 component sum; 'a' is 'inout' (updated in place).
  2264. void StpPat4x4SumH4(StpW1 i, inout StpH4 a);
  2265. //------------------------------------------------------------------------------------------------------------------------------
  2266. // Sample bilinear interpolated clamp to edge prior convergence (StpPatH() samples it at the reprojected position 'pM').
  2267. StpH1 StpPatPriConH(StpF2 p);
  2268. //------------------------------------------------------------------------------------------------------------------------------
  2269. // Note this is still designed to be an inline function pass merged to avoid DRAM traffic.
  2270. // So in an ideal world (with better merging with pre-scale post) these would be already in registers.
  2271. // But when PAT pass is non-inline, these callbacks are placed in the right order for loads.
  2272. // Input motion, 'position - motion' is the reprojected position, where {0 to 1} is range of the screen.
  2273. StpF2 StpPatDatMotH(StpW2 o);
  2274. // Input color, this is linear HDR or post-tonemap-linear depending on STP_POSTMAP.
  2275. StpH3 StpPatDatColH(StpW2 o);
  2276. StpF1 StpPatDatZH(StpW2 o); // Depth fetch for pixel 'o'; StpPatH() passes the result through StpPatFixZH() before using it.
  2277. // Input depth, this is linear {0:near to INF:far} ranged.
  2278. StpF1 StpPatFixZH(StpF1 z);
  2279. StpU1 StpPatDatRH(StpW2 o); // Responsive fetch for pixel 'o'; the StpU1 result is converted by StpPatFixRH().
  2280. // Responsive input pixel {0.0 := responsive, 1.0 := normal}.
  2281. StpH1 StpPatFixRH(StpU1 v);
  2282. //------------------------------------------------------------------------------------------------------------------------------
  2283. // Dither value {0 to 1} this should be input pixel frequency spatial temporal blue noise.
  2284. StpH1 StpPatDitH(StpW2 o);
  2285. //------------------------------------------------------------------------------------------------------------------------------
  2286. // Sample bilinear interpolated clamp to edge prior feedback (StpPatH() samples it at the feedback reprojection position 'pF').
  2287. StpH4 StpPatPriFedH(StpF2 p);
  2288. // Gather4 versions, one callback per color channel {R, G, B}.
  2289. StpH4 StpPatPriFedR4H(StpF2 p);
  2290. StpH4 StpPatPriFedG4H(StpF2 p);
  2291. StpH4 StpPatPriFedB4H(StpF2 p);
  2292. //------------------------------------------------------------------------------------------------------------------------------
  2293. // Sample bilinear interpolated clamp to edge 2-frame luma ring.
  2294. StpH2 StpPatPriLumH(StpF2 p);
  2295. //------------------------------------------------------------------------------------------------------------------------------
  2296. // Gather4 on prior {z,motion}.
  2297. StpU4 StpPatPriMot4H(StpF2 p);
  2298. #if STP_MAX_MIN_UINT
  2299. StpU1 StpPatPriMotMinH(StpF2 p); // Single-value fetch of prior {z,motion}; StpPatH() uses it where it would otherwise take the min of the four gathered values.
  2300. #endif // STP_MAX_MIN_UINT
  2301. #if STP_OFFSETS
  2302. StpU4 StpPatPriMot4OH(StpF2 p, StpI2 o); // As StpPatPriMot4H() with an integer offset 'o' (StpPatH() passes {-1,1} pairs instead of adding +/-kRcpC to 'p').
  2303. #if STP_MAX_MIN_UINT
  2304. StpU1 StpPatPriMotMinOH(StpF2 p, StpI2 o); // As StpPatPriMotMinH() with an integer offset 'o'.
  2305. #endif // STP_MAX_MIN_UINT
  2306. #endif // STP_OFFSETS
  2307. //------------------------------------------------------------------------------------------------------------------------------
  2308. void StpPatStMotH(StpW2 p, StpU1 v); // Store packed {z, motion} for pixel 'p' (StpPatH() passes the StpMvPack() result).
  2309. void StpPatStColH(StpW2 p, StpH4 v); // Store intermediate color: StpPatH() writes the dithered sqrt() color in .rgb and the motion match in .a.
  2310. void StpPatStLumH(StpW2 p, StpH2 v); // Store luma pair: StpPatH() writes {.x = luma of this frame's sqrt() color, .y = first component of the reprojected luma fetch}.
  2311. void StpPatStCnvH(StpW2 p, StpH1 v); // Store the convergence output ('rCnv' in StpPatH()).
  2312. //==============================================================================================================================
  2313. void StpPatH(
  2314. StpW1 lane,
  2315. StpW2 pp,
  2316. StpU4 con0,
  2317. StpU4 con1,
  2318. StpU4 con2,
  2319. StpU4 con3,
  2320. StpU4 con4,
  2321. StpU4 con5,
  2322. StpU4 con6,
  2323. StpU4 con7,
  2324. StpU4 con8,
  2325. StpU4 con9,
  2326. StpU4 conA,
  2327. StpU4 conB,
  2328. StpU4 conC,
  2329. StpU4 conD) {
  2330. //------------------------------------------------------------------------------------------------------------------------------
  2331. // Outputs.
  2332. StpH4 rC;
  2333. StpU1 rM;
  2334. StpH2 rL;
  2335. StpH1 rCnv;
  2336. //------------------------------------------------------------------------------------------------------------------------------
  2337. // Rename constants.
  2338. StpF2 kRcpC = StpF2_U2(con0.xy);
  2339. StpF2 kHalfRcpC = StpF2_U2(con0.zw);
  2340. StpF2 kJitCRcpCUnjitPRcpP = StpF2_U2(con1.xy);
  2341. StpF2 kJitCRcpC = StpF2_U2(con1.zw);
  2342. StpF2 kF = StpF2_U2(con2.xy);
  2343. StpF4 kOS = StpF4_U4(con3);
  2344. StpF2 kDepth = StpF2_U2(con2.zw);
  2345. StpF2 kUnDepth = StpF2_U2(con4.xy);
  2346. StpF1 kMotionMatch = StpF1_U1(con4.z);
  2347. StpF2 kC = StpF2_U2(con5.xy);
  2348. StpF4 k0123 = StpF4_U4(con6);
  2349. StpF4 k4567 = StpF4_U4(con7);
  2350. StpF4 k89AB = StpF4_U4(con8);
  2351. StpF4 kCDEF = StpF4_U4(con9);
  2352. StpF4 kGHIJ = StpF4_U4(conA);
  2353. StpF4 kKLMN = StpF4_U4(conB);
  2354. StpF4 kOPQR = StpF4_U4(conC);
  2355. StpF2 kST = StpF2_U2(conD.xy);
  2356. //------------------------------------------------------------------------------------------------------------------------------
  2357. StpF2 m = StpPatDatMotH(pp);
  2358. // This dither fetch should likely be shared with pass merged pre-scale post work in the future.
  2359. StpH1 d = StpPatDitH(pp);
  2360. StpF1 zPre = StpPatDatZH(pp);
  2361. StpH3 c = StpPatDatColH(pp);
  2362. //==============================================================================================================================
  2363. // DEPENDENT INLINE INPUT MOTION
  2364. //==============================================================================================================================
  2365. // Work towards getting all dependent fetches out first.
  2366. // Compute float position {0 to 1} across screen.
  2367. StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
  2368. //------------------------------------------------------------------------------------------------------------------------------
  2369. #if STP_BUG_BW_SOL
  2370. { StpH2 lum2 = StpPatPriLumH(p);
  2371. StpH1 cnvPrev = StpPatPriConH(p);
  2372. StpU4 mZVP4 = StpPatPriMot4H(p);
  2373. StpU1 rPre = StpPatDatRH(p);
  2374. StpH3 f = StpPatPriFedH(p).rgb;
  2375. StpF1 z = StpPatFixZH(zPre);
  2376. StpH1 r = StpPatFixRH(rPre);
  2377. rC.rgb = StpH3_(m.x) + StpH3_(d.x) + c + StpH3_(lum2.x) + StpH3_(cnvPrev) + StpH3(mZVP4.xyz) + f + StpH3_(z+r);
  2378. rC.a = StpH1_(0.0);
  2379. rL = rC.rg;
  2380. rM = StpU1_(rC.r);
  2381. rCnv = rC.r;
  2382. StpPatStMotH(pp, rM);
  2383. StpPatStLumH(pp, rL);
  2384. StpPatStColH(pp, rC);
  2385. StpPatStCnvH(pp, rCnv);
  2386. return; }
  2387. #endif // STP_BUG_BW_SOL
  2388. //------------------------------------------------------------------------------------------------------------------------------
  2389. // Reprojection position in prior input and feedback.
  2390. StpF2 pM = (p - m);
  2391. StpF2 pF = pM + kJitCRcpC;
  2392. pM = pM + kJitCRcpCUnjitPRcpP;
  2393. //------------------------------------------------------------------------------------------------------------------------------
  2394. // Fetch 2-frame reprojected history ring of luma.
  2395. StpH2 lum2 = StpPatPriLumH(pM);
  2396. //------------------------------------------------------------------------------------------------------------------------------
  2397. // Fetch reprojected low-frequency convergence prior frame.
  2398. StpH1 cnvPrev = StpPatPriConH(pM);
  2399. //------------------------------------------------------------------------------------------------------------------------------
  2400. // Grab large enough neighborhood for prior reprojected nearest {z,motion}.
  2401. // This nearest dilates {z, motion} reprojection to avoid pulling in anti-aliased edges and leaving temporal ringing.
  2402. #if (STP_SAFE_DILATE == 2)
  2403. #if STP_MAX_MIN_UINT
  2404. StpU4 mZVP4;
  2405. #if STP_OFFSETS
  2406. mZVP4.x = StpPatPriMotMinOH(pM, StpI2(-1, -1));
  2407. mZVP4.y = StpPatPriMotMinOH(pM, StpI2( 1, -1));
  2408. mZVP4.z = StpPatPriMotMinOH(pM, StpI2(-1, 1));
  2409. mZVP4.w = StpPatPriMotMinOH(pM, StpI2( 1, 1));
  2410. #else // STP_OFFSETS
  2411. mZVP4.x = StpPatPriMotMinH(pM + StpF2(-kRcpC.x, -kRcpC.y));
  2412. mZVP4.y = StpPatPriMotMinH(pM + StpF2( kRcpC.x, -kRcpC.y));
  2413. mZVP4.z = StpPatPriMotMinH(pM + StpF2(-kRcpC.x, kRcpC.y));
  2414. mZVP4.w = StpPatPriMotMinH(pM + StpF2( kRcpC.x, kRcpC.y));
  2415. #endif // STP_OFFSETS
  2416. #else // STP_MAX_MIN_UINT
  2417. #if STP_OFFSETS
  2418. StpU4 mZVP4_0 = StpPatPriMot4OH(pM, StpI2(-1, -1));
  2419. StpU4 mZVP4_1 = StpPatPriMot4OH(pM, StpI2( 1, -1));
  2420. StpU4 mZVP4_2 = StpPatPriMot4OH(pM, StpI2(-1, 1));
  2421. StpU4 mZVP4_3 = StpPatPriMot4OH(pM, StpI2( 1, 1));
  2422. #else // STP_OFFSETS
  2423. StpU4 mZVP4_0 = StpPatPriMot4H(pM + StpF2(-kRcpC.x, -kRcpC.y));
  2424. StpU4 mZVP4_1 = StpPatPriMot4H(pM + StpF2( kRcpC.x, -kRcpC.y));
  2425. StpU4 mZVP4_2 = StpPatPriMot4H(pM + StpF2(-kRcpC.x, kRcpC.y));
  2426. StpU4 mZVP4_3 = StpPatPriMot4H(pM + StpF2( kRcpC.x, kRcpC.y));
  2427. #endif // STP_OFFSETS
  2428. #endif // STP_MAX_MIN_UINT
  2429. #else // (STP_SAFE_DILATE == 2)
  2430. StpU1 mZVPN;
  2431. // To be correct here this needs 'kHalfRcpP' (prior instead of current).
  2432. // But didn't want to pass yet another pair of constants, so using current instead.
  2433. // TODO: If later moving to 'kHalfRcpP' can use one sample by offset to save some VALU ops.
  2434. // Also this is only used if STP_SAFE_DILATE=1 (else dead code).
  2435. StpU4 mZVP2a = StpPatPriMot4H(pM - kHalfRcpC);
  2436. StpU4 mZVP2b = StpPatPriMot4H(pM + kHalfRcpC);
  2437. #if STP_MAX_MIN_UINT
  2438. mZVPN = StpPatPriMotMinH(pM);
  2439. #else // STP_MAX_MIN_UINT
  2440. StpU4 mZVP4 = StpPatPriMot4H(pM);
  2441. #endif // STP_MAX_MIN_UINT
  2442. #endif // (STP_SAFE_DILATE == 2)
  2443. //------------------------------------------------------------------------------------------------------------------------------
  2444. StpU1 rPre = StpPatDatRH(pp);
  2445. //------------------------------------------------------------------------------------------------------------------------------
  2446. // Gather 4 on feedback.
  2447. StpH4 f4R = StpPatPriFedR4H(pF);
  2448. StpH4 f4G = StpPatPriFedG4H(pF);
  2449. StpH4 f4B = StpPatPriFedB4H(pF);
  2450. // Grab bilinear feedback.
  2451. StpH3 f = StpPatPriFedH(pF).rgb;
  2452. //==============================================================================================================================
  2453. // DEPENDENT ON DITHER AND INLINE INPUT PARAMETERS
  2454. //==============================================================================================================================
  2455. StpF1 dd = StpF1_(d);
  2456. // Convert depth {0 to inf} to {0 to 1} safe for 10-bit value.
  2457. StpF1 z = StpPatFixZH(zPre);
  2458. z = StpZPack(z, kDepth, dd);
  2459. // Pack {MSB depth, LSB 11-bit XY motion}.
  2460. rM = StpMvPack(z, m, dd);
  2461. StpPatStMotH(pp, rM);
  2462. //------------------------------------------------------------------------------------------------------------------------------
  2463. #if STP_BUG
  2464. // Pattern/Clipped Input Color
  2465. { StpF4 bug = StpF4_(0.0);
  2466. bug.rgb = sqrt(StpF3(c));
  2467. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2468. StpBugF(StpU3(pp, 0), bug); }
  2469. //------------------------------------------------------------------------------------------------------------------------------
  2470. // Pattern/Log Input Depth
  2471. { StpF4 bug = StpF4_(0.0);
  2472. bug.rgb = StpF3_(StpSatF1(z + StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2473. StpBugF(StpU3(pp, 1), bug); }
  2474. #endif // STP_BUG
  2475. //------------------------------------------------------------------------------------------------------------------------------
  2476. // Pre-process color.
  2477. // If running pre-tonemap, then do a fast reversible tonemapper (convert from {0 to inf} to {0 to 1}).
  2478. #if (STP_POSTMAP == 0)
  2479. StpToneH3(c);
  2480. #endif // (STP_POSTMAP == 0)
  2481. //------------------------------------------------------------------------------------------------------------------------------
  2482. #if STP_BUG
  2483. // Pattern/Reversible Tonemapped Input Color
  2484. { StpF4 bug = StpF4_(0.0);
  2485. bug.rgb = sqrt(StpF3(c));
  2486. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2487. StpBugF(StpU3(pp, 2), bug); }
  2488. #endif // STP_BUG
  2489. //------------------------------------------------------------------------------------------------------------------------------
  2490. // Output intermediate color.
  2491. // Dither from linear to gamma 2.0.
  2492. // Simple non-energy conserving dither is working, using 10-bit/channel.
  2493. c = sqrt(c);
  2494. rC.rgb = StpSatH3(c + StpH3_(d * StpH1(1.0 / 1023.0) + StpH1(-0.5 / 1023.0)));
  2495. //------------------------------------------------------------------------------------------------------------------------------
  2496. // Setup the new 3-ring output luma.
  2497. rL.x = dot(c, StpH3(STP_LUMA));
  2498. rL.y = lum2.x;
  2499. StpPatStLumH(pp, rL);
  2500. //------------------------------------------------------------------------------------------------------------------------------
  2501. #if STP_BUG
  2502. // Pattern/Shaped Absolute Input Motion
  2503. { StpF4 bug = StpF4_(0.0);
  2504. bug.b = sqrt(StpF1_(rL.x) * StpF1_(0.25));
  2505. bug.rg = StpF2_(1.0) - exp2(abs(StpF2(m)) * StpF2_(-32.0));
  2506. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2507. StpBugF(StpU3(pp, 3), bug); }
  2508. #endif // STP_BUG
  2509. //------------------------------------------------------------------------------------------------------------------------------
  2510. // Minimum change across the 3 frames {current, 2-frame reprojected history}.
  2511. StpH1 moire = min(abs(rL.x - lum2.x), abs(lum2.x - lum2.y));
  2512. moire *= StpH1_(STP_PAT_DEMOIRE);
  2513. //------------------------------------------------------------------------------------------------------------------------------
  2514. // Grab neighborhood.
  2515. // Parallel block {max,-min}, and -min of convergence.
  2516. StpH4 xnyRG = StpH4(c.r, -c.r, c.g, -c.g);
  2517. StpH4 xnyBC = StpH4(c.b, -c.b, -cnvPrev, -cnvPrev);
  2518. #if defined(STP_16BIT)
  2519. StpPat4x4MaxH8(lane, xnyRG, xnyBC);
  2520. #else // defined(STP_16BIT)
  2521. // We convert to full precision floats here since the reductions work on 32-bit values.
  2522. StpF4 xnyRGF = StpF4_(xnyRG);
  2523. StpF4 xnyBCF = StpF4_(xnyBC);
  2524. StpPat4x4MaxF8(lane, xnyRGF, xnyBCF);
  2525. xnyRG = StpMF4_(xnyRGF);
  2526. xnyBC = StpMF4_(xnyBCF);
  2527. #endif // defined(STP_16BIT)
  2528. cnvPrev = -xnyBC.z;
  2529. // This is max minus min (the '.y' is already negative).
  2530. StpH3 ne = max(StpH3_(STP_PAT_NE_MIN) * StpH3(xnyRG.x, xnyRG.z, xnyBC.x),
  2531. StpH3(xnyRG.x + xnyRG.y, xnyRG.z + xnyRG.w, xnyBC.x + xnyBC.y));
  2532. StpH1 ne1 = dot(ne, StpH3(STP_LUMA));
  2533. //------------------------------------------------------------------------------------------------------------------------------
  2534. // Advance low frequency convergence.
  2535. cnvPrev = StpSatH1(cnvPrev + StpH1_(1.0 / STP_FRAME_MAX));
  2536. //------------------------------------------------------------------------------------------------------------------------------
  2537. // Estimate if reprojection is on-screen.
  2538. StpF2 onXY = StpF2(pM.xy);
  2539. // {-1 to 1} is on screen.
  2540. onXY = onXY * kOS.xy + kOS.zw;
  2541. // {0 := offscreen, 1 := onscreen}.
  2542. StpF1 onS = StpSignedF1(max(abs(onXY.x), abs(onXY.y)) - StpF1_(1.0));
  2543. //------------------------------------------------------------------------------------------------------------------------------
  2544. #if STP_BUG
  2545. // Pattern/Motion Reprojection {R=Prior G=This Sqrt Luma Feedback Diff, B=Offscreen}
  2546. { StpF4 bug = StpF4_(0.0);
  2547. bug.g = StpF1_(abs(rL.x - lum2.x));
  2548. bug.r = StpF1_(abs(lum2.x - lum2.y));
  2549. bug.b = StpF1_(1.0) - StpF1_(onS);
  2550. bug.rg = sqrt(bug.rg);
  2551. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2552. StpBugF(StpU3(pp, 4), bug); }
  2553. #endif // STP_BUG
  2554. //==============================================================================================================================
  2555. // DEPENDENT ON PRIOR {Z, MOTION}
  2556. //==============================================================================================================================
  2557. // Compute a motion match value.
  2558. // Finish {z, motion} nearest dilation.
  2559. #if (STP_SAFE_DILATE == 2)
  2560. #if (STP_MAX_MIN_UINT == 0)
  2561. StpU4 mZVP4 = min(StpMin3U4(mZVP4_0, mZVP4_1, mZVP4_2), mZVP4_3);
  2562. #endif // (STP_MAX_MIN_UINT == 0)
  2563. StpU1 mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
  2564. #else // (STP_SAFE_DILATE == 2)
  2565. #if (STP_MAX_MIN_UINT == 0)
  2566. mZVPN = min(StpMin3U1(mZVP4.x, mZVP4.y, mZVP4.z), mZVP4.w);
  2567. #endif // (STP_MAX_MIN_UINT == 0)
  2568. #if STP_SAFE_DILATE
  2569. mZVPN = StpMin3U1(StpMin3U1(mZVPN, mZVP2a.x, mZVP2a.z), mZVP2b.x, mZVP2b.z);
  2570. #endif // STP_SAFE_DILATE
  2571. #endif // (STP_SAFE_DILATE == 2)
  2572. //------------------------------------------------------------------------------------------------------------------------------
  2573. // The {motion} matching logic.
  2574. StpF2 mPN;
  2575. StpF1 mZPN;
  2576. // Motion 'm' units are {1 := move by one screen}.
  2577. StpMvUnpack(mZPN, mPN, mZVPN);
  2578. //------------------------------------------------------------------------------------------------------------------------------
  2579. StpF2 mE;
  2580. // Use a smoother error estimate.
  2581. // This '1/256' instead of '1/1024' is to be more accepting of a motion match.
  2582. // The 'sqrt()' cannot be the low precision approximation without visually seeing differences in the mask.
  2583. mE = sqrt(abs(m)) + StpF2_(1.0 / 256.0);
  2584. mE = mE * mE - abs(m);
  2585. //------------------------------------------------------------------------------------------------------------------------------
  2586. // Static geometry motion + estimated dynamic motion matching logic.
  2587. // Take unpacked low precision {0 to 1} Z and decode to {0 to INF}.
  2588. StpF1 sgZ = StpZUnpack(mZPN, kUnDepth);
  2589. StpF2 bugF; StpF2 bugD;
  2590. StpF2 sgM = StpFor(pM, sgZ, mPN, kMotionMatch, k0123, k4567, k89AB, kCDEF, kGHIJ, kKLMN, kOPQR, kST, bugF, bugD);
  2591. // Note 'sgM' is in NDC {-1 to 1} space and 'm' is in {0 to 1} space, thus the 0.5 scaling factor.
  2592. // The difference gets conservative possible motion encoding error subtracted out via 'saturate(abs(..)-mE)'.
  2593. sgM = StpSatF2(abs(sgM * StpF2_(0.5) - m) - mE) * kC;
  2594. StpH1 sgD = StpH1(dot(sgM, sgM));
  2595. //------------------------------------------------------------------------------------------------------------------------------
  2596. // Motion match {0 := no match, 1 := match}.
  2597. StpH1 match = StpH1_(1.0) - StpSatH1(sgD * StpH1_(STP_PAT_MOT_AMP) - StpH1_(STP_PAT_MOT_ADD * STP_PAT_MOT_AMP));
  2598. // Offscreen is a non-match.
  2599. match *= StpH1_(onS);
  2600. // Pass motion match in alpha.
  2601. rC.a = match;
  2602. StpPatStColH(pp, rC);
  2603. //------------------------------------------------------------------------------------------------------------------------------
  2604. // Must disable on non-motion match, but make sure it doesn't fully /0 later.
  2605. moire = moire * match + StpH1_(1.0 / 8192.0);
  2606. // Scale down temporal change proportional to ratio of local neighborhood and minimum 3-frame temporal change.
  2607. moire = min(StpH1_(1.0), ne1 * StpRcpH1(moire));
  2608. //------------------------------------------------------------------------------------------------------------------------------
  2609. // Sensitivity modifiers.
  2610. // The following which gets optimized to two FMAs.
  2611. // tS = tS * ((1-v)*k + 1) ... logic
  2612. // tS = tS * ((1-v)*k) + tS
  2613. // tS = tS * (k-v*k) + tS ..... optimized
  2614. StpH1 tS = moire;
  2615. StpH1 r = StpPatFixRH(rPre);
  2616. tS = tS * (StpH1_(STP_PAT_RESPONSIVE) - r * StpH1_(STP_PAT_RESPONSIVE)) + tS;
  2617. //------------------------------------------------------------------------------------------------------------------------------
  2618. #if STP_BUG
  2619. // Pattern/Sensitivity {G=No motion match, R=Responsive, B=Luma}
  2620. { StpF4 bug = StpF4_(0.0);
  2621. bug.g = StpF1_(1.0) - StpF1(match);
  2622. bug.r = StpF1_(1.0) - StpF1(r);
  2623. bug.b = StpF1_(rL.x);
  2624. bug.rgb = StpSatF3(bug.rgb + StpF3_(StpF1_(d) * StpF1_(1.0 / 255.0) + StpF1_(-0.5 / 255.0)));
  2625. StpBugF(StpU3(pp, 5), bug); }
  2626. #endif // STP_BUG
  2627. //==============================================================================================================================
  2628. // DEPENDENT ON FEEDBACK
  2629. //==============================================================================================================================
  2630. // Find lowest temporal difference.
  2631. StpH4 t;
  2632. t.rgb = c - f;
  2633. // Luma diff in alpha.
  2634. t.a = dot(abs(t.rgb), StpH3(STP_LUMA));
  2635. // Compute lowest difference for all in quad.
  2636. StpH4 t4R = f4R - StpH4_(c.r);
  2637. StpH4 t4G = f4G - StpH4_(c.g);
  2638. StpH4 t4B = f4B - StpH4_(c.b);
  2639. StpH4 t4A = abs(t4R) * StpH4_(STP_LUMA_R) + abs(t4G) * StpH4_(STP_LUMA_G) + abs(t4B) * StpH4_(STP_LUMA_B);
  2640. // Override with lower from gather4.
  2641. t.a = StpMin3H1(t.a, t4A.x, StpMin3H1(t4A.y, t4A.z, t4A.w));
  2642. if(t.a == t4A.x) t.rgb = StpH3(t4R.x, t4G.x, t4B.x);
  2643. if(t.a == t4A.y) t.rgb = StpH3(t4R.y, t4G.y, t4B.y);
  2644. if(t.a == t4A.z) t.rgb = StpH3(t4R.z, t4G.z, t4B.z);
  2645. if(t.a == t4A.w) t.rgb = StpH3(t4R.w, t4G.w, t4B.w);
  2646. //------------------------------------------------------------------------------------------------------------------------------
  2647. // Factor in sensitivity and reduce.
  2648. t.rgb *= StpH3_(tS);
  2649. //------------------------------------------------------------------------------------------------------------------------------
  2650. #if defined(STP_16BIT)
  2651. StpPat4x4SumH4(lane, t);
  2652. #else // defined(STP_16BIT)
  2653. // We convert to full precision floats here since the reductions work on 32-bit values, and MF might be 16-bit.
  2654. StpF4 tF = StpF4(t);
  2655. StpPat4x4SumF4(lane, tF);
  2656. t = StpMF4(tF);
  2657. #endif // defined(STP_16BIT)
  2658. t.rgb *= StpH3_(STP_PAT_SENSITIVITY);
  2659. //------------------------------------------------------------------------------------------------------------------------------
  2660. // Ratio of 'spatial/temporal' change.
  2661. StpH3 bln3 = StpSatH3(ne * StpPrxLoRcpH3(abs(t.rgb)));
  2662. // Worst channel limits to avoid chroma ghosting.
  2663. StpH1 bln = StpMin3H1(bln3.r, bln3.g, bln3.b);
  2664. //------------------------------------------------------------------------------------------------------------------------------
  2665. // Convert from blend ratio to convergence.
  2666. // Note, 'rcp(0)=+INF' when approximations are not used.
  2667. StpH1 cnv = StpSatH1(bln * StpPrxLoRcpH1(StpH1_(STP_FRAME_MAX) - StpH1_(STP_FRAME_MAX) * bln));
  2668. //------------------------------------------------------------------------------------------------------------------------------
  2669. // Feedback the min of reprojected convergence, and subtract one frame (as next frame advances by one).
  2670. cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
  2671. rCnv = min(cnv, cnvPrev);
  2672. StpPatStCnvH(pp, rCnv); }
  2673. #endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_PAT)
  2674. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2675. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2676. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2677. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2678. //_____________________________________________________________.._______________________________________________________________
  2679. //==============================================================================================================================
  2680. //
  2681. // PATTERN DILATION ENTRY POINT
  2682. //
  2683. //------------------------------------------------------------------------------------------------------------------------------
  2684. // This should be pass merged with STP_SAA.
  2685. // Dilates low frequency convergence.
  2686. //==============================================================================================================================
  2687. #if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL)
        // Forward declarations only; no bodies appear in this section (presumably supplied by the including shader - confirm).
  2688. StpMF1 StpDilDitF(StpMU2 o); // Not called by StpDilF().
  2689. StpMF1 StpDilConF(StpF2 p); // Not called by StpDilF().
  2690. StpMF4 StpDilCon4F(StpF2 p); // 4-component fetch at 'p'; used by the debug path and the non-offset path below.
  2691. #if STP_OFFSETS
  2692. StpMF1 StpDilConOF(StpF2 p, StpI2 o); // Not called by StpDilF().
  2693. StpMF4 StpDilCon4OF(StpF2 p, StpI2 o); // 4-component fetch at 'p' with integer offset 'o'; used below when STP_OFFSETS is set.
  2694. #endif // STP_OFFSETS
  2695. //==============================================================================================================================
        // 32-bit dilation entry point (same statements as StpDilH in the 16-bit path, using the MF types).
        //  oC ..... Output: weighted blend of five plus-shaped minimums taken around the center tap.
        //  pp ..... Input integer position.
        //  con0 ... Packed constants; only '.xy' is read here.
  2696. void StpDilF(out StpMF1 oC, StpU2 pp, StpU4 con0) {
  2697. StpF2 kRcpR = StpF2_U2(con0.xy); // Position scale decoded from con0.xy (name suggests a reciprocal resolution - confirm against the constant setup).
  2698. //------------------------------------------------------------------------------------------------------------------------------
  2699. StpF2 p = StpF2(pp) * kRcpR; // Scaled position; no additional offset is added here.
  2700. //------------------------------------------------------------------------------------------------------------------------------
        // Debug path: output the '.x' component of a single fetch at 'p' and return early.
  2701. #if STP_BUG_BW_SOL
  2702. { oC = StpDilCon4F(p).x; return; }
  2703. #endif // STP_BUG_BW_SOL
  2704. //------------------------------------------------------------------------------------------------------------------------------
        // Nine 4-component fetches at offsets {-1, 1, 3} x {-1, 1, 3}, indexed row-major:
        //  0 1 2
        //  3 4 5
        //  6 7 8
        // The 16-bit path (StpDilH) diagrams how the {x,y,z,w} components of each fetch map to taps.
        // With STP_OFFSETS the offsets are passed as integer arguments, otherwise they are scaled by 'kRcpR' and added to 'p'.
  2705. #if STP_OFFSETS
  2706. StpMF4 g0 = StpDilCon4OF(p, StpI2(-1.0, -1.0));
  2707. StpMF4 g1 = StpDilCon4OF(p, StpI2( 1.0, -1.0));
  2708. StpMF4 g2 = StpDilCon4OF(p, StpI2( 3.0, -1.0)); // Only feeds cE/cJ, which are not read by the min filter below.
  2709. StpMF4 g3 = StpDilCon4OF(p, StpI2(-1.0, 1.0));
  2710. StpMF4 g4 = StpDilCon4OF(p, StpI2( 1.0, 1.0));
  2711. StpMF4 g5 = StpDilCon4OF(p, StpI2( 3.0, 1.0));
  2712. StpMF4 g6 = StpDilCon4OF(p, StpI2(-1.0, 3.0)); // Only feeds cU/cV, which are not read by the min filter below.
  2713. StpMF4 g7 = StpDilCon4OF(p, StpI2( 1.0, 3.0));
  2714. StpMF4 g8 = StpDilCon4OF(p, StpI2( 3.0, 3.0)); // Only feeds cY, which is not read by the min filter below.
  2715. #else // STP_OFFSETS
  2716. StpMF4 g0 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, -1.0 * kRcpR.y));
  2717. StpMF4 g1 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, -1.0 * kRcpR.y));
  2718. StpMF4 g2 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, -1.0 * kRcpR.y)); // Only feeds cE/cJ, which are not read by the min filter below.
  2719. StpMF4 g3 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, 1.0 * kRcpR.y));
  2720. StpMF4 g4 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, 1.0 * kRcpR.y));
  2721. StpMF4 g5 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, 1.0 * kRcpR.y));
  2722. StpMF4 g6 = StpDilCon4F(p + StpF2(-1.0 * kRcpR.x, 3.0 * kRcpR.y)); // Only feeds cU/cV, which are not read by the min filter below.
  2723. StpMF4 g7 = StpDilCon4F(p + StpF2( 1.0 * kRcpR.x, 3.0 * kRcpR.y));
  2724. StpMF4 g8 = StpDilCon4F(p + StpF2( 3.0 * kRcpR.x, 3.0 * kRcpR.y)); // Only feeds cY, which is not read by the min filter below.
  2725. #endif // STP_OFFSETS
  2726. //------------------------------------------------------------------------------------------------------------------------------
        // Rename fetched components to a 5x5 grid (same naming as the 16-bit path):
        //  a b c d e
        //  f g h i j
        //  k l m n o
        //  p q r s t
        //  u v w x y
        // Only {c, g, h, i, k, l, m, n, o, q, r, s, w} are read by the min filter below; the rest are declared but unused.
  2727. StpMF1 cA = g0.w;
  2728. StpMF1 cB = g0.z;
  2729. StpMF1 cC = g1.w;
  2730. StpMF1 cD = g1.z;
  2731. StpMF1 cE = g2.w;
  2732. StpMF1 cF = g0.x;
  2733. StpMF1 cG = g0.y;
  2734. StpMF1 cH = g1.x;
  2735. StpMF1 cI = g1.y;
  2736. StpMF1 cJ = g2.x;
  2737. StpMF1 cK = g3.w;
  2738. StpMF1 cL = g3.z;
  2739. StpMF1 cM = g4.w;
  2740. StpMF1 cN = g4.z;
  2741. StpMF1 cO = g5.w;
  2742. StpMF1 cP = g3.x;
  2743. StpMF1 cQ = g3.y;
  2744. StpMF1 cR = g4.x;
  2745. StpMF1 cS = g4.y;
  2746. StpMF1 cT = g5.x;
  2747. StpMF1 cU = g6.w;
  2748. StpMF1 cV = g6.z;
  2749. StpMF1 cW = g7.w;
  2750. StpMF1 cX = g7.z;
  2751. StpMF1 cY = g8.w;
  2752. //------------------------------------------------------------------------------------------------------------------------------
        // Five plus-shaped (center + left/right/up/down) minimums, centered on the taps of the inner plus:
        //  . 1 .      1 := centered on 'h'
        //  3 4 5      3, 4, 5 := centered on 'l', 'm', 'n'
        //  . 7 .      7 := centered on 'r'
  2753. StpMF4 m1345;
  2754. m1345.x = StpMin3MF1(StpMin3MF1(cG, cH, cI), cC, cM); // Centered on 'h'.
  2755. m1345.y = StpMin3MF1(StpMin3MF1(cK, cL, cM), cG, cQ); // Centered on 'l'.
  2756. m1345.z = StpMin3MF1(StpMin3MF1(cL, cM, cN), cH, cR); // Centered on 'm' (the center tap).
  2757. m1345.w = StpMin3MF1(StpMin3MF1(cM, cN, cO), cI, cS); // Centered on 'n'.
  2758. StpMF1 m7 = StpMin3MF1(StpMin3MF1(cQ, cR, cS), cM, cW); // Centered on 'r'.
  2759. //------------------------------------------------------------------------------------------------------------------------------
        // Blend the five minimums: weight 'b0' on the center, and the remainder split evenly across the four neighbors.
  2760. StpMF1 b0 = StpMF1_(0.5);
  2761. StpMF1 b1 = (StpMF1_(1.0) - b0) * StpMF1_(0.25); // (1 - 0.5) * 0.25 = 0.125 each, so all five weights sum to 1.
  2762. oC = m1345.z * b0 + m1345.x * b1 + m1345.y * b1 + m1345.w * b1 + m7 * b1; }
  2763. #endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_DIL)
  2764. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2765. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2766. //_____________________________________________________________.._______________________________________________________________
  2767. //==============================================================================================================================
  2768. // 16-BIT PATH
  2769. //==============================================================================================================================
  2770. #if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
  2771. // Forward declarations only. StpDilH() calls just StpDilCon4H() and, with STP_OFFSETS, StpDilCon4OH(); the others are unused here, possibly kept for future experimentation.
  2772. StpH1 StpDilDitH(StpW2 o);
  2773. StpH1 StpDilConH(StpF2 p);
  2774. StpH4 StpDilCon4H(StpF2 p);
  2775. #if STP_OFFSETS
  2776. StpH1 StpDilConOH(StpF2 p, StpI2 o);
  2777. StpH4 StpDilCon4OH(StpF2 p, StpI2 o);
  2778. #endif // STP_OFFSETS
  2779. //==============================================================================================================================
        // 16-bit dilation entry point.
        //  oC ..... Output: weighted blend of five plus-shaped minimums taken around the center tap.
        //  pp ..... Input integer position.
        //  con0 ... Packed constants; only '.xy' is read here.
  2780. void StpDilH(out StpH1 oC, StpU2 pp, StpU4 con0) {
  2781. StpF2 kRcpR = StpF2_U2(con0.xy); // Position scale decoded from con0.xy (name suggests a reciprocal resolution - confirm against the constant setup).
  2782. StpF2 p = StpF2(pp) * kRcpR; // Scaled position; no additional offset is added here.
  2783. //------------------------------------------------------------------------------------------------------------------------------
        // Debug path: output the '.x' component of a single fetch at 'p' and return early.
  2784. #if STP_BUG_BW_SOL
  2785. { oC = StpDilCon4H(p).x; return; }
  2786. #endif // STP_BUG_BW_SOL
  2787. //------------------------------------------------------------------------------------------------------------------------------
  2788. // Gather index (nine 4-component fetches at offsets {-1, 1, 3} x {-1, 1, 3}, row-major).
  2789. // 0 1 2
  2790. //
  2791. // 3 4 5
  2792. //
  2793. // 6 7 8
  2794. // Component layout of each fetch across the footprint.
  2795. // w z w z w z
  2796. // x y.x y x y
  2797. // w z[w]z w z
  2798. // x y x y x y
  2799. // w z w z w z
  2800. // x y x y x y
  2801. #if STP_OFFSETS
  2802. StpH4 g0 = StpDilCon4OH(p, StpI2(-1.0, -1.0));
  2803. StpH4 g1 = StpDilCon4OH(p, StpI2( 1.0, -1.0));
  2804. StpH4 g2 = StpDilCon4OH(p, StpI2( 3.0, -1.0)); // Only feeds cE/cJ, which are not read by the min filter below.
  2805. StpH4 g3 = StpDilCon4OH(p, StpI2(-1.0, 1.0));
  2806. StpH4 g4 = StpDilCon4OH(p, StpI2( 1.0, 1.0));
  2807. StpH4 g5 = StpDilCon4OH(p, StpI2( 3.0, 1.0));
  2808. StpH4 g6 = StpDilCon4OH(p, StpI2(-1.0, 3.0)); // Only feeds cU/cV, which are not read by the min filter below.
  2809. StpH4 g7 = StpDilCon4OH(p, StpI2( 1.0, 3.0));
  2810. StpH4 g8 = StpDilCon4OH(p, StpI2( 3.0, 3.0)); // Only feeds cY, which is not read by the min filter below.
  2811. #else // STP_OFFSETS
  2812. StpH4 g0 = StpDilCon4H(p + StpF2(-1.0 * kRcpR.x, -1.0 * kRcpR.y));
  2813. StpH4 g1 = StpDilCon4H(p + StpF2( 1.0 * kRcpR.x, -1.0 * kRcpR.y));
  2814. StpH4 g2 = StpDilCon4H(p + StpF2( 3.0 * kRcpR.x, -1.0 * kRcpR.y)); // Only feeds cE/cJ, which are not read by the min filter below.
  2815. StpH4 g3 = StpDilCon4H(p + StpF2(-1.0 * kRcpR.x, 1.0 * kRcpR.y));
  2816. StpH4 g4 = StpDilCon4H(p + StpF2( 1.0 * kRcpR.x, 1.0 * kRcpR.y));
  2817. StpH4 g5 = StpDilCon4H(p + StpF2( 3.0 * kRcpR.x, 1.0 * kRcpR.y));
  2818. StpH4 g6 = StpDilCon4H(p + StpF2(-1.0 * kRcpR.x, 3.0 * kRcpR.y)); // Only feeds cU/cV, which are not read by the min filter below.
  2819. StpH4 g7 = StpDilCon4H(p + StpF2( 1.0 * kRcpR.x, 3.0 * kRcpR.y));
  2820. StpH4 g8 = StpDilCon4H(p + StpF2( 3.0 * kRcpR.x, 3.0 * kRcpR.y)); // Only feeds cY, which is not read by the min filter below.
  2821. #endif // STP_OFFSETS
  2822. //------------------------------------------------------------------------------------------------------------------------------
  2823. // Rename fetched components to a 5x5 grid. Only {c, g, h, i, k, l, m, n, o, q, r, s, w} are read by the min filter below.
  2824. // a b c d e
  2825. // f g h i j
  2826. // k l m n o
  2827. // p q r s t
  2828. // u v w x y
  2829. StpH1 cA = g0.w;
  2830. StpH1 cB = g0.z;
  2831. StpH1 cC = g1.w;
  2832. StpH1 cD = g1.z;
  2833. StpH1 cE = g2.w;
  2834. StpH1 cF = g0.x;
  2835. StpH1 cG = g0.y;
  2836. StpH1 cH = g1.x;
  2837. StpH1 cI = g1.y;
  2838. StpH1 cJ = g2.x;
  2839. StpH1 cK = g3.w;
  2840. StpH1 cL = g3.z;
  2841. StpH1 cM = g4.w;
  2842. StpH1 cN = g4.z;
  2843. StpH1 cO = g5.w;
  2844. StpH1 cP = g3.x;
  2845. StpH1 cQ = g3.y;
  2846. StpH1 cR = g4.x;
  2847. StpH1 cS = g4.y;
  2848. StpH1 cT = g5.x;
  2849. StpH1 cU = g6.w;
  2850. StpH1 cV = g6.z;
  2851. StpH1 cW = g7.w;
  2852. StpH1 cX = g7.z;
  2853. StpH1 cY = g8.w;
  2854. //------------------------------------------------------------------------------------------------------------------------------
  2855. // 5 point (plus-shaped: center + left/right/up/down) min, evaluated at five centers.
  2856. // . 1 .
  2857. // 3 4 5
  2858. // . 7 .
  2859. StpH4 m1345;
  2860. m1345.x = StpMin3H1(StpMin3H1(cG, cH, cI), cC, cM); // Centered on 'h'.
  2861. m1345.y = StpMin3H1(StpMin3H1(cK, cL, cM), cG, cQ); // Centered on 'l'.
  2862. m1345.z = StpMin3H1(StpMin3H1(cL, cM, cN), cH, cR); // Centered on 'm' (the center tap).
  2863. m1345.w = StpMin3H1(StpMin3H1(cM, cN, cO), cI, cS); // Centered on 'n'.
  2864. StpH1 m7 = StpMin3H1(StpMin3H1(cQ, cR, cS), cM, cW); // Centered on 'r'.
  2865. //------------------------------------------------------------------------------------------------------------------------------
        // Blend the five minimums: weight 'b0' on the center, and the remainder split evenly across the four neighbors.
  2866. StpH1 b0 = StpH1_(0.5);
  2867. StpH1 b1 = (StpH1_(1.0) - b0) * StpH1_(0.25); // (1 - 0.5) * 0.25 = 0.125 each, so all five weights sum to 1.
  2868. oC = m1345.z * b0 + m1345.x * b1 + m1345.y * b1 + m1345.w * b1 + m7 * b1; }
  2869. #endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_DIL)
  2870. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2871. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2872. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2873. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2874. //_____________________________________________________________.._______________________________________________________________
  2875. //==============================================================================================================================
  2876. //
  2877. // SPATIAL ANTI-ALIASING ENTRY POINT
  2878. //
  2879. //------------------------------------------------------------------------------------------------------------------------------
  2880. // This should be pass merged with STP_DIL.
  2881. // It's a shell, GEAA is separated as a modified form could be useful on its own.
  2882. //==============================================================================================================================
  2883. #if defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
        // Forward declarations only; no bodies appear in this section (presumably supplied by the including shader - confirm).
  2884. StpMF4 StpSaaLum4F(StpF2 p);
  2885. #if STP_OFFSETS
  2886. StpMF4 StpSaaLum4OF(StpF2 p, StpI2 o);
  2887. #endif
  2888. //------------------------------------------------------------------------------------------------------------------------------
        // Define STP_GEAA and provide the fetch functions GEAA is declared against by forwarding them to the SAA luma fetches.
  2889. #define STP_GEAA 1
  2890. StpMF4 StpGeaa4F(StpF2 p) { return StpSaaLum4F(p); }
  2891. #if STP_OFFSETS
  2892. StpMF4 StpGeaa4OF(StpF2 p, StpI2 o) { return StpSaaLum4OF(p, o); }
  2893. #endif
        // Declared here and called by StpSaaF(); the body is not in this section.
  2894. void StpGeaaF(out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI);
  2895. //==============================================================================================================================
        // 32-bit spatial anti-aliasing entry point; a thin shell around StpGeaaF() (same statements as StpSaaH in the 16-bit path).
        //  oN ..... Output: receives the 'gW' output of StpGeaaF() (or a raw fetch in the debug path).
        //  pp ..... Input integer position.
        //  con0 ... Packed constants: '.xy' and '.zw' are each decoded to a float pair.
  2896. void StpSaaF(out StpMF1 oN, StpU2 pp, StpU4 con0) {
  2897. //------------------------------------------------------------------------------------------------------------------------------
  2898. StpF2 kRcpC = StpF2_U2(con0.xy); // Position scale.
  2899. StpF2 kHalfRcpC = StpF2_U2(con0.zw); // Position bias (name suggests half of 'kRcpC' - confirm against the constant setup).
  2900. //------------------------------------------------------------------------------------------------------------------------------
  2901. StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC; // Float position: integer position scaled then biased.
  2902. //------------------------------------------------------------------------------------------------------------------------------
        // Debug path: output the '.x' component of a single luma fetch at 'p' and return early.
  2903. #if STP_BUG_BW_SOL
  2904. { oN = StpSaaLum4F(p).x; return; }
  2905. #endif // STP_BUG_BW_SOL
  2906. //------------------------------------------------------------------------------------------------------------------------------
        // Only 'oN' is kept; the remaining StpGeaaF() outputs are written to locals and discarded.
  2907. StpMF1 gLuma; // Discarded output.
  2908. StpMF1 gNe; // Declared but never used (not passed to StpGeaaF()).
  2909. StpF2 gFilter; // Discarded output.
  2910. StpF2 gDilate; // Discarded output.
  2911. StpGeaaF(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC); }
  2912. #endif // defined(STP_GPU) && defined(STP_32BIT) && defined(STP_SAA)
  2913. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2914. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2915. //_____________________________________________________________.._______________________________________________________________
  2916. //==============================================================================================================================
  2917. // 16-BIT PATH
  2918. //==============================================================================================================================
  2919. #if defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
  2920. // Gather4 on current luma (forward declarations only; no bodies appear in this section).
  2921. StpH4 StpSaaLum4H(StpF2 p);
  2922. #if STP_OFFSETS
  2923. StpH4 StpSaaLum4OH(StpF2 p, StpI2 o);
  2924. #endif
  2925. //------------------------------------------------------------------------------------------------------------------------------
        // Define STP_GEAA and provide the fetch functions GEAA is declared against by forwarding them to the SAA luma fetches.
  2926. #define STP_GEAA 1
  2927. StpH4 StpGeaa4H(StpF2 p) { return StpSaaLum4H(p); }
  2928. #if STP_OFFSETS
  2929. StpH4 StpGeaa4OH(StpF2 p, StpI2 o) { return StpSaaLum4OH(p, o); }
  2930. #endif
        // Declared here and called by StpSaaH(); the body is not in this section.
  2931. void StpGeaaH(out StpH1 gW, out StpH1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI);
  2932. //==============================================================================================================================
        // 16-bit spatial anti-aliasing entry point; a thin shell around StpGeaaH().
  2933. void StpSaaH(
  2934. out StpH1 oN, // Output control (to be stored); receives the 'gW' output of StpGeaaH().
  2935. StpU2 pp, // Input position {0 to size-1} across the input frame.
  2936. StpU4 con0) { // Shared, first constant generated by StpPatCon().
  2937. //------------------------------------------------------------------------------------------------------------------------------
  2938. StpF2 kRcpC = StpF2_U2(con0.xy); // Position scale.
  2939. StpF2 kHalfRcpC = StpF2_U2(con0.zw); // Position bias (name suggests half of 'kRcpC' - confirm against the constant setup).
  2940. //------------------------------------------------------------------------------------------------------------------------------
  2941. // Float position {0 to 1} across screen.
  2942. StpF2 p = StpF2(pp) * kRcpC + kHalfRcpC;
  2943. //------------------------------------------------------------------------------------------------------------------------------
        // Debug path: output the '.x' component of a single luma fetch at 'p' and return early.
  2944. #if STP_BUG_BW_SOL
  2945. { oN = StpSaaLum4H(p).x; return; }
  2946. #endif // STP_BUG_BW_SOL
  2947. //------------------------------------------------------------------------------------------------------------------------------
  2948. StpH1 gLuma; // Spatial AA (output discarded).
  2949. StpH1 gNe; // Output spatial neighborhood (declared but never used; not passed to StpGeaaH()).
  2950. StpF2 gFilter; // Output position for anti-aliased color sampling if standalone (output discarded).
  2951. StpF2 gDilate; // Output for {z,motion} dilation (output discarded).
  2952. StpGeaaH(oN, gLuma, gFilter, gDilate, p, kRcpC, kHalfRcpC); }
  2953. #endif // defined(STP_GPU) && defined(STP_16BIT) && defined(STP_SAA)
  2954. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2955. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2956. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2957. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  2958. //_____________________________________________________________.._______________________________________________________________
  2959. //==============================================================================================================================
  2960. //
  2961. // SCALING TAA ENTRY POINT
  2962. //
  2963. //==============================================================================================================================
  2964. #if defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
  2965. StpMF4 StpTaaCtl4F(StpF2 p);
  2966. //------------------------------------------------------------------------------------------------------------------------------
  2967. StpMF4 StpTaaCol4RF(StpF2 p);
  2968. StpMF4 StpTaaCol4GF(StpF2 p);
  2969. StpMF4 StpTaaCol4BF(StpF2 p);
  2970. StpMF4 StpTaaCol4AF(StpF2 p);
  2971. //------------------------------------------------------------------------------------------------------------------------------
  2972. StpMF1 StpTaaConF(StpF2 p);
  2973. //------------------------------------------------------------------------------------------------------------------------------
  2974. StpMF1 StpTaaDitF(StpMU2 o);
  2975. //------------------------------------------------------------------------------------------------------------------------------
  2976. StpU4 StpTaaMot4F(StpF2 p);
  2977. //------------------------------------------------------------------------------------------------------------------------------
  2978. StpMF4 StpTaaPriFedF(StpF2 p);
  2979. StpMF4 StpTaaPriFed4RF(StpF2 p);
  2980. StpMF4 StpTaaPriFed4GF(StpF2 p);
  2981. StpMF4 StpTaaPriFed4BF(StpF2 p);
  2982. #if STP_MAX_MIN_10BIT
  2983. StpMF4 StpTaaPriFedMaxF(StpF2 p);
  2984. StpMF4 StpTaaPriFedMinF(StpF2 p);
  2985. #endif // STP_MAX_MIN_10BIT
  2986. #if STP_OFFSETS
  2987. StpMF4 StpTaaPriFedOF(StpF2 p, StpI2 o);
  2988. StpMF4 StpTaaPriFed4ROF(StpF2 p, StpI2 o);
  2989. StpMF4 StpTaaPriFed4GOF(StpF2 p, StpI2 o);
  2990. StpMF4 StpTaaPriFed4BOF(StpF2 p, StpI2 o);
  2991. #endif // STP_OFFSETS
  2992. //==============================================================================================================================
  2993. void StpTaaF(
  2994. StpMU1 lane,
  2995. StpMU2 o,
  2996. out StpMF4 rF,
  2997. out StpMF4 rW,
  2998. StpU4 con0,
  2999. StpU4 con1,
  3000. StpU4 con2,
  3001. StpU4 con3) {
  3002. //------------------------------------------------------------------------------------------------------------------------------
  3003. StpMF1 dit = StpTaaDitF(o);
  3004. //------------------------------------------------------------------------------------------------------------------------------
  3005. StpF2 kCRcpF = StpF2_U2(con0.xy);
  3006. StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
  3007. StpF2 kRcpC = StpF2_U2(con1.xy);
  3008. StpF2 kRcpF = StpF2_U2(con1.zw);
  3009. StpF2 kHalfRcpF = StpF2_U2(con2.xy);
  3010. StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
  3011. StpF2 kHalfRcpC = StpF2_U2(con3.xy);
  3012. StpF2 kF = StpF2_U2(con3.zw);
  3013. //------------------------------------------------------------------------------------------------------------------------------
  3014. #if STP_BUG_BW_SOL
  3015. { StpF2 oo = StpF2(o) * kRcpF;
  3016. StpMF4 g4 = StpTaaCtl4RF(oo);
  3017. StpU4 m4 = StpTaaMot4F(oo);
  3018. StpMF1 cnv = StpTaaConF(oo);
  3019. StpMF4 f = StpTaaPriFedF(oo);
  3020. StpMF4 c4R = StpTaaCol4RF(oo);
  3021. rW = rF = l4 + g4 + StpMF4(m4) + StpMF4_(cnv) + f + c4R;
  3022. return; }
  3023. #endif // STP_BUG_BW_SOL
  3024. //------------------------------------------------------------------------------------------------------------------------------
  3025. StpF2 oI = StpF2(o);
  3026. StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
  3027. StpF2 oCNW = floor(oC + StpF2_(-0.5));
  3028. StpF2 oC4 = oCNW * kRcpC + kRcpC;
  3029. StpF2 oC1 = oC * kRcpC;
  3030. //==============================================================================================================================
  3031. // FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
  3032. //==============================================================================================================================
  3033. StpMF1 cnv = StpTaaConF(oC1);
  3034. StpMF4 c4R = StpTaaCol4RF(oC4);
  3035. StpMF4 c4G = StpTaaCol4GF(oC4);
  3036. StpMF4 c4B = StpTaaCol4BF(oC4);
  3037. StpMF4 c4A = StpTaaCol4AF(oC4);
  3038. StpMF4 g4 = StpTaaCtl4F(oC4);
  3039. StpU4 m4 = StpTaaMot4F(oC4);
  3040. //------------------------------------------------------------------------------------------------------------------------------
  3041. // INDEPENDENT
  3042. //------------------------------------------------------------------------------------------------------------------------------
  3043. StpMF2 rP = StpMF2(oC - oCNW) - StpMF2_(0.5);
  3044. //------------------------------------------------------------------------------------------------------------------------------
  3045. StpMF2 rPX10 = StpMF2(1.0, 0.0) + StpMF2(-rP.x, rP.x);
  3046. StpMF2 rPY01 = StpMF2(0.0, 1.0) + StpMF2(rP.y, -rP.y);
  3047. StpMF4 pen4x = StpMF4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
  3048. StpMF4 pen4y = StpMF4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
  3049. StpMF4 pen4 = StpSatMF4(pen4x * pen4x + pen4y * pen4y);
  3050. //==============================================================================================================================
  3051. // DEPENDENT ON {CONVERGENCE}
  3052. //==============================================================================================================================
  3053. cnv = StpSatMF1(cnv - StpMF1_(1.0 / STP_FRAME_MAX));
  3054. //------------------------------------------------------------------------------------------------------------------------------
  3055. StpMF1 pen = StpMF1_(cnv) * StpMF1_(STP_FRAME_MAX) + StpMF1_(1.0);
  3056. pen = StpPrxLoSqrtMF1(pen);
  3057. pen4 = StpSatMF4(StpMF4_(1.0) - pen4 * StpMF4_(pen));
  3058. #if defined(STP_16BIT)
  3059. #else // defined(STP_16BIT)
  3060. pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
  3061. #endif // defined(STP_16BIT)
  3062. //==============================================================================================================================
  3063. // DEPENDENT ON {COLOR}
  3064. //==============================================================================================================================
  3065. StpMF4 wG;
  3066. StpMF4 l4 = c4R + c4G * StpMF4_(2.0) + c4B;
  3067. StpMF2 difST = abs(l4.gr - l4.ab);
  3068. StpP1 useS = difST.x > difST.y;
  3069. StpMF2 wTrb = StpSatMF2(StpMF2(-rP.x, rP.x) + StpMF2(rP.y, -rP.y));
  3070. StpMF2 wSrb = min(rPX10, rPY01);
  3071. if(useS) wTrb = wSrb;
  3072. StpMF2 wTga = rPY01 - wTrb;
  3073. wG.rg = StpMF2(wTrb.x, wTga.x);
  3074. wG.ba = StpMF2(wTrb.y, wTga.y);
  3075. wG *= wG;
  3076. wG *= wG;
  3077. //------------------------------------------------------------------------------------------------------------------------------
  3078. wG *= g4;
  3079. StpMF4 triMask = StpMF4_(1.0);
  3080. StpMF2 wGmin2 = min(wG.xy, wG.zw);
  3081. //==============================================================================================================================
  3082. // DEPENDENT ON {Z,MOTION}
  3083. //==============================================================================================================================
  3084. if(wGmin2.x < wGmin2.y) {
  3085. if(wG.x < wG.z) { triMask.x = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
  3086. else { triMask.z = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
  3087. else {
  3088. if(wG.y < wG.w) { triMask.y = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
  3089. else { triMask.w = StpMF1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
  3090. StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
  3091. //------------------------------------------------------------------------------------------------------------------------------
  3092. wG *= triMask;
  3093. //------------------------------------------------------------------------------------------------------------------------------
  3094. StpF2 mXY;
  3095. StpMvUnpackV(mXY, m1);
  3096. //==============================================================================================================================
  3097. // GET ALL FEEDBACK FILTERING DONE
  3098. //==============================================================================================================================
  3099. StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
  3100. //------------------------------------------------------------------------------------------------------------------------------
  3101. StpMF3 f;
  3102. #if STP_TAA_PRX_LANCZOS
  3103. StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
  3104. StpF2 oMNW = floor(oM + StpF2_(-0.5));
  3105. StpF2 oM4 = oMNW * kRcpF + kRcpF;
  3106. StpMF3 fMax, fMin;
  3107. #else // STP_TAA_PRX_LANCZOS
  3108. f = StpTaaPriFedF(oF).rgb;
  3109. #endif // STP_TAA_PRX_LANCZOS
  3110. //==============================================================================================================================
  3111. #if (STP_TAA_PRX_LANCZOS == 1)
  3112. #if STP_OFFSETS
  3113. StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
  3114. StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
  3115. StpMF3 f1 = StpTaaPriFedOF(oM0, StpI2(0, 1)).rgb;
  3116. StpMF3 f2 = StpTaaPriFedOF(oM0, StpI2(0, 2)).rgb;
  3117. StpMF3 f3 = StpTaaPriFedOF(oM0, StpI2(0, 3)).rgb;
  3118. #else // STP_OFFSETS
  3119. StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
  3120. StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
  3121. StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
  3122. StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
  3123. StpMF3 f0 = StpTaaPriFedF(oM0).rgb;
  3124. StpMF3 f1 = StpTaaPriFedF(oM1).rgb;
  3125. StpMF3 f2 = StpTaaPriFedF(oM2).rgb;
  3126. StpMF3 f3 = StpTaaPriFedF(oM3).rgb;
  3127. #endif // STP_OFFSETS
  3128. #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3129. fMax = StpTaaPriFedMaxF(oM4).rgb;
  3130. fMin = StpTaaPriFedMinF(oM4).rgb;
  3131. #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3132. #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
  3133. StpMF4 f4R = StpTaaPriFed4RF(oM4);
  3134. StpMF4 f4G = StpTaaPriFed4GF(oM4);
  3135. StpMF4 f4B = StpTaaPriFed4BF(oM4);
  3136. #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
  3137. //------------------------------------------------------------------------------------------------------------------------------
  3138. // INDEPENDENT
  3139. //------------------------------------------------------------------------------------------------------------------------------
  3140. StpMF2 fP = StpMF2(oM - oMNW);
  3141. StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
  3142. fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
  3143. fPY *= fPY;
  3144. StpMF4 fPY4 = fPY * fPY;
  3145. fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
  3146. #if defined(STP_16BIT)
  3147. #else // defined(STP_16BIT)
  3148. StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
  3149. #endif // defined(STP_16BIT)
  3150. //------------------------------------------------------------------------------------------------------------------------------
  3151. // DEPENDENT
  3152. //------------------------------------------------------------------------------------------------------------------------------
  3153. f.rgb = f0 * StpMF3_(fPY.r) + f1 * StpMF3_(fPY.g) + f2 * StpMF3_(fPY.b) + f3 * StpMF3_(fPY.a);
  3154. f.rgb *= StpMF3_(fRcp);
  3155. #if STP_TAA_PRX_LANCZOS_DERING
  3156. #if (STP_MAX_MIN_10BIT == 0)
  3157. #if defined(STP_16BIT)
  3158. #else // defined(STP_16BIT)
  3159. fMax.r = max(StpMax3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
  3160. fMax.g = max(StpMax3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
  3161. fMax.b = max(StpMax3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
  3162. fMin.r = min(StpMin3MF1(f4R.x, f4R.y, f4R.z), f4R.w);
  3163. fMin.g = min(StpMin3MF1(f4G.x, f4G.y, f4G.z), f4G.w);
  3164. fMin.b = min(StpMin3MF1(f4B.x, f4B.y, f4B.z), f4B.w);
  3165. f = clamp(f, fMin, fMax);
  3166. #endif // defined(STP_16BIT)
  3167. #else // (STP_MAX_MIN_10BIT == 0)
  3168. f = clamp(f, fMin, fMax);
  3169. #endif // (STP_MAX_MIN_10BIT == 0)
  3170. #endif // STP_TAA_PRX_LANCZOS_DERING
  3171. #endif // (STP_TAA_PRX_LANCZOS == 1)
  3172. //==============================================================================================================================
  3173. #if (STP_TAA_PRX_LANCZOS == 2)
  3174. #if STP_OFFSETS
  3175. StpMF4 f4R0 = StpTaaPriFed4ROF(oM4, StpI2(-1, -1));
  3176. StpMF4 f4G0 = StpTaaPriFed4GOF(oM4, StpI2(-1, -1));
  3177. StpMF4 f4B0 = StpTaaPriFed4BOF(oM4, StpI2(-1, -1));
  3178. StpMF4 f4R1 = StpTaaPriFed4ROF(oM4, StpI2( 1, -1));
  3179. StpMF4 f4G1 = StpTaaPriFed4GOF(oM4, StpI2( 1, -1));
  3180. StpMF4 f4B1 = StpTaaPriFed4BOF(oM4, StpI2( 1, -1));
  3181. StpMF4 f4R2 = StpTaaPriFed4ROF(oM4, StpI2(-1, 1));
  3182. StpMF4 f4G2 = StpTaaPriFed4GOF(oM4, StpI2(-1, 1));
  3183. StpMF4 f4B2 = StpTaaPriFed4BOF(oM4, StpI2(-1, 1));
  3184. StpMF4 f4R3 = StpTaaPriFed4ROF(oM4, StpI2( 1, 1));
  3185. StpMF4 f4G3 = StpTaaPriFed4GOF(oM4, StpI2( 1, 1));
  3186. StpMF4 f4B3 = StpTaaPriFed4BOF(oM4, StpI2( 1, 1));
  3187. #else // STP_OFFSETS
  3188. StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
  3189. StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
  3190. StpF2 oM2 = oM4 + StpF2(-kRcpF.x, kRcpF.y);
  3191. StpF2 oM3 = oM4 + StpF2( kRcpF.x, kRcpF.y);
  3192. StpMF4 f4R0 = StpTaaPriFed4RF(oM0);
  3193. StpMF4 f4G0 = StpTaaPriFed4GF(oM0);
  3194. StpMF4 f4B0 = StpTaaPriFed4BF(oM0);
  3195. StpMF4 f4R1 = StpTaaPriFed4RF(oM1);
  3196. StpMF4 f4G1 = StpTaaPriFed4GF(oM1);
  3197. StpMF4 f4B1 = StpTaaPriFed4BF(oM1);
  3198. StpMF4 f4R2 = StpTaaPriFed4RF(oM2);
  3199. StpMF4 f4G2 = StpTaaPriFed4GF(oM2);
  3200. StpMF4 f4B2 = StpTaaPriFed4BF(oM2);
  3201. StpMF4 f4R3 = StpTaaPriFed4RF(oM3);
  3202. StpMF4 f4G3 = StpTaaPriFed4GF(oM3);
  3203. StpMF4 f4B3 = StpTaaPriFed4BF(oM3);
  3204. #endif // STP_OFFSETS
  3205. #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3206. fMax = StpTaaPriFedMaxF(oM4).rgb;
  3207. fMin = StpTaaPriFedMinF(oM4).rgb;
  3208. #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3209. //------------------------------------------------------------------------------------------------------------------------------
  3210. // INDEPENDENT
  3211. //------------------------------------------------------------------------------------------------------------------------------
  3212. StpMF2 fP = StpMF2(oM - oMNW);
  3213. StpMF4 fPX = StpMF4_(-fP.x * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
  3214. StpMF4 fPY = StpMF4_(-fP.y * StpMF1_(0.5)) + StpMF4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
  3215. fPX = StpSatMF4(StpMF4_(1.0) - fPX * fPX);
  3216. fPY = StpSatMF4(StpMF4_(1.0) - fPY * fPY);
  3217. fPX *= fPX;
  3218. fPY *= fPY;
  3219. StpMF4 fPX4 = fPX * fPX;
  3220. StpMF4 fPY4 = fPY * fPY;
  3221. fPX = (StpMF4_(1.0 + 81.0 / 175.0) * fPX4 - StpMF4_(81.0 / 175.0)) * fPX;
  3222. fPY = (StpMF4_(1.0 + 81.0 / 175.0) * fPY4 - StpMF4_(81.0 / 175.0)) * fPY;
  3223. #if defined(STP_16BIT)
  3224. #else // defined(STP_16BIT)
  3225. fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
  3226. fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
  3227. #endif // defined(STP_16BIT)
  3228. StpMF4 fPX0 = fPX * StpMF4_(fPY.r);
  3229. StpMF4 fPX1 = fPX * StpMF4_(fPY.g);
  3230. StpMF4 fPX2 = fPX * StpMF4_(fPY.b);
  3231. StpMF4 fPX3 = fPX * StpMF4_(fPY.a);
  3232. //------------------------------------------------------------------------------------------------------------------------------
  3233. // DEPENDENT
  3234. //------------------------------------------------------------------------------------------------------------------------------
  3235. #if defined(STP_16BIT)
  3236. #else // defined(STP_16BIT)
  3237. f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
  3238. f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
  3239. f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
  3240. f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
  3241. f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
  3242. f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
  3243. f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
  3244. f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
  3245. f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
  3246. f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
  3247. f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
  3248. f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
  3249. #endif // defined(STP_16BIT)
  3250. #if STP_TAA_PRX_LANCZOS_DERING
  3251. #if (STP_MAX_MIN_10BIT == 0)
  3252. #if defined(STP_16BIT)
  3253. #else // defined(STP_16BIT)
  3254. fMax.r = max(StpMax3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
  3255. fMax.g = max(StpMax3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
  3256. fMax.b = max(StpMax3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
  3257. fMin.r = min(StpMin3MF1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
  3258. fMin.g = min(StpMin3MF1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
  3259. fMin.b = min(StpMin3MF1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
  3260. f = clamp(f, fMin, fMax);
  3261. #endif // defined(STP_16BIT)
  3262. #else // (STP_MAX_MIN_10BIT == 0)
  3263. f = clamp(f, fMin, fMax);
  3264. #endif // (STP_MAX_MIN_10BIT == 0)
  3265. #endif // STP_TAA_PRX_LANCZOS_DERING
  3266. #endif // (STP_TAA_PRX_LANCZOS == 2)
  3267. //==============================================================================================================================
  3268. // DISPLACEMENT
  3269. //==============================================================================================================================
  3270. StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
  3271. StpF2 oD1 = StpF2(kRcpC.x, 0.0) + oD0;
  3272. StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
  3273. StpF2 oD3 = StpF2(0.0, -kRcpC.y) + oD0;
  3274. StpMF3 d0 = StpTaaPriFedF(oD0).rgb;
  3275. StpMF3 d1 = StpTaaPriFedF(oD1).rgb;
  3276. StpMF3 d2 = StpTaaPriFedF(oD2).rgb;
  3277. StpMF3 d3 = StpTaaPriFedF(oD3).rgb;
  3278. //------------------------------------------------------------------------------------------------------------------------------
  3279. // INDEPENDENT
  3280. //------------------------------------------------------------------------------------------------------------------------------
  3281. #if defined(STP_16BIT)
  3282. #else // defined(STP_16BIT)
  3283. wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
  3284. #endif // defined(STP_16BIT)
  3285. //------------------------------------------------------------------------------------------------------------------------------
  3286. StpMF4 wT = abs(c4R - StpMF4_(f.r)) * StpMF4_(STP_LUMA_R) +
  3287. abs(c4G - StpMF4_(f.g)) * StpMF4_(STP_LUMA_G) +
  3288. abs(c4B - StpMF4_(f.b)) * StpMF4_(STP_LUMA_B);
  3289. wT = StpPrxLoRcpMF4(wT * StpMF4_(STP_ANTI_MAX) + StpMF4_(STP_ANTI_MIN)) * triMask;
  3290. //------------------------------------------------------------------------------------------------------------------------------
  3291. #if defined(STP_16BIT)
  3292. #else // defined(STP_16BIT)
  3293. wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
  3294. #endif // defined(STP_16BIT)
  3295. //------------------------------------------------------------------------------------------------------------------------------
  3296. StpMF4 wM = wT * StpMF4_(0.5) + wG * StpMF4_(0.5);
  3297. #if defined(STP_16BIT)
  3298. #else // defined(STP_16BIT)
  3299. StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
  3300. #endif // defined(STP_16BIT)
  3301. cnv *= match;
  3302. //------------------------------------------------------------------------------------------------------------------------------
  3303. // DEPENDENT
  3304. //------------------------------------------------------------------------------------------------------------------------------
  3305. StpMF3 dG = d0 * StpMF3_(wG.x) + d1 * StpMF3_(wG.y) + d2 * StpMF3_(wG.z) + d3 * StpMF3_(wG.w);
  3306. StpMF3 dT = d0 * StpMF3_(wT.x) + d1 * StpMF3_(wT.y) + d2 * StpMF3_(wT.z) + d3 * StpMF3_(wT.w);
  3307. //------------------------------------------------------------------------------------------------------------------------------
  3308. #if defined(STP_16BIT)
  3309. #else // defined(STP_16BIT)
  3310. StpMF3 t = StpMF3(
  3311. c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
  3312. c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
  3313. c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
  3314. StpMF3 c = StpMF3(
  3315. c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
  3316. c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
  3317. c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
  3318. #endif // defined(STP_16BIT)
  3319. //------------------------------------------------------------------------------------------------------------------------------
  3320. StpMF1 bln = StpSatMF1(cnv * StpPrxLoRcpMF1(cnv + StpMF1_(1.0 / STP_FRAME_MAX)));
  3321. StpMF1 blnT = StpMF1_(1.0) - bln;
  3322. StpMF3 b = f * StpMF3_(bln) + t * StpMF3_(blnT);
  3323. StpMF3 minNe = min(c, b);
  3324. StpMF3 maxNe = max(c, b);
  3325. //------------------------------------------------------------------------------------------------------------------------------
  3326. StpMF3 penC = StpSatMF3(c + (f - dG) * StpMF3_(StpMF1_(0.9875) * match));
  3327. StpMF2 penWF;
  3328. penWF.x = pen * StpMF1_(STP_TAA_PEN_W);
  3329. penWF.y = pen * lerp(StpMF1_(STP_TAA_PEN_F0), StpMF1_(STP_TAA_PEN_F1), cnv);
  3330. StpMF2 penNotWF = StpMF2_(1.0) - penWF;
  3331. rF.rgb = t + (f - dT);
  3332. rF.rgb = rF.rgb * StpMF3_(blnT) + f * StpMF3_(bln);
  3333. rW.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.x) + penC * StpMF3_(penWF.x));
  3334. rF.rgb = StpSatMF3(rF.rgb * StpMF3_(penNotWF.y) + penC * StpMF3_(penWF.y));
  3335. rW.rgb = clamp(rW.rgb, minNe, maxNe);
  3336. rF.rgb = clamp(rF.rgb, minNe, maxNe);
  3337. //------------------------------------------------------------------------------------------------------------------------------
  3338. rW.rgb *= rW.rgb;
  3339. #if (STP_POSTMAP == 0)
  3340. StpToneInvMF3(rW.rgb);
  3341. #endif // (STP_POSTMAP == 0)
  3342. rF.a = rW.a = StpMF1(0.0); }
  3343. #endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_32BIT)
  3344. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3345. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3346. //_____________________________________________________________.._______________________________________________________________
  3347. //==============================================================================================================================
  3348. // 16-BIT PATH
  3349. //==============================================================================================================================
  3350. #if defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
  3351. // Callbacks: declared here without bodies and called by StpTaaH() below; 'p' is the StpF2 sample coordinate it passes in.
  3352. // Gather4 of GEAA control data (StpTaaH() scales its directional interpolation weights by this).
  3353. StpH4 StpTaaCtl4H(StpF2 p); // NOTE(review): the STP_BUG_BW_SOL block in StpTaaH() calls 'StpTaaCtl4RH' instead; confirm that name is declared elsewhere.
  3354. //------------------------------------------------------------------------------------------------------------------------------
  3355. // Current frame {color,anti} input.
  3356. // Gather4 specific channels (one callback each for R, G, B and A).
  3357. StpH4 StpTaaCol4RH(StpF2 p);
  3358. StpH4 StpTaaCol4GH(StpF2 p);
  3359. StpH4 StpTaaCol4BH(StpF2 p);
  3360. StpH4 StpTaaCol4AH(StpF2 p);
  3361. //------------------------------------------------------------------------------------------------------------------------------
  3362. // Bilinear sampling of low-frequency convergence (StpTaaH() subtracts one frame from the returned value).
  3363. StpH1 StpTaaConH(StpF2 p);
  3364. //------------------------------------------------------------------------------------------------------------------------------
  3365. // Dither value {0 to 1}; this should be output-pixel-frequency spatial-temporal blue noise. 'o' is the integer output pixel offset; StpTaaH() currently uses the result only for debug.
  3366. StpH1 StpTaaDitH(StpW2 o);
  3367. //------------------------------------------------------------------------------------------------------------------------------
  3368. // Gather4 of current frame motion {z,x,y} packed input, same as the 32-bit version (just renamed). StpTaaH() unpacks the selected value with StpMvUnpackV().
  3369. StpU4 StpTaaMot4H(StpF2 p);
  3370. //------------------------------------------------------------------------------------------------------------------------------
  3371. // Feedback {color, alpha}.
  3372. // Bilinear fetch with clamp to edge.
  3373. StpH4 StpTaaPriFedH(StpF2 p);
  3374. // Gather4 of a single feedback channel (one callback each for R, G and B).
  3375. StpH4 StpTaaPriFed4RH(StpF2 p);
  3376. StpH4 StpTaaPriFed4GH(StpF2 p);
  3377. StpH4 StpTaaPriFed4BH(StpF2 p);
  3378. // Min/max sampling used for dering (only declared when STP_MAX_MIN_10BIT is enabled; StpTaaH() uses the results as its fMax/fMin clamp bounds).
  3379. #if STP_MAX_MIN_10BIT
  3380. StpH4 StpTaaPriFedMaxH(StpF2 p);
  3381. StpH4 StpTaaPriFedMinH(StpF2 p);
  3382. #endif // STP_MAX_MIN_10BIT
  3383. // Sampling with offsets (only declared when STP_OFFSETS is enabled); 'o' is an integer offset applied to 'p', presumably in texels (TODO confirm).
  3384. #if STP_OFFSETS
  3385. StpH4 StpTaaPriFedOH(StpF2 p, StpI2 o);
  3386. StpH4 StpTaaPriFed4ROH(StpF2 p, StpI2 o);
  3387. StpH4 StpTaaPriFed4GOH(StpF2 p, StpI2 o);
  3388. StpH4 StpTaaPriFed4BOH(StpF2 p, StpI2 o);
  3389. #endif // STP_OFFSETS
  3390. //==============================================================================================================================
  3391. void StpTaaH(
  3392. StpW1 lane, // Currently unused but in the interface for possible future expansion.
  3393. StpW2 o, // Integer pixel offset in output.
  3394. out StpH4 rF, // Return Feedback (to be stored).
  3395. out StpH4 rW, // Return Output (to be stored).
  3396. StpU4 con0, // Constants generated by StpTaaCon().
  3397. StpU4 con1,
  3398. StpU4 con2,
  3399. StpU4 con3) {
  3400. //------------------------------------------------------------------------------------------------------------------------------
  3401. // This is only currently used for debug.
  3402. StpH1 dit = StpTaaDitH(o);
  3403. //------------------------------------------------------------------------------------------------------------------------------
  3404. // Rename constants.
  3405. StpF2 kCRcpF = StpF2_U2(con0.xy);
  3406. StpF2 kHalfCRcpFUnjitC = StpF2_U2(con0.zw);
  3407. StpF2 kRcpC = StpF2_U2(con1.xy);
  3408. StpF2 kRcpF = StpF2_U2(con1.zw);
  3409. StpF2 kHalfRcpF = StpF2_U2(con2.xy);
  3410. StpF2 kJitCRcpC0 = StpF2_U2(con2.zw);
  3411. StpF2 kHalfRcpC = StpF2_U2(con3.xy);
  3412. StpF2 kF = StpF2_U2(con3.zw);
  3413. //------------------------------------------------------------------------------------------------------------------------------
  3414. // Check the streaming bandwidth limit.
  3415. #if STP_BUG_BW_SOL
  3416. { StpF2 oo = StpF2(o) * kRcpF;
  3417. StpH4 g4 = StpTaaCtl4RH(oo);
  3418. StpU4 m4 = StpTaaMot4H(oo);
  3419. StpH1 cnv = StpTaaConH(oo);
  3420. StpH4 f = StpTaaPriFedH(oo);
  3421. StpH4 c4R = StpTaaCol4RH(oo);
  3422. rW = rF = l4 + g4 + StpH4(m4) + StpH4_(cnv) + f + c4R;
  3423. return; }
  3424. #endif // STP_BUG_BW_SOL
  3425. //------------------------------------------------------------------------------------------------------------------------------
  3426. // Locate 2x2 neighborhood.
  3427. // Float version of integer pixel offset in output.
  3428. // All the 'o' prefixed variables are offset (aka position/coordinate) related.
  3429. StpF2 oI = StpF2(o);
  3430. // This gets to the center of the 2x2 quad directly because of possibility of shader/tex precision mismatch.
  3431. // Precision mismatch could yield different 2x2 quads.
  3432. StpF2 oC = oI * kCRcpF + kHalfCRcpFUnjitC;
  3433. // NW of 2x2 quad.
  3434. StpF2 oCNW = floor(oC + StpF2_(-0.5));
  3435. // Center of the 2x2 quad.
  3436. StpF2 oC4 = oCNW * kRcpC + kRcpC;
  3437. // Coordinates for low frequency convergence.
  3438. StpF2 oC1 = oC * kRcpC;
  3439. //==============================================================================================================================
  3440. // FETCH {CONVERGENCE, COLOR, CONTROL, Z+MOTION}
  3441. //==============================================================================================================================
  3442. // Fetch low-frequency convergence.
  3443. StpH1 cnv = StpTaaConH(oC1);
  3444. // Fetch color.
  3445. StpH4 c4R = StpTaaCol4RH(oC4);
  3446. StpH4 c4G = StpTaaCol4GH(oC4);
  3447. StpH4 c4B = StpTaaCol4BH(oC4);
  3448. StpH4 c4A = StpTaaCol4AH(oC4);
  3449. // Control (GEAA weights)
  3450. StpH4 g4 = StpTaaCtl4H(oC4);
  3451. // Fetch {z,motion}.
  3452. StpU4 m4 = StpTaaMot4H(oC4);
  3453. //------------------------------------------------------------------------------------------------------------------------------
  3454. // INDEPENDENT
  3455. //------------------------------------------------------------------------------------------------------------------------------
  3456. // Setup resolve position {0 to 1} inside 2x2 quad.
  3457. // The extra -0.5 is to get from NW position to center.
  3458. StpH2 rP = StpH2(oC - oCNW) - StpH2_(0.5);
  3459. //------------------------------------------------------------------------------------------------------------------------------
  3460. // The 'rP' is resolve position {0 to 1} inside 2x2 quad, this is distance to ends of 2x2.
  3461. // Instead of using {a,a-1} this uses {a,1-a} for reuse with the simple angular filtering.
  3462. StpH2 rPX10 = StpH2(1.0, 0.0) + StpH2(-rP.x, rP.x);
  3463. StpH2 rPY01 = StpH2(0.0, 1.0) + StpH2(rP.y, -rP.y);
  3464. // Distance^2 {0 := on, 1 := off}.
  3465. StpH4 pen4x = StpH4(rPX10.g, rPX10.r, rPX10.r, rPX10.g);
  3466. StpH4 pen4y = StpH4(rPY01.g, rPY01.g, rPY01.r, rPY01.r);
  3467. // Pen starts with distance squared to all 2x2 points.
  3468. StpH4 pen4 = StpSatH4(pen4x * pen4x + pen4y * pen4y);
  3469. //==============================================================================================================================
  3470. // DEPENDENT ON {CONVERGENCE}
  3471. //==============================================================================================================================
  3472. // Low frequency convergence keeps the next frame value, so subtract one frame.
  3473. cnv = StpSatH1(cnv - StpH1_(1.0 / STP_FRAME_MAX));
  3474. //------------------------------------------------------------------------------------------------------------------------------
  3475. // Pen size based on convergence.
  3476. StpH1 pen = StpH1_(cnv) * StpH1_(STP_FRAME_MAX) + StpH1_(1.0);
  3477. pen = StpPrxLoSqrtH1(pen);
  3478. pen4 = StpSatH4(StpH4_(1.0) - pen4 * StpH4_(pen));
  3479. #if defined(STP_16BIT)
  3480. StpH2 pen2 = pen4.xy * pen4.xy + pen4.zw * pen4.zw;
  3481. pen = StpSatH1(pen2.x + pen2.y);
  3482. #else // defined(STP_16BIT)
  3483. pen = StpSatMF1(pen4.x * pen4.x + pen4.y * pen4.y + pen4.z * pen4.z + pen4.w * pen4.w);
  3484. #endif // defined(STP_16BIT)
  3485. //==============================================================================================================================
  3486. // DEPENDENT ON {COLOR}
  3487. //==============================================================================================================================
  3488. // Simple angular filtering (gets rid of block artifacts, adds sawtooth artifacts which are not a problem in practice).
  3489. // Create a GEAA based weighting for no temporal feedback case.
  3490. StpH4 wG;
  3491. // Selects between either (S) or (T).
  3492. // (S) A--B ... (T) A--B
  3493. // |\ | | /|
  3494. // | \| |/ |
  3495. // R--G R--G
  3496. // S and T only use the other diagonal.
  3497. // Exact luma not required.
  3498. StpH4 l4 = c4R + c4G * StpH4_(2.0) + c4B;
  3499. StpH2 difST = abs(l4.gr - l4.ab);
  3500. // Choose configuration based on which difference is maximum.
  3501. StpP1 useS = difST.x > difST.y;
  3502. // Choose interpolation weights given the configuration.
  3503. // _T__________ _S__________
  3504. // R | sat( -x+ y) min(1-x, y) = y-G
  3505. // G | min( x, y) sat(x-1+ y) = y-R
  3506. // B | sat( x- y) min( x,1-y) = (1-y)-A
  3507. // A | min(1-x,1-y) sat(1-x- y) = (1-y)-B
  3508. // Difference between S and T is a {x} vs {1-x} and a RGBA vs GRAB swap.
  3509. StpH2 wTrb = StpSatH2(StpH2(-rP.x, rP.x) + StpH2(rP.y, -rP.y));
  3510. StpH2 wSrb = min(rPX10, rPY01);
  3511. if(useS) wTrb = wSrb;
  3512. StpH2 wTga = rPY01 - wTrb;
  3513. wG.rg = StpH2(wTrb.x, wTga.x);
  3514. wG.ba = StpH2(wTrb.y, wTga.y);
  3515. // Shaping is needed to get good high area scaling (remove the transition region).
  3516. wG *= wG;
  3517. wG *= wG;
  3518. //------------------------------------------------------------------------------------------------------------------------------
  3519. // Scale directional interpolation weights by GEAA weights to introduce anti-aliasing.
  3520. wG *= g4;
  3521. // Triangular nearest.
  3522. // This works by removing the corner which contributes the least to the spatial interpolated result.
  3523. StpH4 triMask = StpH4_(1.0);
  3524. StpH2 wGmin2 = min(wG.xy, wG.zw);
  3525. //==============================================================================================================================
  3526. // DEPENDENT ON {Z,MOTION}
  3527. //==============================================================================================================================
  3528. // This overwrites gather4 results.
  3529. if(wGmin2.x < wGmin2.y) {
  3530. if(wG.x < wG.z) { triMask.x = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.x = 0xFFFFFFFF; }
  3531. else { triMask.z = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.z = 0xFFFFFFFF; } }
  3532. else {
  3533. if(wG.y < wG.w) { triMask.y = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.y = 0xFFFFFFFF; }
  3534. else { triMask.w = StpH1_(STP_TAA_TRI_MASK_AVOID); m4.w = 0xFFFFFFFF; } }
  3535. StpU1 m1 = min(StpMin3U1(m4.x, m4.y, m4.z), m4.w);
  3536. //------------------------------------------------------------------------------------------------------------------------------
  3537. // Want to consume 'triMask' to free up register space.
  3538. wG *= triMask;
  3539. //------------------------------------------------------------------------------------------------------------------------------
  3540. StpF2 mXY;
  3541. // Motion 'm' units are {1 := move by one screen}.
  3542. StpMvUnpackV(mXY, m1);
  3543. //==============================================================================================================================
  3544. // GET ALL FEEDBACK FILTERING DONE
  3545. //==============================================================================================================================
  3546. // This region of code will have the highest register pressure in some configs, so doing as early as possible.
  3547. // Setup for fetch feedback.
  3548. StpF2 oF = oI * kRcpF + kHalfRcpF - mXY;
  3549. //------------------------------------------------------------------------------------------------------------------------------
  3550. StpH3 f;
  3551. // Lanczos common.
  3552. #if STP_TAA_PRX_LANCZOS
  3553. // Motion reprojection position in feedback pixels.
  3554. StpF2 oM = oI + StpF2_(0.5) - mXY * kF;
  3555. // NW of center 2x2 quad.
  3556. StpF2 oMNW = floor(oM + StpF2_(-0.5));
  3557. // Center of the center 2x2 quad.
  3558. StpF2 oM4 = oMNW * kRcpF + kRcpF;
  3559. StpH3 fMax, fMin;
  3560. #else // STP_TAA_PRX_LANCZOS
  3561. // Sample nearest feedback.
  3562. f = StpTaaPriFedH(oF).rgb;
  3563. #endif // STP_TAA_PRX_LANCZOS
  3564. //==============================================================================================================================
  3565. #if (STP_TAA_PRX_LANCZOS == 1)
  3566. // This one does a fixed 1x4 to try to cut cost in half relative to the complete 4x4.
  3567. // It uses bilinear sampling on the 'x'.
  3568. // Lanczos on the 'y' because most floating camera motion is 'y' based.
  3569. // Fetch {feedback}.
  3570. #if STP_OFFSETS
  3571. // TODO: Can optimize out the 'oM4.y' add with constant change.
  3572. StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
  3573. StpH3 f0 = StpTaaPriFedH(oM0).rgb;
  3574. StpH3 f1 = StpTaaPriFedOH(oM0, StpI2(0, 1)).rgb;
  3575. StpH3 f2 = StpTaaPriFedOH(oM0, StpI2(0, 2)).rgb;
  3576. StpH3 f3 = StpTaaPriFedOH(oM0, StpI2(0, 3)).rgb;
  3577. #else // STP_OFFSETS
  3578. StpF2 oM0 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-1.5));
  3579. StpF2 oM1 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_(-0.5));
  3580. StpF2 oM2 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 0.5));
  3581. StpF2 oM3 = StpF2(oF.x, oM4.y + kRcpF.y * StpF1_( 1.5));
  3582. StpH3 f0 = StpTaaPriFedH(oM0).rgb;
  3583. StpH3 f1 = StpTaaPriFedH(oM1).rgb;
  3584. StpH3 f2 = StpTaaPriFedH(oM2).rgb;
  3585. StpH3 f3 = StpTaaPriFedH(oM3).rgb;
  3586. #endif // STP_OFFSETS
  3587. // Want this last because it's used last.
  3588. #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3589. fMax = StpTaaPriFedMaxH(oM4).rgb;
  3590. fMin = StpTaaPriFedMinH(oM4).rgb;
  3591. #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3592. #if ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
  3593. // Without {min,max} sampling, must gather4.
  3594. StpH4 f4R = StpTaaPriFed4RH(oM4);
  3595. StpH4 f4G = StpTaaPriFed4GH(oM4);
  3596. StpH4 f4B = StpTaaPriFed4BH(oM4);
  3597. #endif // ((STP_MAX_MIN_10BIT == 0) && STP_TAA_PRX_LANCZOS_DERING)
  3598. //------------------------------------------------------------------------------------------------------------------------------
  3599. // INDEPENDENT
  3600. //------------------------------------------------------------------------------------------------------------------------------
  3601. // Convert to approximate lanczos weights.
  3602. // Feedback position {0 to 1} inside 2x2 quad + 0.5.
  3603. StpH2 fP = StpH2(oM - oMNW);
  3604. // Convert to approximate lanczos weights.
  3605. // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}.
  3606. StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
  3607. // Weights in one axis.
  3608. fPY = StpSatH4(StpH4_(1.0) - fPY * fPY);
  3609. fPY *= fPY;
  3610. StpH4 fPY4 = fPY * fPY;
  3611. // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive)
  3612. fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY;
  3613. #if defined(STP_16BIT)
  3614. StpH2 fRcp2 = fPY.rg + fPY.ba;
  3615. StpH1 fRcp = StpPrxLoRcpH1(fRcp2.x + fRcp2.y);
  3616. #else // defined(STP_16BIT)
  3617. StpMF1 fRcp = StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a);
  3618. #endif // defined(STP_16BIT)
  3619. //------------------------------------------------------------------------------------------------------------------------------
  3620. // DEPENDENT
  3621. //------------------------------------------------------------------------------------------------------------------------------
  3622. f.rgb = f0 * StpH3_(fPY.r) + f1 * StpH3_(fPY.g) + f2 * StpH3_(fPY.b) + f3 * StpH3_(fPY.a);
  3623. f.rgb *= StpH3_(fRcp);
  3624. #if STP_TAA_PRX_LANCZOS_DERING
  3625. #if (STP_MAX_MIN_10BIT == 0)
  3626. #if defined(STP_16BIT)
  3627. StpH2 fXnyR = max(max(StpH2(f4R.x, -f4R.x), StpH2(f4R.y, -f4R.y)),
  3628. max(StpH2(f4R.z, -f4R.z), StpH2(f4R.w, -f4R.w)));
  3629. StpH2 fXnyG = max(max(StpH2(f4G.x, -f4G.x), StpH2(f4G.y, -f4G.y)),
  3630. max(StpH2(f4G.z, -f4G.z), StpH2(f4G.w, -f4G.w)));
  3631. StpH2 fXnyB = max(max(StpH2(f4B.x, -f4B.x), StpH2(f4B.y, -f4B.y)),
  3632. max(StpH2(f4B.z, -f4B.z), StpH2(f4B.w, -f4B.w)));
  3633. f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x));
  3634. #else // defined(STP_16BIT)
  3635. fMax.r = max(StpMax3H1(f4R.x, f4R.y, f4R.z), f4R.w);
  3636. fMax.g = max(StpMax3H1(f4G.x, f4G.y, f4G.z), f4G.w);
  3637. fMax.b = max(StpMax3H1(f4B.x, f4B.y, f4B.z), f4B.w);
  3638. fMin.r = min(StpMin3H1(f4R.x, f4R.y, f4R.z), f4R.w);
  3639. fMin.g = min(StpMin3H1(f4G.x, f4G.y, f4G.z), f4G.w);
  3640. fMin.b = min(StpMin3H1(f4B.x, f4B.y, f4B.z), f4B.w);
  3641. f = clamp(f, fMin, fMax);
  3642. #endif // defined(STP_16BIT)
  3643. #else // (STP_MAX_MIN_10BIT == 0)
  3644. // Leaning on {min,max} sampling so no 16/32-bit permutation.
  3645. f = clamp(f, fMin, fMax);
  3646. #endif // (STP_MAX_MIN_10BIT == 0)
  3647. #endif // STP_TAA_PRX_LANCZOS_DERING
  3648. #endif // (STP_TAA_PRX_LANCZOS == 1)
  3649. //==============================================================================================================================
  3650. #if (STP_TAA_PRX_LANCZOS == 2)
  3651. // Unstable approximate lanczos feedback, full 4x4.
  3652. // a = saturate(1-x*x)
  3653. // u = 1+v
  3654. // v = moves the zero crossing to 0.5
  3655. // w = adjusts the shape
  3656. // u*a^w - v*a^2
  3657. // Fetch {feedback}.
  3658. // 0w 0z 1w 1z | R
  3659. // 0x 0y 1x 1y | G
  3660. // 2w 2z 3w 3z | B
  3661. // 2x 2y 3x 3y | A
  3662. // -- -- -- --
  3663. // R G B A
  3664. #if STP_OFFSETS
  3665. StpH4 f4R0 = StpTaaPriFed4ROH(oM4, StpI2(-1, -1));
  3666. StpH4 f4G0 = StpTaaPriFed4GOH(oM4, StpI2(-1, -1));
  3667. StpH4 f4B0 = StpTaaPriFed4BOH(oM4, StpI2(-1, -1));
  3668. StpH4 f4R1 = StpTaaPriFed4ROH(oM4, StpI2( 1, -1));
  3669. StpH4 f4G1 = StpTaaPriFed4GOH(oM4, StpI2( 1, -1));
  3670. StpH4 f4B1 = StpTaaPriFed4BOH(oM4, StpI2( 1, -1));
  3671. StpH4 f4R2 = StpTaaPriFed4ROH(oM4, StpI2(-1, 1));
  3672. StpH4 f4G2 = StpTaaPriFed4GOH(oM4, StpI2(-1, 1));
  3673. StpH4 f4B2 = StpTaaPriFed4BOH(oM4, StpI2(-1, 1));
  3674. StpH4 f4R3 = StpTaaPriFed4ROH(oM4, StpI2( 1, 1));
  3675. StpH4 f4G3 = StpTaaPriFed4GOH(oM4, StpI2( 1, 1));
  3676. StpH4 f4B3 = StpTaaPriFed4BOH(oM4, StpI2( 1, 1));
  3677. #else // STP_OFFSETS
  3678. StpF2 oM0 = oM4 + StpF2(-kRcpF.x, -kRcpF.y);
  3679. StpF2 oM1 = oM4 + StpF2( kRcpF.x, -kRcpF.y);
  3680. StpF2 oM2 = oM4 + StpF2(-kRcpF.x, kRcpF.y);
  3681. StpF2 oM3 = oM4 + StpF2( kRcpF.x, kRcpF.y);
  3682. StpH4 f4R0 = StpTaaPriFed4RH(oM0);
  3683. StpH4 f4G0 = StpTaaPriFed4GH(oM0);
  3684. StpH4 f4B0 = StpTaaPriFed4BH(oM0);
  3685. StpH4 f4R1 = StpTaaPriFed4RH(oM1);
  3686. StpH4 f4G1 = StpTaaPriFed4GH(oM1);
  3687. StpH4 f4B1 = StpTaaPriFed4BH(oM1);
  3688. StpH4 f4R2 = StpTaaPriFed4RH(oM2);
  3689. StpH4 f4G2 = StpTaaPriFed4GH(oM2);
  3690. StpH4 f4B2 = StpTaaPriFed4BH(oM2);
  3691. StpH4 f4R3 = StpTaaPriFed4RH(oM3);
  3692. StpH4 f4G3 = StpTaaPriFed4GH(oM3);
  3693. StpH4 f4B3 = StpTaaPriFed4BH(oM3);
  3694. #endif // STP_OFFSETS
  3695. // Want this last because it's used last.
  3696. #if (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3697. fMax = StpTaaPriFedMaxH(oM4).rgb;
  3698. fMin = StpTaaPriFedMinH(oM4).rgb;
  3699. #endif // (STP_MAX_MIN_10BIT && STP_TAA_PRX_LANCZOS_DERING)
  3700. //------------------------------------------------------------------------------------------------------------------------------
  3701. // INDEPENDENT
  3702. //------------------------------------------------------------------------------------------------------------------------------
  3703. // Feedback position {0 to 1} inside 2x2 quad + 0.5.
  3704. StpH2 fP = StpH2(oM - oMNW);
  3705. // Convert to approximate lanczos weights.
  3706. // This converts {-2 to 2} to {-1 to 1} because the kernel approximation is written for {-1 to 1}.
  3707. StpH4 fPX = StpH4_(-fP.x * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
  3708. StpH4 fPY = StpH4_(-fP.y * StpH1_(0.5)) + StpH4(-0.5 * 0.5, 0.5 * 0.5, 1.5 * 0.5, 2.5 * 0.5);
  3709. // Weights in both axis.
  3710. fPX = StpSatH4(StpH4_(1.0) - fPX * fPX);
  3711. fPY = StpSatH4(StpH4_(1.0) - fPY * fPY);
  3712. fPX *= fPX;
  3713. fPY *= fPY;
  3714. StpH4 fPX4 = fPX * fPX;
  3715. StpH4 fPY4 = fPY * fPY;
  3716. // ^6 (slightly more negative lobe than lanczos 2, slightly less expensive)
  3717. fPX = (StpH4_(1.0 + 81.0 / 175.0) * fPX4 - StpH4_(81.0 / 175.0)) * fPX;
  3718. fPY = (StpH4_(1.0 + 81.0 / 175.0) * fPY4 - StpH4_(81.0 / 175.0)) * fPY;
  3719. #if defined(STP_16BIT)
  3720. StpH2 fRcpX = fPX.rg + fPX.ba;
  3721. StpH2 fRcpY = fPY.rg + fPY.ba;
  3722. fPX *= StpH4_(StpPrxLoRcpH1(fRcpX.r + fRcpX.y));
  3723. fPY *= StpH4_(StpPrxLoRcpH1(fRcpY.r + fRcpY.y));
  3724. #else // defined(STP_16BIT)
  3725. fPX *= StpMF4_(StpPrxLoRcpMF1(fPX.r + fPX.g + fPX.b + fPX.a));
  3726. fPY *= StpMF4_(StpPrxLoRcpMF1(fPY.r + fPY.g + fPY.b + fPY.a));
  3727. #endif // defined(STP_16BIT)
  3728. StpH4 fPX0 = fPX * StpH4_(fPY.r);
  3729. StpH4 fPX1 = fPX * StpH4_(fPY.g);
  3730. StpH4 fPX2 = fPX * StpH4_(fPY.b);
  3731. StpH4 fPX3 = fPX * StpH4_(fPY.a);
  3732. //------------------------------------------------------------------------------------------------------------------------------
  3733. // DEPENDENT
  3734. //------------------------------------------------------------------------------------------------------------------------------
  3735. #if defined(STP_16BIT)
  3736. StpH2 fR2 = f4R0.wz * fPX0.xy + f4R1.wz * fPX0.zw + f4R0.xy * fPX1.xy + f4R1.xy * fPX1.zw +
  3737. f4R2.wz * fPX2.xy + f4R3.wz * fPX2.zw + f4R2.xy * fPX3.xy + f4R3.xy * fPX3.zw;
  3738. StpH2 fG2 = f4G0.wz * fPX0.xy + f4G1.wz * fPX0.zw + f4G0.xy * fPX1.xy + f4G1.xy * fPX1.zw +
  3739. f4G2.wz * fPX2.xy + f4G3.wz * fPX2.zw + f4G2.xy * fPX3.xy + f4G3.xy * fPX3.zw;
  3740. StpH2 fB2 = f4B0.wz * fPX0.xy + f4B1.wz * fPX0.zw + f4B0.xy * fPX1.xy + f4B1.xy * fPX1.zw +
  3741. f4B2.wz * fPX2.xy + f4B3.wz * fPX2.zw + f4B2.xy * fPX3.xy + f4B3.xy * fPX3.zw;
  3742. f = StpH3(fR2.x + fR2.y, fG2.x + fG2.y, fB2.x + fB2.y);
  3743. #else // defined(STP_16BIT)
  3744. f.r = f4R0.w * fPX0.r + f4R0.z * fPX0.g + f4R1.w * fPX0.b + f4R1.z * fPX0.a +
  3745. f4R0.x * fPX1.r + f4R0.y * fPX1.g + f4R1.x * fPX1.b + f4R1.y * fPX1.a +
  3746. f4R2.w * fPX2.r + f4R2.z * fPX2.g + f4R3.w * fPX2.b + f4R3.z * fPX2.a +
  3747. f4R2.x * fPX3.r + f4R2.y * fPX3.g + f4R3.x * fPX3.b + f4R3.y * fPX3.a;
  3748. f.g = f4G0.w * fPX0.r + f4G0.z * fPX0.g + f4G1.w * fPX0.b + f4G1.z * fPX0.a +
  3749. f4G0.x * fPX1.r + f4G0.y * fPX1.g + f4G1.x * fPX1.b + f4G1.y * fPX1.a +
  3750. f4G2.w * fPX2.r + f4G2.z * fPX2.g + f4G3.w * fPX2.b + f4G3.z * fPX2.a +
  3751. f4G2.x * fPX3.r + f4G2.y * fPX3.g + f4G3.x * fPX3.b + f4G3.y * fPX3.a;
  3752. f.b = f4B0.w * fPX0.r + f4B0.z * fPX0.g + f4B1.w * fPX0.b + f4B1.z * fPX0.a +
  3753. f4B0.x * fPX1.r + f4B0.y * fPX1.g + f4B1.x * fPX1.b + f4B1.y * fPX1.a +
  3754. f4B2.w * fPX2.r + f4B2.z * fPX2.g + f4B3.w * fPX2.b + f4B3.z * fPX2.a +
  3755. f4B2.x * fPX3.r + f4B2.y * fPX3.g + f4B3.x * fPX3.b + f4B3.y * fPX3.a;
  3756. #endif // defined(STP_16BIT)
  3757. #if STP_TAA_PRX_LANCZOS_DERING
  3758. #if (STP_MAX_MIN_10BIT == 0)
  3759. #if defined(STP_16BIT)
  3760. StpH2 fXnyR = max(max(StpH2(f4R0.y, -f4R0.y), StpH2(f4R1.x, -f4R1.x)),
  3761. max(StpH2(f4R2.z, -f4R2.z), StpH2(f4R3.w, -f4R3.w)));
  3762. StpH2 fXnyG = max(max(StpH2(f4G0.y, -f4G0.y), StpH2(f4G1.x, -f4G1.x)),
  3763. max(StpH2(f4G2.z, -f4G2.z), StpH2(f4G3.w, -f4G3.w)));
  3764. StpH2 fXnyB = max(max(StpH2(f4B0.y, -f4B0.y), StpH2(f4B1.x, -f4B1.x)),
  3765. max(StpH2(f4B2.z, -f4B2.z), StpH2(f4B3.w, -f4B3.w)));
  3766. f = clamp(f, StpH3(-fXnyR.y, -fXnyG.y, -fXnyB.y), StpH3(fXnyR.x, fXnyG.x, fXnyB.x));
  3767. #else // defined(STP_16BIT)
  3768. fMax.r = max(StpMax3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
  3769. fMax.g = max(StpMax3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
  3770. fMax.b = max(StpMax3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
  3771. fMin.r = min(StpMin3H1(f4R0.y, f4R1.x, f4R2.z), f4R3.w);
  3772. fMin.g = min(StpMin3H1(f4G0.y, f4G1.x, f4G2.z), f4G3.w);
  3773. fMin.b = min(StpMin3H1(f4B0.y, f4B1.x, f4B2.z), f4B3.w);
  3774. f = clamp(f, fMin, fMax);
  3775. #endif // defined(STP_16BIT)
  3776. #else // (STP_MAX_MIN_10BIT == 0)
  3777. // Leaning on {min,max} sampling so no 16/32-bit permutation.
  3778. f = clamp(f, fMin, fMax);
  3779. #endif // (STP_MAX_MIN_10BIT == 0)
  3780. #endif // STP_TAA_PRX_LANCZOS_DERING
  3781. #endif // (STP_TAA_PRX_LANCZOS == 2)
  3782. //==============================================================================================================================
  3783. // DISPLACEMENT
  3784. //==============================================================================================================================
  3785. // Note the 'kJitCRcpC0' gets to position 0 to save some runtime maths.
  3786. // 3 2
  3787. // 0 1
  3788. StpF2 oD0 = oC4 + kJitCRcpC0 - mXY;
  3789. StpF2 oD1 = StpF2(kRcpC.x, 0.0) + oD0;
  3790. StpF2 oD2 = StpF2(kRcpC.x, -kRcpC.y) + oD0;
  3791. StpF2 oD3 = StpF2(0.0, -kRcpC.y) + oD0;
  3792. StpH3 d0 = StpTaaPriFedH(oD0).rgb;
  3793. StpH3 d1 = StpTaaPriFedH(oD1).rgb;
  3794. StpH3 d2 = StpTaaPriFedH(oD2).rgb;
  3795. StpH3 d3 = StpTaaPriFedH(oD3).rgb;
  3796. //------------------------------------------------------------------------------------------------------------------------------
  3797. // INDEPENDENT
  3798. //------------------------------------------------------------------------------------------------------------------------------
  3799. // Normalize interpolation weights.
  3800. #if defined(STP_16BIT)
  3801. StpH2 wG2 = wG.xy + wG.zw;
  3802. wG = StpSatH4(wG * StpH4_(StpPrxLoRcpH1(wG2.x + wG2.y)));
  3803. #else // defined(STP_16BIT)
  3804. wG = StpSatMF4(wG * StpMF4_(StpPrxLoRcpMF1(wG.x + wG.y + wG.z + wG.w)));
  3805. #endif // defined(STP_16BIT)
  3806. //------------------------------------------------------------------------------------------------------------------------------
  3807. // Temporal weighting.
  3808. StpH4 wT = abs(c4R - StpH4_(f.r)) * StpH4_(STP_LUMA_R) +
  3809. abs(c4G - StpH4_(f.g)) * StpH4_(STP_LUMA_G) +
  3810. abs(c4B - StpH4_(f.b)) * StpH4_(STP_LUMA_B);
  3811. wT = StpPrxLoRcpH4(wT * StpH4_(STP_ANTI_MAX) + StpH4_(STP_ANTI_MIN)) * triMask;
  3812. //------------------------------------------------------------------------------------------------------------------------------
  3813. #if defined(STP_16BIT)
  3814. StpH2 wT2 = wT.xy + wT.zw;
  3815. wT = StpSatH4(wT * StpH4_(StpPrxLoRcpH1(wT2.x + wT2.y)));
  3816. #else // defined(STP_16BIT)
  3817. wT = StpSatMF4(wT * StpMF4_(StpPrxLoRcpMF1(wT.x + wT.y + wT.z + wT.w)));
  3818. #endif // defined(STP_16BIT)
  3819. //------------------------------------------------------------------------------------------------------------------------------
  3820. // Interpolate match.
  3821. // Using a fixed 50/50 split of two normalized weights yields a normalized weight.
  3822. StpH4 wM = wT * StpH4_(0.5) + wG * StpH4_(0.5);
  3823. #if defined(STP_16BIT)
  3824. StpH2 match2 = (c4A.xy * wM.xy) + (c4A.zw * wM.zw);
  3825. StpH1 match = match2.x + match2.y;
  3826. #else // defined(STP_16BIT)
  3827. StpMF1 match = c4A.x * wM.x + c4A.y * wM.y + c4A.z * wM.z + c4A.w * wM.w;
  3828. #endif // defined(STP_16BIT)
  3829. // Non-motion-match kills convergence for this frame only.
  3830. cnv *= match;
  3831. //------------------------------------------------------------------------------------------------------------------------------
  3832. // DEPENDENT
  3833. //------------------------------------------------------------------------------------------------------------------------------
  3834. // Interpolation, this first section doesn't have gather4, so probably no gain in swizzling.
  3835. StpH3 dG = d0 * StpH3_(wG.x) + d1 * StpH3_(wG.y) + d2 * StpH3_(wG.z) + d3 * StpH3_(wG.w);
  3836. StpH3 dT = d0 * StpH3_(wT.x) + d1 * StpH3_(wT.y) + d2 * StpH3_(wT.z) + d3 * StpH3_(wT.w);
  3837. //------------------------------------------------------------------------------------------------------------------------------
  3838. #if defined(STP_16BIT)
  3839. StpH2 t2R = (c4R.xy * wT.xy) + (c4R.zw * wT.zw);
  3840. StpH2 t2G = (c4G.xy * wT.xy) + (c4G.zw * wT.zw);
  3841. StpH2 t2B = (c4B.xy * wT.xy) + (c4B.zw * wT.zw);
  3842. StpH3 t = StpH3(t2R.x + t2R.y, t2G.x + t2G.y, t2B.x + t2B.y);
  3843. StpH2 c2R = (c4R.xy * wG.xy) + (c4R.zw * wG.zw);
  3844. StpH2 c2G = (c4G.xy * wG.xy) + (c4G.zw * wG.zw);
  3845. StpH2 c2B = (c4B.xy * wG.xy) + (c4B.zw * wG.zw);
  3846. StpH3 c = StpH3(c2R.x + c2R.y, c2G.x + c2G.y, c2B.x + c2B.y);
  3847. #else // defined(STP_16BIT)
  3848. StpMF3 t = StpMF3(
  3849. c4R.x * wT.x + c4R.y * wT.y + c4R.z * wT.z + c4R.w * wT.w,
  3850. c4G.x * wT.x + c4G.y * wT.y + c4G.z * wT.z + c4G.w * wT.w,
  3851. c4B.x * wT.x + c4B.y * wT.y + c4B.z * wT.z + c4B.w * wT.w);
  3852. StpMF3 c = StpMF3(
  3853. c4R.x * wG.x + c4R.y * wG.y + c4R.z * wG.z + c4R.w * wG.w,
  3854. c4G.x * wG.x + c4G.y * wG.y + c4G.z * wG.z + c4G.w * wG.w,
  3855. c4B.x * wG.x + c4B.y * wG.y + c4B.z * wG.z + c4B.w * wG.w);
  3856. #endif // defined(STP_16BIT)
  3857. //------------------------------------------------------------------------------------------------------------------------------
  3858. // Neighborhood.
  3859. StpH1 bln = StpSatH1(cnv * StpPrxLoRcpH1(cnv + StpH1_(1.0 / STP_FRAME_MAX)));
  3860. StpH1 blnT = StpH1_(1.0) - bln;
  3861. StpH3 b = f * StpH3_(bln) + t * StpH3_(blnT);
  3862. StpH3 minNe = min(c, b);
  3863. StpH3 maxNe = max(c, b);
  3864. //------------------------------------------------------------------------------------------------------------------------------
  3865. // Apply pen.
  3866. StpH3 penC = StpSatH3(c + (f - dG) * StpH3_(StpH1_(0.9875) * match));
  3867. StpH2 penWF;
  3868. penWF.x = pen * StpH1_(STP_TAA_PEN_W);
  3869. penWF.y = pen * lerp(StpH1_(STP_TAA_PEN_F0), StpH1_(STP_TAA_PEN_F1), cnv);
  3870. StpH2 penNotWF = StpH2_(1.0) - penWF;
  3871. rF.rgb = t + (f - dT);
  3872. rF.rgb = rF.rgb * StpH3_(blnT) + f * StpH3_(bln);
  3873. rW.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.x) + penC * StpH3_(penWF.x));
  3874. rF.rgb = StpSatH3(rF.rgb * StpH3_(penNotWF.y) + penC * StpH3_(penWF.y));
  3875. rW.rgb = clamp(rW.rgb, minNe, maxNe);
  3876. rF.rgb = clamp(rF.rgb, minNe, maxNe);
  3877. //------------------------------------------------------------------------------------------------------------------------------
  3878. // Get back into linear, and then HDR.
  3879. rW.rgb *= rW.rgb;
  3880. #if (STP_POSTMAP == 0)
  3881. StpToneInvH3(rW.rgb);
  3882. #endif // (STP_POSTMAP == 0)
  3883. // Alpha is currently unused, this might improve compression (vs undefined).
  3884. rF.a = rW.a = StpH1(0.0); }
  3885. #endif // defined(STP_GPU) && defined(STP_TAA) && defined(STP_16BIT)
  3886. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3887. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3888. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3889. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3890. //_____________________________________________________________.._______________________________________________________________
  3891. //==============================================================================================================================
  3892. //
  3893. // GOOD ENOUGH ANTI-ALIASING [GEAA]
  3894. //
  3895. //------------------------------------------------------------------------------------------------------------------------------
  3896. // Yet another simplified spatial morphological AA.
  3897. // Not perfect, but it has low complexity (one pass), and is good enough for a TAA override.
  3898. // Fails on longer edges (due to low maximum search), doesn't get diagonals perfect.
  3899. // But it works well on inputs that are already partially anti-aliased.
  3900. // The spatial AA is not used in STP, only a weighting value which is later used to guide a quick-and-dirty scaler.
  3901. // With some modification this could be used for spatial AA, with or without scaling.
  3902. //------------------------------------------------------------------------------------------------------------------------------
  3903. // CALLBACKS
  3904. // =========
  3905. // StpMF4 StpGeaa4F(StpF2 p) - Gather4 of luma (or green as luma).
  3906. // ---------
  3907. // StpH4 StpGeaa4H(StpF2 p)
  3908. //==============================================================================================================================
  3909. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3910. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3911. //_____________________________________________________________.._______________________________________________________________
  3912. //==============================================================================================================================
  3913. // [GEAA] DEFAULTS
  3914. //==============================================================================================================================
  3915. // Choose a configuration of number of positions to sample.
  3916. // 0 ... 3 per side (faster, less quality)
  3917. // 1 ... 5 per side
  3918. // 2 ... 7 per side
  3919. // 3 ... 9 per side (slower, higher quality)
        // Overridable: only defined here when the includer has not already defined it (default 3).
        // Only the values {0,1,2,3} are usable: StpGeaaF() declares its search distance 'gDst2' solely under
        // '#if (STP_GEAA_P == N)' for those four values, so any other value leaves 'gDst2' undeclared.
  3920. #ifndef STP_GEAA_P
  3921. #define STP_GEAA_P 3
  3922. #endif // STP_GEAA_P
  3923. //------------------------------------------------------------------------------------------------------------------------------
  3924. // Amount of sub-pixel blur.
  3925. // 0.50 ... Turn it off
  3926. // 0.25 ... Middle ground
  3927. // 0.00 ... More blur
        // Overridable: only defined here when the includer has not already defined it (default 8/16 = 0.50).
        // StpGeaaF() subtracts this value from the averaged end-of-edge distance ('gLoSub') before deriving the low-pass tap weights.
  3928. #ifndef STP_GEAA_SUBPIX
  3929. #define STP_GEAA_SUBPIX (8.0 / 16.0)
  3930. #endif // STP_GEAA_SUBPIX
  3931. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3932. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3933. //_____________________________________________________________.._______________________________________________________________
  3934. //==============================================================================================================================
  3935. // [GEAA] INTERNAL TUNING
  3936. //==============================================================================================================================
  3937. // Higher numbers can reduce the amount of AA, lower numbers can increase it but can look dirty.
  3938. // Best not to mess with this, 1/3 is the 'correct' value for 2 of the 3 edge cases.
        // StpGeaaF() multiplies the absolute center-to-neighbor luma difference ('gAbsBMinusE') by this value and uses the
        // approximate reciprocal of the product as the scale applied when testing search taps for the end of the edge.
        // Unlike the defaults above, this define has no '#ifndef' guard.
  3939. #define STP_GEAA_THRESHOLD (1.0/3.0)
  3940. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3941. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  3942. //_____________________________________________________________.._______________________________________________________________
  3943. //==============================================================================================================================
  3944. // [GEAA] 32-BIT ENTRY POINT
  3945. //==============================================================================================================================
  3946. // See the 16-bit version for all comments.
  3947. #if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
        // 32-bit GEAA entry point (the packed 16-bit twin is StpGeaaH()).
        // Outputs,
        //  gW ........ Weight computed from 'gFix' at the end of the function, never less than 1/255.
        //  gLuma ..... lerp(E, B, gOff), E being the center luma and B the higher-contrast across-edge neighbor.
        //  gFilter ... p + gDecon * gOff.
        //  gDilate ... p + gDecon (one step across the edge, towards the higher-contrast neighbor).
        // Inputs (same meaning as documented on StpGeaaH()),
        //  p ........... {0 to 1} position across screen.
        //  kRcpI ....... 1.0 / input image size in pixels.
        //  kHalfRcpI ... 0.5 / input image size in pixels.
        // Luma is read only through the StpGeaa4F() / StpGeaa4OF() gather4 callbacks.
  3948. void StpGeaaF(
  3949. out StpMF1 gW, out StpMF1 gLuma, out StpF2 gFilter, out StpF2 gDilate, StpF2 p, StpF2 kRcpI, StpF2 kHalfRcpI) {
  3950. //------------------------------------------------------------------------------------------------------------------------------
        // Fetch the 3x3 luma neighborhood with four gather4s, each variable name spells the texels it holds in .xyzw order.
        //  A B C
        //  D E F
        //  G H I
  3951. #if STP_OFFSETS
  3952. StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
  3953. StpMF4 gDEBA = StpGeaa4F(pDEBA);
  3954. StpMF4 gEFCB = StpGeaa4OF(pDEBA, StpI2(1, 0));
  3955. StpMF4 gGHED = StpGeaa4OF(pDEBA, StpI2(0, 1));
  3956. StpMF4 gHIFE = StpGeaa4OF(pDEBA, StpI2(1, 1));
  3957. #else // STP_OFFSETS
  3958. StpMF4 gDEBA = StpGeaa4F(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y));
  3959. StpMF4 gEFCB = StpGeaa4F(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y));
  3960. StpMF4 gGHED = StpGeaa4F(p + StpF2(-kHalfRcpI.x, kHalfRcpI.y));
  3961. StpMF4 gHIFE = StpGeaa4F(p + StpF2( kHalfRcpI.x, kHalfRcpI.y));
  3962. #endif // STP_OFFSETS
  3963. //------------------------------------------------------------------------------------------------------------------------------
        // Second differences of the 3x3, .x along rows and .y along columns,
        //  gHV0 = {A-2B+C, A-2D+G}
        //  gHV1 = {D-2E+F, B-2E+H}
        //  gHV2 = {G-2H+I, C-2F+I}
        // The active path sums their squares with the center (gHV1) term weighted double.
  3964. StpMF2 gHV0,gHV1,gHV2;
  3965. gHV0.x = gDEBA.z * StpMF1_(-2.0) + gEFCB.z;
  3966. gHV0.y = gDEBA.x * StpMF1_(-2.0) + gGHED.x;
  3967. gHV0 += StpMF2_(gDEBA.w);
  3968. gHV1.x = gDEBA.x + gEFCB.y;
  3969. gHV1.y = gDEBA.z + gGHED.y;
  3970. gHV1 += StpMF2_(gDEBA.y) * StpMF2_(-2.0);
  3971. gHV2.x = gGHED.x + gGHED.y * StpMF1_(-2.0);
  3972. gHV2.y = gEFCB.z + gEFCB.y * StpMF1_(-2.0);
  3973. gHV2 += StpMF2_(gHIFE.y);
  3974. #if 0
  3975. StpMF2 gHV = abs(gHV0) + abs(gHV1) * StpMF2_(2.0) + abs(gHV2);
  3976. #else
  3977. StpMF2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpMF2_(2.0) + gHV2 * gHV2;
  3978. #endif
  3979. StpP1 gVert = gHV.x > gHV.y; // More change along rows than along columns selects the vertical search (gWalk steps in y).
  3980. //------------------------------------------------------------------------------------------------------------------------------
        // Pick neighbors relative to the search direction. Variable names follow the horizontal-search layout.
        // For the vertical search the 3x3 is read transposed: 'B,H' are D,F. 'A,C' are A,G. 'D,F' are B,H. 'G,I' are C,I.
  3981. StpMF2 gBH = gVert ? StpMF2(gDEBA.x, gEFCB.y) : StpMF2(gDEBA.z, gGHED.y);
  3982. StpMF2 gAC = gVert ? StpMF2(gDEBA.w, gGHED.x) : StpMF2(gDEBA.w, gEFCB.z);
  3983. StpMF2 gDF = gVert ? StpMF2(gDEBA.z, gGHED.y) : StpMF2(gDEBA.x, gEFCB.y);
  3984. StpMF2 gGI = gVert ? StpMF2(gEFCB.z, gHIFE.y) : StpMF2(gGHED.x, gHIFE.y); // Transposed 'G' is C (gEFCB.z), the texel next to B.
  3985. StpMF2 gBHMinusE = gBH - StpMF2_(gDEBA.y);
  3986. StpMF2 gEnd2 = abs(gBHMinusE);
  3987. StpP1 gUp = gEnd2.x >= gEnd2.y; // True when the first across-edge neighbor differs from E at least as much as the second.
  3988. //------------------------------------------------------------------------------------------------------------------------------
  3989. StpMF1 gE = gDEBA.y;
  3990. gBH = gUp ? gBH : gBH.yx; // From here on gBH.x is the higher-contrast neighbor, gBH.y the other one.
  3991. //------------------------------------------------------------------------------------------------------------------------------
        // gBi holds the blend weights later applied to the .yx and .zw texel pairs of each search gather.
        // gBi0 is the distance-0 blended tap: 2/3 of the along-walk neighbors plus 1/3 of the corners on the higher-contrast side.
  3992. StpMF2 gBi = gUp ? StpMF2(2.0 / 3.0, 1.0 / 3.0) : StpMF2(1.0 / 3.0 , 2.0 / 3.0);
  3993. StpMF1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
  3994. StpMF2 gBi0 = (gUp ? gAC : gGI) * StpMF2_(1.0 / 3.0) + gDF * StpMF2_(2.0 / 3.0);
  3995. StpMF2 gLo0 = gDF;
  3996. StpMF1 gAbsBMinusE = abs(gBMinusE);
  3997. StpMF1 gNe = gAbsBMinusE; // Not read again in this function.
  3998. StpMF1 gGood = StpGtZeroMF1(gBMinusE); // Not read again in this function.
  3999. //------------------------------------------------------------------------------------------------------------------------------
        // gWalk steps along the search direction, gDecon steps across it (sign flipped so it points at the higher-contrast neighbor).
  4000. StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
  4001. StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
  4002. if(gUp) gDecon = -gDecon;
  4003. //------------------------------------------------------------------------------------------------------------------------------
  4004. StpF2 gP = p + gDecon * StpF2_(1.0/3.0); // Search line sits 1/3 of a step across the edge.
  4005. //------------------------------------------------------------------------------------------------------------------------------
        // Gather positions at {2.5, 4.5, 6.5, 8.5} steps along the walk, negative (N) and positive (P) side.
  4006. StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
  4007. StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
  4008. StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
  4009. StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
  4010. StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
  4011. StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
  4012. StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
  4013. StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
  4014. //------------------------------------------------------------------------------------------------------------------------------
        // All eight gathers are written out regardless of STP_GEAA_P.
  4015. StpMF4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
  4016. gGN3 = StpGeaa4F(gPN3);
  4017. gGN2 = StpGeaa4F(gPN2);
  4018. gGN1 = StpGeaa4F(gPN1);
  4019. gGN0 = StpGeaa4F(gPN0);
  4020. gGP0 = StpGeaa4F(gPP0);
  4021. gGP1 = StpGeaa4F(gPP1);
  4022. gGP2 = StpGeaa4F(gPP2);
  4023. gGP3 = StpGeaa4F(gPP3);
  4024. //------------------------------------------------------------------------------------------------------------------------------
        // Vertical search: swap .x and .z so the component usage below is shared with the horizontal search.
  4025. if(gVert) {
  4026. gGN3 = gGN3.zyxw;
  4027. gGN2 = gGN2.zyxw;
  4028. gGN1 = gGN1.zyxw;
  4029. gGN0 = gGN0.zyxw;
  4030. gGP0 = gGP0.zyxw;
  4031. gGP1 = gGP1.zyxw;
  4032. gGP2 = gGP2.zyxw;
  4033. gGP3 = gGP3.zyxw; }
  4034. //------------------------------------------------------------------------------------------------------------------------------
        // Unblended low-pass taps gLo1..gLo8, .x from the negative-side gathers and .y from the positive-side gathers.
        // Which gather components are used depends on gUp.
  4035. StpMF2 gLo8 = StpMF2(gGN3.x, gGP3.y);
  4036. StpMF2 gLo7 = StpMF2(gGN3.y, gGP3.x);
  4037. StpMF2 gLo6 = StpMF2(gGN2.x, gGP2.y);
  4038. StpMF2 gLo5 = StpMF2(gGN2.y, gGP2.x);
  4039. StpMF2 gLo4 = StpMF2(gGN1.x, gGP1.y);
  4040. StpMF2 gLo3 = StpMF2(gGN1.y, gGP1.x);
  4041. StpMF2 gLo2 = StpMF2(gGN0.x, gGP0.y);
  4042. StpMF2 gLo1 = StpMF2(gGN0.y, gGP0.x);
  4043. if(!gUp) {
  4044. gLo8 = StpMF2(gGN3.w, gGP3.z);
  4045. gLo7 = StpMF2(gGN3.z, gGP3.w);
  4046. gLo6 = StpMF2(gGN2.w, gGP2.z);
  4047. gLo5 = StpMF2(gGN2.z, gGP2.w);
  4048. gLo4 = StpMF2(gGN1.w, gGP1.z);
  4049. gLo3 = StpMF2(gGN1.z, gGP1.w);
  4050. gLo2 = StpMF2(gGN0.w, gGP0.z);
  4051. gLo1 = StpMF2(gGN0.z, gGP0.w); }
  4052. //------------------------------------------------------------------------------------------------------------------------------
        // Blended search taps: each gather's .yx and .zw pairs are mixed with gBi, then regrouped into gBi1..gBi8 as {negative, positive}.
  4053. StpMF2 gGN3Bi = gGN3.yx * StpMF2_(gBi.x) + gGN3.zw * StpMF2_(gBi.y);
  4054. StpMF2 gGN2Bi = gGN2.yx * StpMF2_(gBi.x) + gGN2.zw * StpMF2_(gBi.y);
  4055. StpMF2 gGN1Bi = gGN1.yx * StpMF2_(gBi.x) + gGN1.zw * StpMF2_(gBi.y);
  4056. StpMF2 gGN0Bi = gGN0.yx * StpMF2_(gBi.x) + gGN0.zw * StpMF2_(gBi.y);
  4057. StpMF2 gGP0Bi = gGP0.yx * StpMF2_(gBi.x) + gGP0.zw * StpMF2_(gBi.y);
  4058. StpMF2 gGP1Bi = gGP1.yx * StpMF2_(gBi.x) + gGP1.zw * StpMF2_(gBi.y);
  4059. StpMF2 gGP2Bi = gGP2.yx * StpMF2_(gBi.x) + gGP2.zw * StpMF2_(gBi.y);
  4060. StpMF2 gGP3Bi = gGP3.yx * StpMF2_(gBi.x) + gGP3.zw * StpMF2_(gBi.y);
  4061. StpMF2 gBi8 = StpMF2(gGN3Bi.y, gGP3Bi.x);
  4062. StpMF2 gBi7 = StpMF2(gGN3Bi.x, gGP3Bi.y);
  4063. StpMF2 gBi6 = StpMF2(gGN2Bi.y, gGP2Bi.x);
  4064. StpMF2 gBi5 = StpMF2(gGN2Bi.x, gGP2Bi.y);
  4065. StpMF2 gBi4 = StpMF2(gGN1Bi.y, gGP1Bi.x);
  4066. StpMF2 gBi3 = StpMF2(gGN1Bi.x, gGP1Bi.y);
  4067. StpMF2 gBi2 = StpMF2(gGN0Bi.y, gGP0Bi.x);
  4068. StpMF2 gBi1 = StpMF2(gGN0Bi.x, gGP0Bi.y);
  4069. //------------------------------------------------------------------------------------------------------------------------------
        // gEndBase.y is the reference level E + (B-E)/3.
        // gEndBase.x is the approximate reciprocal of |B-E| * STP_GEAA_THRESHOLD, used to scale deviations from that reference.
  4070. StpMF2 gEndBase;
  4071. gEndBase.y = gBMinusE * StpMF1_(1.0/3.0) + gE;
  4072. gEndBase.x = gAbsBMinusE * StpMF1_(STP_GEAA_THRESHOLD);
  4073. #if 0
  4074. gEndBase.x = StpRcpMF1(max(StpMF1_(1.0 / 16384.0), gEndBase.x));
  4075. #else
  4076. gEndBase.x = StpPrxLoRcpMF1(gEndBase.x);
  4077. #endif
  4078. //------------------------------------------------------------------------------------------------------------------------------
        // gUsePN = saturate(|gBiN - reference| * scale), per side. Taps beyond the configured STP_GEAA_P range are compiled out.
  4079. #if (STP_GEAA_P > 2)
  4080. StpMF2 gUseP8 = StpSatMF2(abs(gBi8 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4081. StpMF2 gUseP7 = StpSatMF2(abs(gBi7 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4082. #endif
  4083. #if (STP_GEAA_P > 1)
  4084. StpMF2 gUseP6 = StpSatMF2(abs(gBi6 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4085. StpMF2 gUseP5 = StpSatMF2(abs(gBi5 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4086. #endif
  4087. #if (STP_GEAA_P > 0)
  4088. StpMF2 gUseP4 = StpSatMF2(abs(gBi4 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4089. StpMF2 gUseP3 = StpSatMF2(abs(gBi3 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4090. #endif
  4091. StpMF2 gUseP2 = StpSatMF2(abs(gBi2 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4092. StpMF2 gUseP1 = StpSatMF2(abs(gBi1 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4093. StpMF2 gUseP0 = StpSatMF2(abs(gBi0 - StpMF2_(gEndBase.y)) * StpMF2_(gEndBase.x));
  4094. //------------------------------------------------------------------------------------------------------------------------------
        // Per-side distance, starting from the maximum for the chosen configuration.
        // gDst2 is declared only for STP_GEAA_P in {0,1,2,3}.
  4095. #if (STP_GEAA_P == 3)
  4096. StpMF2 gDst2 = StpMF2_(9.5);
  4097. #endif
  4098. #if (STP_GEAA_P == 2)
  4099. StpMF2 gDst2 = StpMF2_(7.5);
  4100. #endif
  4101. #if (STP_GEAA_P == 1)
  4102. StpMF2 gDst2 = StpMF2_(5.5);
  4103. #endif
  4104. #if (STP_GEAA_P == 0)
  4105. StpMF2 gDst2 = StpMF2_(3.5);
  4106. #endif
        // Each step is lerp(gDst2, distance, gUsePN), applied from the farthest tap to the nearest so nearer taps override farther ones.
  4107. #if (STP_GEAA_P > 2)
  4108. gDst2 = gDst2 + (StpMF2_(8.5) - gDst2) * gUseP8;
  4109. gDst2 = gDst2 + (StpMF2_(7.5) - gDst2) * gUseP7;
  4110. #endif
  4111. #if (STP_GEAA_P > 1)
  4112. gDst2 = gDst2 + (StpMF2_(6.5) - gDst2) * gUseP6;
  4113. gDst2 = gDst2 + (StpMF2_(5.5) - gDst2) * gUseP5;
  4114. #endif
  4115. #if (STP_GEAA_P > 0)
  4116. gDst2 = gDst2 + (StpMF2_(4.5) - gDst2) * gUseP4;
  4117. gDst2 = gDst2 + (StpMF2_(3.5) - gDst2) * gUseP3;
  4118. #endif
  4119. gDst2 = gDst2 + (StpMF2_(2.5) - gDst2) * gUseP2;
  4120. gDst2 = gDst2 + (StpMF2_(1.5) - gDst2) * gUseP1;
  4121. gDst2 = gDst2 + (StpMF2_(0.5) - gDst2) * gUseP0;
  4122. //------------------------------------------------------------------------------------------------------------------------------
        // Low-pass along the walk. gLoSub is the mean of the two side distances minus STP_GEAA_SUBPIX.
        // The weight for tap k is 1 - saturate((k + 1) - gLoSub), for gLo0..gLo8 (gLoW89.y is computed but not used).
        // E has weight 1 and every tap weight counts for both sides, hence the factor 2 in the normalization.
  4123. StpMF1 gLoSub = (gDst2.x + gDst2.y) * StpMF1_(0.5) - StpMF1_(STP_GEAA_SUBPIX);
  4124. StpMF2 gLoW01 = StpMF2_(1.0) - StpSatMF2(StpMF2(1.0, 2.0) - StpMF2_(gLoSub));
  4125. StpMF2 gLoW23 = StpMF2_(1.0) - StpSatMF2(StpMF2(3.0, 4.0) - StpMF2_(gLoSub));
  4126. StpMF2 gLoW45 = StpMF2_(1.0) - StpSatMF2(StpMF2(5.0, 6.0) - StpMF2_(gLoSub));
  4127. StpMF2 gLoW67 = StpMF2_(1.0) - StpSatMF2(StpMF2(7.0, 8.0) - StpMF2_(gLoSub));
  4128. StpMF2 gLoW89 = StpMF2_(1.0) - StpSatMF2(StpMF2(9.0,10.0) - StpMF2_(gLoSub));
  4129. StpMF2 gLoAcc2 =
  4130. gLo0 * StpMF2_(gLoW01.x) +
  4131. gLo1 * StpMF2_(gLoW01.y) +
  4132. gLo2 * StpMF2_(gLoW23.x) +
  4133. gLo3 * StpMF2_(gLoW23.y) +
  4134. gLo4 * StpMF2_(gLoW45.x) +
  4135. gLo5 * StpMF2_(gLoW45.y) +
  4136. gLo6 * StpMF2_(gLoW67.x) +
  4137. gLo7 * StpMF2_(gLoW67.y) +
  4138. gLo8 * StpMF2_(gLoW89.x);
  4139. StpMF1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
  4140. StpMF2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
  4141. gLoW2 *= StpMF2_(2.0);
  4142. gLoAcc *= StpRcpMF1(StpMF1_(1.0) + gLoW89.x * StpMF1_(2.0) + gLoW2.x + gLoW2.y);
  4143. StpMF1 gOff = StpSatMF1((gLoAcc - gE) * StpRcpMF1(gBH.x - gE)); // Position of the low-passed luma between E and B.
  4144. gOff = min(gOff, StpMF1_(0.5)); // Offset never exceeds half a step.
  4145. //------------------------------------------------------------------------------------------------------------------------------
  4146. gDilate = p + gDecon;
  4147. gFilter = p + gDecon * StpF2_(gOff);
  4148. gLuma = lerp(gE, gBH.x, gOff);
  4149. //------------------------------------------------------------------------------------------------------------------------------
        // gAnti repeats the gLuma expression. gT and gFix are both saturated, so each lies in {0 to 1}.
  4150. StpMF1 gAnti = lerp(gE, gBH.x, gOff);
  4151. StpMF1 gT = StpSatMF1((StpMF1_(-2.0) * gAnti + gBH.x + gE) * StpRcpMF1(gE - gBH.y));
  4152. StpMF1 gFix = gE * (gT - StpMF1_(1.0)) - gBH.y * gT;
  4153. gFix = StpSatMF1((gFix + gAnti) * StpRcpMF1(gFix + gBH.x));
  4154. //------------------------------------------------------------------------------------------------------------------------------
        // gW = (1 / (gFix + 0.5) - 1)^2, floored at 1/255.
  4155. gW = gFix;
  4156. gW = StpRcpMF1(gW + StpMF1_(0.5)) - StpMF1_(1.0);
  4157. gW *= gW;
  4158. gW = max(gW, StpMF1_(1.0/255.0)); }
  4159. #endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_32BIT)
  4160. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4161. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4162. //_____________________________________________________________.._______________________________________________________________
  4163. //==============================================================================================================================
  4164. // [GEAA] PACKED 16-BIT ENTRY POINT
  4165. //==============================================================================================================================
  4166. #if defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
  // [GEAA] packed 16-bit entry point.
  // Reads a 3x3 luma (or green) neighborhood around 'p' plus eight gather4s along the chosen edge direction, then produces:
  //  gW ....... squared sample weight (clamped to >= 1/255),
  //  gLuma .... luma blended between the center and its highest contrast neighbor,
  //  gFilter .. sampling position shifted by up to half a pixel towards that neighbor,
  //  gDilate .. position of that neighbor.
  // Relies on names defined outside this function: StpGeaa4H() (and StpGeaa4OH() when STP_OFFSETS is set),
  // STP_GEAA_THRESHOLD, STP_GEAA_SUBPIX, STP_GEAA_P {0 to 3}, and the Stp*H1/H2 helpers.
  void StpGeaaH(
      out StpH1 gW,      // Output weight for pixel art scalar.
      out StpH1 gLuma,   // Filtered luma for debug.
      out StpF2 gFilter, // Location to sample for standalone unscaled spatial AA.
      out StpF2 gDilate, // Location of highest contrast neighbor.
      StpF2 p,           // {0 to 1} position across screen.
      StpF2 kRcpI,       // 1.0 / input image size in pixels.
      StpF2 kHalfRcpI) { // 0.5 / input image size in pixels.
  //------------------------------------------------------------------------------------------------------------------------------
      // Sample 3x3 input pattern in luma (or green).
      //  A B C
      //  D E F
      //  G H I
      // Via four gather4s, usage for the next section to try to improve operand caching.
      #if STP_OFFSETS
          StpF2 pDEBA = p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y);
          StpH4 gDEBA = StpGeaa4H(pDEBA);
          StpH4 gEFCB = StpGeaa4OH(pDEBA, StpI2(1, 0));
          StpH4 gGHED = StpGeaa4OH(pDEBA, StpI2(0, 1));
          StpH4 gHIFE = StpGeaa4OH(pDEBA, StpI2(1, 1));
      #else // STP_OFFSETS
          StpH4 gDEBA = StpGeaa4H(p + StpF2(-kHalfRcpI.x, -kHalfRcpI.y)); // .xyzw=DEBA
          StpH4 gEFCB = StpGeaa4H(p + StpF2( kHalfRcpI.x, -kHalfRcpI.y)); // .yz  =FC
          StpH4 gGHED = StpGeaa4H(p + StpF2(-kHalfRcpI.x,  kHalfRcpI.y)); // .xy  =GH
          StpH4 gHIFE = StpGeaa4H(p + StpF2( kHalfRcpI.x,  kHalfRcpI.y)); // .y   =I
      #endif // STP_OFFSETS
  //------------------------------------------------------------------------------------------------------------------------------
      // Compute {horz,vert} change terms. Complex to decide on either horizontal or vertical direction.
      // Trouble case for some algorithms,
      //  0 1 0
      //  0 1 0
      //  0 1 0
      // This should present as a vertical search direction.
      // Simple stuff like sum of each 2x2 produces,
      //  2 2
      //  2 2
      // Which has no direction.
      // Each term below is a second difference (outer - 2*middle + outer) along one row (.x) or one column (.y).
      // {ABC,ADG}
      StpH2 gHV0,gHV1,gHV2;
      gHV0.x = gDEBA.z * StpH1_(-2.0) + gEFCB.z;
      gHV0.y = gDEBA.x * StpH1_(-2.0) + gGHED.x;
      gHV0 += StpH2_(gDEBA.w);
      // {DEF,BEH}
      gHV1.x = gDEBA.x + gEFCB.y;
      gHV1.y = gDEBA.z + gGHED.y;
      gHV1 += StpH2_(gDEBA.y) * StpH2_(-2.0);
      // {GHI,CFI}
      gHV2.x = gGHED.x + gGHED.y * StpH1_(-2.0);
      gHV2.y = gEFCB.z + gEFCB.y * StpH1_(-2.0);
      gHV2 += StpH2_(gHIFE.y);
      // Combine terms.
      #if 0
          // What FXAA does, better for a diagonal computation (which is not needed), left for reference.
          StpH2 gHV = abs(gHV0) + abs(gHV1) * StpH2_(2.0) + abs(gHV2);
      #else
          // Slightly faster for packed 16-bit (which has no free ABS on AMD).
          StpH2 gHV = gHV0 * gHV0 + gHV1 * gHV1 * StpH2_(2.0) + gHV2 * gHV2;
      #endif
      // Choose search direction, the 'gVert' is true:=vert, false:=horz.
      // Go vertical search if horizontal has higher contrast (search perpendicular).
      StpP1 gVert = gHV.x > gHV.y;
  //------------------------------------------------------------------------------------------------------------------------------
      // This is BH if search horizontal, else DF (as BH) if search vertical.
      StpH2 gBH = gVert ? StpH2(gDEBA.x, gEFCB.y) : StpH2(gDEBA.z, gGHED.y);
      // Will need these later, will let the compiler move around the transpose.
      // For the vertical case the 3x3 is transposed, so {A,C}->{A,G}, {D,F}->{B,H}, {G,I}->{C,I}.
      StpH2 gAC = gVert ? StpH2(gDEBA.w, gGHED.x) : StpH2(gDEBA.w, gEFCB.z);
      StpH2 gDF = gVert ? StpH2(gDEBA.z, gGHED.y) : StpH2(gDEBA.x, gEFCB.y);
      // FIX: the transposed G is C (gEFCB.z). This previously read gEFCB.y (F), which mixed the center column's
      // neighbor into the corner term used by 'gBi0' for the vertical & !up case.
      StpH2 gGI = gVert ? StpH2(gEFCB.z, gHIFE.y) : StpH2(gGHED.x, gHIFE.y);
      // Start to compute threshold for end of span, compute a gradient pair.
      StpH2 gBHMinusE = gBH - StpH2_(gDEBA.y);
      StpH2 gEnd2 = abs(gBHMinusE);
      // If gradient is larger upward (or leftward if vert).
      StpP1 gUp = gEnd2.x >= gEnd2.y;
  //------------------------------------------------------------------------------------------------------------------------------
      // Rename.
      StpH1 gE = gDEBA.y;
      // Swap if not up. From this point on, the B is the high-contrast neighbor, and the H is the other one in same dir.
      gBH = gUp ? gBH : gBH.yx;
  //------------------------------------------------------------------------------------------------------------------------------
      // Choose the bilinear scalar (gets to 1/3 between texels during the search).
      //  .x ... For texel closer to pixel axis when up (reversed when down).
      //  .y ... For more distant texel.
      // LOGIC
      // =====
      // This keeps threshold of 2 of the 3 end conditions the same (so 1/3 shift is better than 1/4).
      // =====
      //   e         e    e   <- e = end cases
      //   0    0    1    1   <- 1/3 of high contrast neighbor
      //   0    1    0    1   <- 2/3 of self
      //  ------------------
      //   0   2/3  1/3   1   <- blended value (2/3 is the target)
      //  2/3   0   1/3  1/3  <- abs(difference to target)
      StpH2 gBi = gUp ? StpH2(2.0 / 3.0, 1.0 / 3.0) : StpH2(1.0 / 3.0 , 2.0 / 3.0);
      // Choose either {B-E, or H-E}.
      StpH1 gBMinusE = gUp ? gBHMinusE.x : gBHMinusE.y;
      // Finish Bi0, this is the first 2 texture fetches (done using math instead) at P0 (1 texel away from center).
      StpH2 gBi0 = (gUp ? gAC : gGI) * StpH2_(1.0 / 3.0) + gDF * StpH2_(2.0 / 3.0);
      // Finish Lo0, for the directional blur.
      StpH2 gLo0 = gDF;
      // Store out spatial neighborhood.
      StpH1 gAbsBMinusE = abs(gBMinusE);
      // This is just the highest contrast neighbor along the chosen direction, may report less contrast than actual.
      // NOTE: 'gNe' is not referenced again in this function.
      StpH1 gNe = gAbsBMinusE;
      // Good direction to compare against at the end.
      // Good means 'don't flip' to the other side.
      // Have 'B-E' want 'signed(E-(B/2+E/2))' = 'signed(E/2-B/2)' = 'signed(E-B)' = 'gtzero(B-E)'
      // NOTE: 'gGood' is not referenced again in this function.
      StpH1 gGood = StpGtZeroH1(gBMinusE);
  //------------------------------------------------------------------------------------------------------------------------------
      // One pixel walk distance for search.
      StpF2 gWalk = gVert ? StpF2(0.0, kRcpI.y) : StpF2(kRcpI.x, 0.0);
      // This is the direction of decontrast (towards the highest contrast neighbor).
      StpF2 gDecon = gVert ? StpF2(kRcpI.x, 0.0) : StpF2(0.0, kRcpI.y);
      // If up (or left) work negative.
      if(gUp) gDecon = -gDecon;
  //------------------------------------------------------------------------------------------------------------------------------
      // Have enough now to build out sampling positions.
      // This works in gather4 to get two samples per gather, then uses math to finish the bilinear fetch.
      // In case the logic ever goes back to a non-gather4 version, this keeps with the 1/3 offset.
      // Build base, 1/3 to neighbor pixel.
      // It must be 1/3 to neighbor pixel to be able to find the end of thin stuff like this.
      //  . . . . . . . . . . .
      //  . . . . . . x x x x x
      //  . x x x x x . . . . .
      //    |       |
      //    |------>|
      //    |       . x
      // If it was 1/2 to neighbor, then x and . would look the same.
      StpF2 gP = p + gDecon * StpF2_(1.0/3.0);
      // The gather4 positions are (assuming horizontal then up).
      //  3 3 2 2 1 1 0 0 A B C 0 0 1 1 2 2 3 3
      //  3 3 2 2 1 1 0 0 D E F 0 0 1 1 2 2 3 3
      //                  G H I
  //------------------------------------------------------------------------------------------------------------------------------
      // Sampling positions.
      // Currently walking without gaps, but could skip along too!
      StpF2 gPN3 = gP - StpF2_(8.5) * gWalk;
      StpF2 gPN2 = gP - StpF2_(6.5) * gWalk;
      StpF2 gPN1 = gP - StpF2_(4.5) * gWalk;
      StpF2 gPN0 = gP - StpF2_(2.5) * gWalk;
      StpF2 gPP0 = gP + StpF2_(2.5) * gWalk;
      StpF2 gPP1 = gP + StpF2_(4.5) * gWalk;
      StpF2 gPP2 = gP + StpF2_(6.5) * gWalk;
      StpF2 gPP3 = gP + StpF2_(8.5) * gWalk;
  //------------------------------------------------------------------------------------------------------------------------------
      // This attempts to do sampling in a cache friendly way.
      // Cannot sample with offsets, because it could be vertical or horizontal and offsets need to be static in DX.
      // Sampling pairs {negative, positive} directions.
      StpH4 gGN3, gGN2, gGN1, gGN0, gGP0, gGP1, gGP2, gGP3;
      gGN3 = StpGeaa4H(gPN3);
      gGN2 = StpGeaa4H(gPN2);
      gGN1 = StpGeaa4H(gPN1);
      gGN0 = StpGeaa4H(gPN0);
      gGP0 = StpGeaa4H(gPP0);
      gGP1 = StpGeaa4H(gPP1);
      gGP2 = StpGeaa4H(gPP2);
      gGP3 = StpGeaa4H(gPP3);
  //------------------------------------------------------------------------------------------------------------------------------
      // Finish the bilinear fetch.
      // For 'vertical' this needs to do a transpose.
      // The FMAs are duplicated, else the compiler would need to do that anyway.
      //                                   1st   2nd   for N side (P side is reversed)
      //  -----------                      | |   | |
      //  W Z     w z   !vert &  up  ...   Y X,  Z W
      //  X Y [p] x y
      //  -----------
      //  W Z [p] w z   !vert & !up  ...   Z W,  Y X
      //  X Y     x y
      //  -----------
      //  W Z            vert &  up  ...   Y Z,  X W
      //  X Y
      //  [p]
      //  w z
      //  x y
      //  -----------
      //  W Z            vert & !up  ...   X W,  Y Z
      //  X Y                              | |   | |
      //  [p]                              | |   0.33 term
      //  w z                              | |
      //  x y                              0.66 term
      //  -----------
      // Swapping .x and .z transposes each 2x2 gather so the code below can treat both directions the same.
      if(gVert) {
          gGN3 = gGN3.zyxw;
          gGN2 = gGN2.zyxw;
          gGN1 = gGN1.zyxw;
          gGN0 = gGN0.zyxw;
          gGP0 = gGP0.zyxw;
          gGP1 = gGP1.zyxw;
          gGP2 = gGP2.zyxw;
          gGP3 = gGP3.zyxw; }
  //------------------------------------------------------------------------------------------------------------------------------
      // Grab the texels for the variable length inline low-pass box blur.
      // Pairs are {negative side, positive side}; higher index is farther from the center.
      // The {x,y} lanes are used when up, the {w,z} lanes when not up.
      StpH2 gLo8 = StpH2(gGN3.x, gGP3.y);
      StpH2 gLo7 = StpH2(gGN3.y, gGP3.x);
      StpH2 gLo6 = StpH2(gGN2.x, gGP2.y);
      StpH2 gLo5 = StpH2(gGN2.y, gGP2.x);
      StpH2 gLo4 = StpH2(gGN1.x, gGP1.y);
      StpH2 gLo3 = StpH2(gGN1.y, gGP1.x);
      StpH2 gLo2 = StpH2(gGN0.x, gGP0.y);
      StpH2 gLo1 = StpH2(gGN0.y, gGP0.x);
      if(!gUp) {
          gLo8 = StpH2(gGN3.w, gGP3.z);
          gLo7 = StpH2(gGN3.z, gGP3.w);
          gLo6 = StpH2(gGN2.w, gGP2.z);
          gLo5 = StpH2(gGN2.z, gGP2.w);
          gLo4 = StpH2(gGN1.w, gGP1.z);
          gLo3 = StpH2(gGN1.z, gGP1.w);
          gLo2 = StpH2(gGN0.w, gGP0.z);
          gLo1 = StpH2(gGN0.z, gGP0.w); }
  //------------------------------------------------------------------------------------------------------------------------------
      // Simulate the bilinear fetch.
      // The {y,x} lanes get weight gBi.x and the {z,w} lanes get weight gBi.y.
      StpH2 gGN3Bi = gGN3.yx * StpH2_(gBi.x) + gGN3.zw * StpH2_(gBi.y);
      StpH2 gGN2Bi = gGN2.yx * StpH2_(gBi.x) + gGN2.zw * StpH2_(gBi.y);
      StpH2 gGN1Bi = gGN1.yx * StpH2_(gBi.x) + gGN1.zw * StpH2_(gBi.y);
      StpH2 gGN0Bi = gGN0.yx * StpH2_(gBi.x) + gGN0.zw * StpH2_(gBi.y);
      StpH2 gGP0Bi = gGP0.yx * StpH2_(gBi.x) + gGP0.zw * StpH2_(gBi.y);
      StpH2 gGP1Bi = gGP1.yx * StpH2_(gBi.x) + gGP1.zw * StpH2_(gBi.y);
      StpH2 gGP2Bi = gGP2.yx * StpH2_(gBi.x) + gGP2.zw * StpH2_(gBi.y);
      StpH2 gGP3Bi = gGP3.yx * StpH2_(gBi.x) + gGP3.zw * StpH2_(gBi.y);
      // Note positive side the {x,y} order is reversed.
      StpH2 gBi8 = StpH2(gGN3Bi.y, gGP3Bi.x);
      StpH2 gBi7 = StpH2(gGN3Bi.x, gGP3Bi.y);
      StpH2 gBi6 = StpH2(gGN2Bi.y, gGP2Bi.x);
      StpH2 gBi5 = StpH2(gGN2Bi.x, gGP2Bi.y);
      StpH2 gBi4 = StpH2(gGN1Bi.y, gGP1Bi.x);
      StpH2 gBi3 = StpH2(gGN1Bi.x, gGP1Bi.y);
      StpH2 gBi2 = StpH2(gGN0Bi.y, gGP0Bi.x);
      StpH2 gBi1 = StpH2(gGN0Bi.x, gGP0Bi.y);
  //------------------------------------------------------------------------------------------------------------------------------
      // Threshold for end of span (X), and base to compare against (Y).
      StpH2 gEndBase;
      // For a (1.0/3.0) pixel shift.
      // The 'gBMinusE = other - self', and want 'self * (2.0/3.0) + other * (1.0/3.0)'.
      gEndBase.y = gBMinusE * StpH1_(1.0/3.0) + gE;
      gEndBase.x = gAbsBMinusE * StpH1_(STP_GEAA_THRESHOLD);
      // Safer version here for reference.
      #if 0
          gEndBase.x = StpRcpH1(max(StpH1_(1.0 / 16384.0), gEndBase.x));
      #else
          gEndBase.x = StpPrxLoRcpH1(gEndBase.x);
      #endif
  //------------------------------------------------------------------------------------------------------------------------------
      // Compute opacity term, {0 := not done, 1 := end of span}.
      // STP_GEAA_P selects how many of the outer search steps take part in the end-of-span test.
      #if (STP_GEAA_P > 2)
          StpH2 gUseP8 = StpSatH2(abs(gBi8 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
          StpH2 gUseP7 = StpSatH2(abs(gBi7 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
      #endif
      #if (STP_GEAA_P > 1)
          StpH2 gUseP6 = StpSatH2(abs(gBi6 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
          StpH2 gUseP5 = StpSatH2(abs(gBi5 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
      #endif
      #if (STP_GEAA_P > 0)
          StpH2 gUseP4 = StpSatH2(abs(gBi4 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
          StpH2 gUseP3 = StpSatH2(abs(gBi3 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
      #endif
      StpH2 gUseP2 = StpSatH2(abs(gBi2 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
      StpH2 gUseP1 = StpSatH2(abs(gBi1 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
      StpH2 gUseP0 = StpSatH2(abs(gBi0 - StpH2_(gEndBase.y)) * StpH2_(gEndBase.x));
  //------------------------------------------------------------------------------------------------------------------------------
      // Work this like painters alpha blending.
      // This analog path is faster and cleaner than binary logic.
      // Distance traveled for {negative, positive} paths.
      // LOGIC
      // =====
      // Note distance factors already have the 0.5 factored in.
      //  N := negative search end (1 pixel away, but edge is 0.5 pixel away)
      //  P := positive search end (4 pixel away, but edge is 3.5 pixel away)
      //  X := the pixel to filter
      //     :<->:<------------->:
      //     :   :               :
      //     :   :   +---+---+---+---+
      //     :   :   |   :   |   |   |   |
      //     N   +---+---+---+---+-P-+---+---+---+
      //         | X |   |   |   |   |   |   |   |
      //     +---+---+---+---+---+---+---+---+---+---+---+---+
      //     |   |   |   |   |   |   |   |   |   |   |   |   |
      //     +---+---+---+---+---+---+---+---+---+---+---+---+
      // Start at the maximum distance for the configured search length, then blend from the
      // farthest step to the nearest so that the nearest end-of-span wins.
      #if (STP_GEAA_P == 3)
          StpH2 gDst2 = StpH2_(9.5);
      #endif
      #if (STP_GEAA_P == 2)
          StpH2 gDst2 = StpH2_(7.5);
      #endif
      #if (STP_GEAA_P == 1)
          StpH2 gDst2 = StpH2_(5.5);
      #endif
      #if (STP_GEAA_P == 0)
          StpH2 gDst2 = StpH2_(3.5);
      #endif
      #if (STP_GEAA_P > 2)
          gDst2 = gDst2 + (StpH2_(8.5) - gDst2) * gUseP8;
          gDst2 = gDst2 + (StpH2_(7.5) - gDst2) * gUseP7;
      #endif
      #if (STP_GEAA_P > 1)
          gDst2 = gDst2 + (StpH2_(6.5) - gDst2) * gUseP6;
          gDst2 = gDst2 + (StpH2_(5.5) - gDst2) * gUseP5;
      #endif
      #if (STP_GEAA_P > 0)
          gDst2 = gDst2 + (StpH2_(4.5) - gDst2) * gUseP4;
          gDst2 = gDst2 + (StpH2_(3.5) - gDst2) * gUseP3;
      #endif
      gDst2 = gDst2 + (StpH2_(2.5) - gDst2) * gUseP2;
      gDst2 = gDst2 + (StpH2_(1.5) - gDst2) * gUseP1;
      gDst2 = gDst2 + (StpH2_(0.5) - gDst2) * gUseP0;
  //------------------------------------------------------------------------------------------------------------------------------
      // Run the variable length low-pass box blur.
      // Need half distance with half pixel removed.
      StpH1 gLoSub = (gDst2.x + gDst2.y) * StpH1_(0.5) - StpH1_(STP_GEAA_SUBPIX);
      // Compute the weights (if should be included or not).
      // Each lane is the weight of one 'gLo#' pair: gLoW01 = {gLo0, gLo1}, ... gLoW89.x = gLo8 (gLoW89.y is unused).
      StpH2 gLoW01 = StpH2_(1.0) - StpSatH2(StpH2(1.0, 2.0) - StpH2_(gLoSub));
      StpH2 gLoW23 = StpH2_(1.0) - StpSatH2(StpH2(3.0, 4.0) - StpH2_(gLoSub));
      StpH2 gLoW45 = StpH2_(1.0) - StpSatH2(StpH2(5.0, 6.0) - StpH2_(gLoSub));
      StpH2 gLoW67 = StpH2_(1.0) - StpSatH2(StpH2(7.0, 8.0) - StpH2_(gLoSub));
      StpH2 gLoW89 = StpH2_(1.0) - StpSatH2(StpH2(9.0,10.0) - StpH2_(gLoSub));
      // Weighted accumulation of samples.
      StpH2 gLoAcc2 =
          gLo0 * StpH2_(gLoW01.x) +
          gLo1 * StpH2_(gLoW01.y) +
          gLo2 * StpH2_(gLoW23.x) +
          gLo3 * StpH2_(gLoW23.y) +
          gLo4 * StpH2_(gLoW45.x) +
          gLo5 * StpH2_(gLoW45.y) +
          gLo6 * StpH2_(gLoW67.x) +
          gLo7 * StpH2_(gLoW67.y) +
          gLo8 * StpH2_(gLoW89.x);
      StpH1 gLoAcc = gE + gLoAcc2.x + gLoAcc2.y;
      // Weight sum.
      // Every weight counts twice (negative and positive side), plus 1.0 for the center 'gE'.
      StpH2 gLoW2 = gLoW01 + gLoW23 + gLoW45 + gLoW67;
      gLoW2 *= StpH2_(2.0);
      gLoAcc *= StpRcpH1(StpH1_(1.0) + gLoW89.x * StpH1_(2.0) + gLoW2.x + gLoW2.y);
      // Convert to blend between self and high-contrast neighbor.
      // This currently allows full {0.0 to 1.0} blend.
      StpH1 gOff = StpSatH1((gLoAcc - gE) * StpRcpH1(gBH.x - gE));
      // It is important to not exceed 0.5 weight for PIXart scaling.
      gOff = min(gOff, StpH1_(0.5));
  //------------------------------------------------------------------------------------------------------------------------------
      // Save out dilation pixel for {z,motion}.
      gDilate = p + gDecon;
      // Save out filter position.
      gFilter = p + gDecon * StpF2_(gOff);
      gLuma = lerp(gE, gBH.x, gOff);
  //------------------------------------------------------------------------------------------------------------------------------
      // GEAA up to this point creates weights that only help a scalar for aliased edges.
      // This attempts to increase weight to also restore some anti-aliased edges.
      // It does this by increasing weight as much as can be borrowed from the 'E to H' side.
      // An equation for movement towards H,
      //  E+(H-E)*T ... Where T must be {0 to 1} ranged, but want {0 to 0.5} ranged (same as 'gOff').
      // Equation for E motion with respect to the B side,
      //  A=E+(B-E)*F ... Where A is the anti-aliased output, and F would typically be 'gOff'.
      // Solving that for E,
      //  E=(A-F*B)/(1-F)
      // Combining equations,
      //  E+(H-E)*T = (A-F*B)/(1-F)
      // Then solving for T when 'F=0.5' (maximum 'gOff' weight),
      //  T=(-2*A+B+E)/(E-H)
      // Then limit T inside {0 to 0.5}.
      // And use limited 'T' to recompute a new 'F' which becomes the 'gOff' fixed weight.
      StpH1 gAnti = lerp(gE, gBH.x, gOff);
      // Solve for the movement towards 'H'.
      // This in theory should be limited to {0 to 0.5}, but {0 to 1} seems to work too.
      StpH1 gT = StpSatH1((StpH1_(-2.0) * gAnti + gBH.x + gE) * StpRcpH1(gE - gBH.y));
      // This is the negated moved center, '-(E+(H-E)*T)', so the next line evaluates 'F=(A-E')/(B-E')'.
      StpH1 gFix = gE * (gT - StpH1_(1.0)) - gBH.y * gT;
      gFix = StpSatH1((gFix + gAnti) * StpRcpH1(gFix + gBH.x));
  //------------------------------------------------------------------------------------------------------------------------------
      // Output weight for pixel art scalar.
      // The 'gOff'set goes between {0 := no change, to 0.5 := half to neighbor}.
      // The half to neighbor position would be where the edge crosses between two pixels.
      // The sample size needs to be {0 := at the crossing, to 1 := no change}.
      // Can solve this, the 1D kernel will look like,
      //  u = (1-x)*s ... weighting terms
      //  v = x    *t
      //  w = 1/(u+v)
      //  o = a*u*w + b*v*w
      // The split is where weights are the same,
      //  u*w == v*w ... ((1-x)*s)/(((1-x)*s)+(x*t)) == (x*t)/(((1-x)*s)+(x*t))
      // Can assume s=1.0 (the other sample), thus this reduces to,
      //  u*w == v*w ... (1-x)/((1-x)+(x*t)) == (x*t)/((1-x)+(x*t))
      // Then solve for 't' given crossing point 'x'.
      //  t=1/x-1
      // Convert to 'x=gOffset+1/2'.
      // Solve for 't=1/x-1', or 't=1/(gOffset+1/2)-1'.
      gW = gFix;
      gW = StpRcpH1(gW + StpH1_(0.5)) - StpH1_(1.0);
      // Send squared (as needed by scalar).
      gW *= gW;
      // Make sure not zero.
      gW = max(gW, StpH1_(1.0/255.0)); }
  4553. #endif // defined(STP_GPU) && defined(STP_GEAA) && defined(STP_16BIT)
  4554. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4555. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4556. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4557. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  4558. #endif // STP_UNITY_INCLUDE_GUARD