
StpSetup.compute

#pragma kernel StpSetup

#pragma multi_compile _ ENABLE_DEBUG_MODE
#pragma multi_compile _ ENABLE_STENCIL_RESPONSIVE
#pragma multi_compile _ ENABLE_LARGE_KERNEL
#pragma multi_compile _ UNITY_DEVICE_SUPPORTS_NATIVE_16BIT
#pragma multi_compile _ UNITY_DEVICE_SUPPORTS_WAVE_ANY
#pragma multi_compile _ DISABLE_TEXTURE2D_X_ARRAY

#pragma only_renderers d3d11 playstation xboxone xboxseries vulkan metal switch

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Color.hlsl"
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/UnityInstancing.hlsl"

#define STP_PAT 1
#include "Packages/com.unity.render-pipelines.core/Runtime/STP/StpCommon.hlsl"
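// NOTE: STP_PAT appears to select the setup ("pattern") pass implementation inside StpCommon.hlsl;
// the StpPatH/StpPatF entry points invoked by the kernel at the bottom of this file, and the
// StpPat* callbacks defined here, belong to that pass.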
//
// Input
//
TEXTURE2D_X(_StpInputColor);
TEXTURE2D_X(_StpInputDepth);
TEXTURE2D_X(_StpInputMotion);
#if defined(ENABLE_STENCIL_RESPONSIVE)
TYPED_TEXTURE2D_X(uint2, _StpInputStencil);
#endif

//
// Intermediate Output
//
RW_TEXTURE2D_X(float4, _StpIntermediateColor);
RW_TEXTURE2D_X(float, _StpIntermediateConvergence);

//
// History Input/Output
//
TYPED_TEXTURE2D_X(uint, _StpPriorDepthMotion);
RW_TEXTURE2D_X(uint, _StpDepthMotion);

TEXTURE2D_X(_StpPriorLuma);
RW_TEXTURE2D_X(float2, _StpLuma);

TEXTURE2D_X(_StpPriorConvergence);
TEXTURE2D_X(_StpPriorFeedback);
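// NOTE: By naming convention, the "_StpPrior*" textures are read-only inputs carrying last
// frame's depth/motion, luma, convergence and feedback (history color), while the RW textures
// are written this frame for the later STP passes and become the "prior" inputs of the next frame.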
#define STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET (SLICE_ARRAY_INDEX * STPSETUPPERVIEWCONSTANTS_COUNT)

#if defined(SHADER_API_PSSL) || defined(SHADER_API_SWITCH) || (defined(SHADER_API_METAL) && !defined(SHADER_API_MOBILE))
// Force usage of the 32-bit reduction path even in 16-bit environments
#define STP_FORCE_32BIT_REDUCTION
#endif

#if defined(SHADER_API_PSSL) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_METAL) || (defined(SHADER_API_VULKAN) && defined(SHADER_API_MOBILE))
// Force usage of group shared memory instead of using wave operations
#define STP_FORCE_GROUPSHARED
#endif

// Enable the use of wave operations when they're supported by the current hardware and usage of groupshared hasn't been forced.
#if defined(UNITY_HW_SUPPORTS_WAVE) && !defined(STP_FORCE_GROUPSHARED)
#define STP_ENABLE_WAVEOPS
#endif
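// NOTE: When STP_FORCE_GROUPSHARED is defined above, STP_ENABLE_WAVEOPS is never defined, so the
// groupshared (LDS) reduction fallback below is used even on wave-capable hardware. Likewise,
// STP_FORCE_32BIT_REDUCTION makes the 16-bit reduction entry points forward to the 32-bit helpers.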
// STP requires a 4x4 reduction which must be implemented with either wave operations or group shared memory.
#if !defined(STP_ENABLE_WAVEOPS)
#if defined(STP_16BIT) && !defined(STP_FORCE_32BIT_REDUCTION)
groupshared uint4 gs_StpScratch[STP_GROUP_SIZE];
#else
groupshared float4 gs_StpScratch[STP_GROUP_SIZE * 2];
#endif
#endif
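// NOTE: Scratch sizing: the 16-bit path packs each lane's eight half values into one uint4 (two
// halves per uint), so STP_GROUP_SIZE uint4 slots suffice, whereas the 32-bit path stores two
// unpacked float4 values per lane and therefore needs STP_GROUP_SIZE * 2 slots.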
// In some cases, we have to expose the 32-bit reduction code in the 16-bit path
#if defined(STP_32BIT) || defined(STP_FORCE_32BIT_REDUCTION)
void StpPat4x4MaxF8(StpMU1 i, inout StpF4 a, inout StpF4 b)
{
#if defined(STP_ENABLE_WAVEOPS)
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 1)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 1)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 1)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 1)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), WaveGetLaneIndex() ^ 1)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), WaveGetLaneIndex() ^ 1)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), WaveGetLaneIndex() ^ 1)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), WaveGetLaneIndex() ^ 1)));
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 2)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 2)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 2)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 2)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), WaveGetLaneIndex() ^ 2)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), WaveGetLaneIndex() ^ 2)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), WaveGetLaneIndex() ^ 2)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), WaveGetLaneIndex() ^ 2)));
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 4)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 4)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 4)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 4)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), WaveGetLaneIndex() ^ 4)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), WaveGetLaneIndex() ^ 4)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), WaveGetLaneIndex() ^ 4)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), WaveGetLaneIndex() ^ 4)));
    a.x = max(a.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 8)));
    a.y = max(a.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 8)));
    a.z = max(a.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 8)));
    a.w = max(a.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 8)));
    b.x = max(b.x, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.x), WaveGetLaneIndex() ^ 8)));
    b.y = max(b.y, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.y), WaveGetLaneIndex() ^ 8)));
    b.z = max(b.z, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.z), WaveGetLaneIndex() ^ 8)));
    b.w = max(b.w, StpF1_U1(WaveReadLaneAt(StpU1_F1(b.w), WaveGetLaneIndex() ^ 8)));
#else
    gs_StpScratch[i] = a;
    gs_StpScratch[i + STP_GROUP_SIZE] = b;
    GroupMemoryBarrierWithGroupSync();

    // 2x2 Reduction
    {
        StpMU1 offset = (i & ~StpMU1(3));
        StpMU1 a0 = offset + ((i + StpMU1(1)) & StpMU1(3));
        StpMU1 a1 = offset + ((i + StpMU1(2)) & StpMU1(3));
        StpMU1 a2 = offset + ((i + StpMU1(3)) & StpMU1(3));
        float4 x0 = gs_StpScratch[a0];
        float4 x1 = gs_StpScratch[a1];
        float4 x2 = gs_StpScratch[a2];
        float4 y0 = gs_StpScratch[a0 + STP_GROUP_SIZE];
        float4 y1 = gs_StpScratch[a1 + STP_GROUP_SIZE];
        float4 y2 = gs_StpScratch[a2 + STP_GROUP_SIZE];
        GroupMemoryBarrierWithGroupSync();
        a = max(max(max(a, x0), x1), x2);
        b = max(max(max(b, y0), y1), y2);
    }

    gs_StpScratch[i] = a;
    gs_StpScratch[i + STP_GROUP_SIZE] = b;
    GroupMemoryBarrierWithGroupSync();

    // 4x4 Reduction
    {
        StpMU1 offset = (i & ~StpMU1(15));
        StpMU1 a0 = offset + ((i + StpMU1(4)) & StpMU1(15));
        StpMU1 a1 = offset + ((i + StpMU1(8)) & StpMU1(15));
        StpMU1 a2 = offset + ((i + StpMU1(12)) & StpMU1(15));
        float4 x0 = gs_StpScratch[a0];
        float4 x1 = gs_StpScratch[a1];
        float4 x2 = gs_StpScratch[a2];
        float4 y0 = gs_StpScratch[a0 + STP_GROUP_SIZE];
        float4 y1 = gs_StpScratch[a1 + STP_GROUP_SIZE];
        float4 y2 = gs_StpScratch[a2 + STP_GROUP_SIZE];
        GroupMemoryBarrierWithGroupSync();
        a = max(max(max(a, x0), x1), x2);
        b = max(max(max(b, y0), y1), y2);
    }
#endif
}

void StpPat4x4SumF4(StpMU1 i, inout StpF4 a)
{
#if defined(STP_ENABLE_WAVEOPS)
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 1));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 1));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 1));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 1));
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 2));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 2));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 2));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 2));
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 4));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 4));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 4));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 4));
    a.x += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.x), WaveGetLaneIndex() ^ 8));
    a.y += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.y), WaveGetLaneIndex() ^ 8));
    a.z += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.z), WaveGetLaneIndex() ^ 8));
    a.w += StpF1_U1(WaveReadLaneAt(StpU1_F1(a.w), WaveGetLaneIndex() ^ 8));
#else
    gs_StpScratch[i] = a;
    GroupMemoryBarrierWithGroupSync();

    // 2x2 Reduction
    {
        StpMU1 offset = (i & ~StpMU1(3));
        StpMU1 a0 = offset + ((i + StpMU1(1)) & StpMU1(3));
        StpMU1 a1 = offset + ((i + StpMU1(2)) & StpMU1(3));
        StpMU1 a2 = offset + ((i + StpMU1(3)) & StpMU1(3));
        float4 x0 = gs_StpScratch[a0];
        float4 x1 = gs_StpScratch[a1];
        float4 x2 = gs_StpScratch[a2];
        GroupMemoryBarrierWithGroupSync();
        a = a + x0 + x1 + x2;
    }

    gs_StpScratch[i] = a;
    GroupMemoryBarrierWithGroupSync();

    // 4x4 Reduction
    {
        StpMU1 offset = (i & ~StpMU1(15));
        StpMU1 a0 = offset + ((i + StpMU1(4)) & StpMU1(15));
        StpMU1 a1 = offset + ((i + StpMU1(8)) & StpMU1(15));
        StpMU1 a2 = offset + ((i + StpMU1(12)) & StpMU1(15));
        float4 x0 = gs_StpScratch[a0];
        float4 x1 = gs_StpScratch[a1];
        float4 x2 = gs_StpScratch[a2];
        GroupMemoryBarrierWithGroupSync();
        a = a + x0 + x1 + x2;
    }
#endif
}
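// NOTE: Both helpers above reduce across a 16-lane footprint with a butterfly pattern: XORing the
// lane index with 1, 2, 4 and 8 exchanges values with progressively wider neighbours, so after
// four steps every lane in an aligned 16-lane block holds the max (or sum) of all 16 inputs. The
// float values are round-tripped through uint (StpU1_F1 / StpF1_U1) around WaveReadLaneAt,
// seemingly so the shuffle operates on raw bit patterns. A minimal single-value sketch of the
// same idea (illustrative only, not used by this kernel):
//
//     float WaveMax16(float v)
//     {
//         // Assumes all 16 participating lanes fall inside the same aligned group of 16 lanes.
//         [unroll]
//         for (uint s = 1u; s < 16u; s <<= 1u)
//             v = max(v, asfloat(WaveReadLaneAt(asuint(v), WaveGetLaneIndex() ^ s)));
//         return v;
//     }
//
// The LDS fallback reaches the same result in two rounds (2x2 then 4x4), with each lane reading
// the other three slots of its 4-wide, then 16-wide, aligned block.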
#endif

#if defined(STP_16BIT)
void StpPat4x4MaxH8(StpW1 i, inout StpH4 a, inout StpH4 b)
{
#if defined(STP_FORCE_32BIT_REDUCTION)
    StpPat4x4MaxF8(i, a, b);
#else
#if defined(STP_ENABLE_WAVEOPS)
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 1)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 1)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), WaveGetLaneIndex() ^ 1)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), WaveGetLaneIndex() ^ 1)));
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 2)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 2)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), WaveGetLaneIndex() ^ 2)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), WaveGetLaneIndex() ^ 2)));
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 4)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 4)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), WaveGetLaneIndex() ^ 4)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), WaveGetLaneIndex() ^ 4)));
    a.xy = max(a.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 8)));
    a.zw = max(a.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 8)));
    b.xy = max(b.xy, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.xy), WaveGetLaneIndex() ^ 8)));
    b.zw = max(b.zw, StpH2_U1(WaveReadLaneAt(StpU1_H2(b.zw), WaveGetLaneIndex() ^ 8)));
#else
    gs_StpScratch[i] = StpU4(StpU1_H2(a.xy), StpU1_H2(a.zw), StpU1_H2(b.xy), StpU1_H2(b.zw));
    GroupMemoryBarrierWithGroupSync();

    // 2x2 Reduction
    {
        StpW1 offset = (i & ~StpW1(3));
        StpW1 a0 = offset + ((i + StpW1(1)) & StpW1(3));
        StpW1 a1 = offset + ((i + StpW1(2)) & StpW1(3));
        StpW1 a2 = offset + ((i + StpW1(3)) & StpW1(3));
        uint4 x0 = gs_StpScratch[a0];
        uint4 x1 = gs_StpScratch[a1];
        uint4 x2 = gs_StpScratch[a2];
        GroupMemoryBarrierWithGroupSync();
        a.xy = max(max(max(a.xy, StpH2_U1(x0.x)), StpH2_U1(x1.x)), StpH2_U1(x2.x));
        a.zw = max(max(max(a.zw, StpH2_U1(x0.y)), StpH2_U1(x1.y)), StpH2_U1(x2.y));
        b.xy = max(max(max(b.xy, StpH2_U1(x0.z)), StpH2_U1(x1.z)), StpH2_U1(x2.z));
        b.zw = max(max(max(b.zw, StpH2_U1(x0.w)), StpH2_U1(x1.w)), StpH2_U1(x2.w));
    }

    gs_StpScratch[i] = StpU4(StpU1_H2(a.xy), StpU1_H2(a.zw), StpU1_H2(b.xy), StpU1_H2(b.zw));
    GroupMemoryBarrierWithGroupSync();

    // 4x4 Reduction
    {
        StpW1 offset = (i & ~StpW1(15));
        StpW1 a0 = offset + ((i + StpW1(4)) & StpW1(15));
        StpW1 a1 = offset + ((i + StpW1(8)) & StpW1(15));
        StpW1 a2 = offset + ((i + StpW1(12)) & StpW1(15));
        uint4 x0 = gs_StpScratch[a0];
        uint4 x1 = gs_StpScratch[a1];
        uint4 x2 = gs_StpScratch[a2];
        GroupMemoryBarrierWithGroupSync();
        a.xy = max(max(max(a.xy, StpH2_U1(x0.x)), StpH2_U1(x1.x)), StpH2_U1(x2.x));
        a.zw = max(max(max(a.zw, StpH2_U1(x0.y)), StpH2_U1(x1.y)), StpH2_U1(x2.y));
        b.xy = max(max(max(b.xy, StpH2_U1(x0.z)), StpH2_U1(x1.z)), StpH2_U1(x2.z));
        b.zw = max(max(max(b.zw, StpH2_U1(x0.w)), StpH2_U1(x1.w)), StpH2_U1(x2.w));
    }
#endif
#endif
}
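// NOTE: The native 16-bit variant keeps data packed: StpU1_H2 / StpH2_U1 convert a half2 to and
// from a single uint, so each lane's eight halves travel through WaveReadLaneAt (or the uint4
// scratch slot) as four 32-bit words, roughly halving the shuffle/LDS traffic of the 32-bit path.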
void StpPat4x4SumH4(StpW1 i, inout StpH4 a)
{
#if defined(STP_FORCE_32BIT_REDUCTION)
    StpPat4x4SumF4(i, a);
#else
#if defined(STP_ENABLE_WAVEOPS)
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 1));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 1));
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 2));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 2));
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 4));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 4));
    a.xy += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.xy), WaveGetLaneIndex() ^ 8));
    a.zw += StpH2_U1(WaveReadLaneAt(StpU1_H2(a.zw), WaveGetLaneIndex() ^ 8));
#else
    gs_StpScratch[i].xy = StpU2(StpU1_H2(a.xy), StpU1_H2(a.zw));
    GroupMemoryBarrierWithGroupSync();

    // 2x2 Reduction
    {
        StpW1 offset = (i & ~StpW1(3));
        StpW1 a0 = offset + ((i + StpW1(1)) & StpW1(3));
        StpW1 a1 = offset + ((i + StpW1(2)) & StpW1(3));
        StpW1 a2 = offset + ((i + StpW1(3)) & StpW1(3));
        uint2 x0 = gs_StpScratch[a0].xy;
        uint2 x1 = gs_StpScratch[a1].xy;
        uint2 x2 = gs_StpScratch[a2].xy;
        GroupMemoryBarrierWithGroupSync();
        a.xy = a.xy + StpH2_U1(x0.x) + StpH2_U1(x1.x) + StpH2_U1(x2.x);
        a.zw = a.zw + StpH2_U1(x0.y) + StpH2_U1(x1.y) + StpH2_U1(x2.y);
    }

    gs_StpScratch[i].xy = StpU2(StpU1_H2(a.xy), StpU1_H2(a.zw));
    GroupMemoryBarrierWithGroupSync();

    // 4x4 Reduction
    {
        StpW1 offset = (i & ~StpW1(15));
        StpW1 a0 = offset + ((i + StpW1(4)) & StpW1(15));
        StpW1 a1 = offset + ((i + StpW1(8)) & StpW1(15));
        StpW1 a2 = offset + ((i + StpW1(12)) & StpW1(15));
        uint2 x0 = gs_StpScratch[a0].xy;
        uint2 x1 = gs_StpScratch[a1].xy;
        uint2 x2 = gs_StpScratch[a2].xy;
        GroupMemoryBarrierWithGroupSync();
        a.xy = a.xy + StpH2_U1(x0.x) + StpH2_U1(x1.x) + StpH2_U1(x2.x);
        a.zw = a.zw + StpH2_U1(x0.y) + StpH2_U1(x1.y) + StpH2_U1(x2.y);
    }
#endif
#endif
}

StpH1 StpPatPriConH(StpF2 p) { return (StpH1)SAMPLE_TEXTURE2D_X_LOD(_StpPriorConvergence, s_linear_clamp_sampler, p, 0); }

// These are separate to support inline operation (pass merged instead of loads).
StpF2 StpPatDatMotH(StpW2 o) { return LOAD_TEXTURE2D_X_LOD(_StpInputMotion, o, 0).xy; }
StpH3 StpPatDatColH(StpW2 o) { return (StpH3)LOAD_TEXTURE2D_X_LOD(_StpInputColor, o, 0).rgb; }
StpF1 StpPatDatZH(StpW2 o) { return LOAD_TEXTURE2D_X_LOD(_StpInputDepth, o, 0).x; }

// This provides a place to convert Z from depth to linear if not inlined and actually loaded.
StpF1 StpPatFixZH(StpF1 z) { return 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W); }

StpU1 StpPatDatRH(StpW2 o) {
#if defined(ENABLE_STENCIL_RESPONSIVE)
    return GetStencilValue(LOAD_TEXTURE2D_X_LOD(_StpInputStencil, o, 0).xy);
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
    return StpU1_(0); }

StpH1 StpPatFixRH(StpU1 v) {
    // Activate the "responsive" feature when we don't have valid history textures.
    bool hasValidHistory = DecodeHasValidHistory(STP_COMMON_CONSTANT);
    bool excludeTaa = false;
#if defined(ENABLE_STENCIL_RESPONSIVE)
    excludeTaa = (v & DecodeStencilMask(STP_COMMON_CONSTANT)) != 0;
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
    return (hasValidHistory && !excludeTaa) ? StpH1_(1.0) : StpH1_(0.0); }
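// NOTE: The value produced here seems to act as a per-pixel history validity flag for the pattern
// pass: it is forced to 0 ("responsive", ignore history) when the history textures are not yet
// valid for this frame, or when ENABLE_STENCIL_RESPONSIVE is on and the pixel's stencil value
// matches the responsive mask packed into STP_COMMON_CONSTANT.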
StpH1 StpPatDitH(StpW2 o) { return StpDitH1(o); }
StpH4 StpPatPriFedH(StpF2 p) { return (StpH4)SAMPLE_TEXTURE2D_X_LOD(_StpPriorFeedback, s_linear_clamp_sampler, p, 0); }
StpH4 StpPatPriFedR4H(StpF2 p) { return (StpH4)GATHER_RED_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p); }
StpH4 StpPatPriFedG4H(StpF2 p) { return (StpH4)GATHER_GREEN_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p); }
StpH4 StpPatPriFedB4H(StpF2 p) { return (StpH4)GATHER_BLUE_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p); }
StpH2 StpPatPriLumH(StpF2 p) { return (StpH2)SAMPLE_TEXTURE2D_X_LOD(_StpPriorLuma, s_linear_clamp_sampler, p, 0); }
StpU4 StpPatPriMot4H(StpF2 p) { return GATHER_RED_TEXTURE2D_X(_StpPriorDepthMotion, s_point_clamp_sampler, p); }
void StpPatStMotH(StpW2 p, StpU1 v) { _StpDepthMotion[COORD_TEXTURE2D_X(p)] = v; }
void StpPatStColH(StpW2 p, StpH4 v) { _StpIntermediateColor[COORD_TEXTURE2D_X(p)] = v; }
void StpPatStLumH(StpW2 p, StpH2 v) { _StpLuma[COORD_TEXTURE2D_X(p)] = v; }
void StpPatStCnvH(StpW2 p, StpH1 v) { _StpIntermediateConvergence[COORD_TEXTURE2D_X(p >> StpW1(2))] = v; }
#endif
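// NOTE: The functions above are the 16-bit ("H" suffix) load/store callbacks consumed by the
// shared pattern code in StpCommon.hlsl: "Dat" reads current-frame inputs, "Pri" reads
// prior-frame history, "St" writes outputs, and "Fix" post-processes a loaded value. Convergence
// is stored at quarter resolution (note the p >> 2 in StpPatStCnvH). The STP_32BIT block below
// provides the equivalent "F" suffixed callbacks.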
#if defined(STP_32BIT)
StpMF1 StpPatPriConF(StpF2 p) { return (StpMF1)SAMPLE_TEXTURE2D_X_LOD(_StpPriorConvergence, s_linear_clamp_sampler, p, 0); }

// These are separate to support inline operation (pass merged instead of loads).
StpF2 StpPatDatMotF(StpMU2 o) { return LOAD_TEXTURE2D_X_LOD(_StpInputMotion, o, 0).xy; }
StpMF3 StpPatDatColF(StpMU2 o) { return (StpMF3)LOAD_TEXTURE2D_X_LOD(_StpInputColor, o, 0).rgb; }
StpF1 StpPatDatZF(StpMU2 o) { return LOAD_TEXTURE2D_X_LOD(_StpInputDepth, o, 0).x; }

// This provides a place to convert Z from depth to linear if not inlined and actually loaded.
StpF1 StpPatFixZF(StpF1 z) { return 1.0 / (STP_ZBUFFER_PARAMS_Z * z + STP_ZBUFFER_PARAMS_W); }

StpU1 StpPatDatRF(StpMU2 o) {
#if defined(ENABLE_STENCIL_RESPONSIVE)
    return GetStencilValue(LOAD_TEXTURE2D_X_LOD(_StpInputStencil, o, 0).xy);
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
    return StpU1_(0); }

StpMF1 StpPatFixRF(StpU1 v) {
    // Activate the "responsive" feature when we don't have valid history textures.
    bool hasValidHistory = DecodeHasValidHistory(STP_COMMON_CONSTANT);
    bool excludeTaa = false;
#if defined(ENABLE_STENCIL_RESPONSIVE)
    excludeTaa = (v & DecodeStencilMask(STP_COMMON_CONSTANT)) != 0;
#endif // defined(ENABLE_STENCIL_RESPONSIVE)
    return (hasValidHistory && !excludeTaa) ? StpMF1_(1.0) : StpMF1_(0.0); }

StpMF1 StpPatDitF(StpMU2 o) { return (StpMF1)StpDitF1(o); }
StpMF4 StpPatPriFedF(StpF2 p) { return (StpMF4)SAMPLE_TEXTURE2D_X_LOD(_StpPriorFeedback, s_linear_clamp_sampler, p, 0); }
StpMF4 StpPatPriFedR4F(StpF2 p) { return (StpMF4)GATHER_RED_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p); }
StpMF4 StpPatPriFedG4F(StpF2 p) { return (StpMF4)GATHER_GREEN_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p); }
StpMF4 StpPatPriFedB4F(StpF2 p) { return (StpMF4)GATHER_BLUE_TEXTURE2D_X(_StpPriorFeedback, s_point_clamp_sampler, p); }
StpMF2 StpPatPriLumF(StpF2 p) { return (StpMF2)SAMPLE_TEXTURE2D_X_LOD(_StpPriorLuma, s_linear_clamp_sampler, p, 0); }
StpU4 StpPatPriMot4F(StpF2 p) { return GATHER_RED_TEXTURE2D_X(_StpPriorDepthMotion, s_point_clamp_sampler, p); }
void StpPatStMotF(StpMU2 p, StpU1 v) { _StpDepthMotion[COORD_TEXTURE2D_X(p)] = v; }
void StpPatStColF(StpMU2 p, StpMF4 v) { _StpIntermediateColor[COORD_TEXTURE2D_X(p)] = v; }
void StpPatStLumF(StpMU2 p, StpMF2 v) { _StpLuma[COORD_TEXTURE2D_X(p)] = v; }
void StpPatStCnvF(StpMU2 p, StpMF1 v) { _StpIntermediateConvergence[COORD_TEXTURE2D_X(p >> StpMU1(2))] = v; }
#endif

#define THREADING_BLOCK_SIZE STP_GROUP_SIZE
#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/Threading.hlsl"
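// NOTE: THREADING_BLOCK_SIZE presumably tells Threading.hlsl the flattened thread-group size so
// that Threading::Group (used below) can expose group index/ID values for a STP_GROUP_SIZE x 1 x 1
// group, matching the [numthreads] declaration.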
[numthreads(STP_GROUP_SIZE, 1, 1)]
void StpSetup(Threading::Group group)
{
    UNITY_XR_ASSIGN_VIEW_INDEX(group.groupID.z);

#if defined(STP_16BIT)
    StpW1 lane = StpW1_(group.groupIndex);
    StpW2 groupPos = ComputeGroupPos(StpW2(group.groupID.xy));
    StpW2 pos = groupPos + StpRemapLaneTo8x16H(lane);
#else
    StpMU1 lane = StpMU1_(group.groupIndex);
    StpMU2 groupPos = ComputeGroupPos(StpMU2(group.groupID.xy));
    StpMU2 pos = groupPos + StpRemapLaneTo8x16F(lane);
#endif

#if defined(STP_16BIT)
    StpPatH(
        lane,
        pos,
#else
    StpPatF(
        lane,
        pos,
#endif
        asuint(_StpSetupConstants0),
        asuint(_StpSetupConstants1),
        asuint(_StpSetupConstants2),
        asuint(_StpSetupConstants3),
        asuint(_StpSetupConstants4),
        asuint(_StpSetupConstants5),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 0]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 1]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 2]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 3]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 4]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 5]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 6]),
        asuint(_StpSetupPerViewConstants[STP_SETUP_PER_VIEW_CONSTANTS_STEREO_OFFSET + 7])
    );
}
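// NOTE: The kernel itself only selects the XR view, remaps the flat lane index onto the per-group
// pixel footprint implied by StpRemapLaneTo8x16H/F, and forwards the per-draw and per-view
// constants (reinterpreted as uint via asuint) to StpPatH/StpPatF, which perform the actual setup
// work through the StpPat* callbacks defined above.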