Нет описания
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. using System;
  2. using System.Diagnostics;
  3. namespace Unity.Burst.Intrinsics
  4. {
  5. public unsafe static partial class X86
  6. {
  7. /// <summary>
  8. /// SSE 4.1 intrinsics
  9. /// </summary>
  10. public static class Sse4_1
  11. {
  /// <summary>
  /// Reports whether SSE 4.1 intrinsics are supported (evaluated at compile time).
  /// This managed definition always yields false.
  /// </summary>
  public static bool IsSse41Supported
  {
      get
      {
          return false;
      }
  }
  // _mm_stream_load_si128
  /// <summary>
  /// Load 128 bits of integer data from memory into dst using a non-temporal memory hint.
  /// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
  /// The managed fallback simply forwards to GenericCSharpLoad.
  /// </summary>
  /// <param name="mem_addr">Memory address to read from</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v128 stream_load_si128(void* mem_addr)
  {
      v128 loaded = GenericCSharpLoad(mem_addr);
      return loaded;
  }
  // _mm_blend_pd
  /// <summary>
  /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8".
  /// Lane j of the result is taken from "b" when bit j of "imm8" is set, otherwise from "a".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask (bits 0 and 1 are used)</param>
  /// <returns>Blended vector</returns>
  [DebuggerStepThrough]
  public static v128 blend_pd(v128 a, v128 b, int imm8)
  {
      v128 result = default(v128);
      double* outLanes = &result.Double0;
      double* fromA = &a.Double0;
      double* fromB = &b.Double0;
      for (int lane = 0; lane < 2; ++lane)
      {
          bool takeB = (imm8 & (1 << lane)) != 0;
          outLanes[lane] = takeB ? fromB[lane] : fromA[lane];
      }
      return result;
  }
  // _mm_blend_ps
  /// <summary>
  /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8".
  /// Lane j of the result is taken from "b" when bit j of "imm8" is set, otherwise from "a".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask (bits 0 to 3 are used)</param>
  /// <returns>Blended vector</returns>
  [DebuggerStepThrough]
  public static v128 blend_ps(v128 a, v128 b, int imm8)
  {
      v128 result = default(v128);
      // This implementation is also used for blend_epi32.
      // Moving the lanes through float* mishandles integer inputs whose bit pattern happens to be a NaN;
      // this was observed on Mono only (IL2CPP and .NET behave correctly).
      // The lanes are therefore shuffled as 32-bit integers.
      int* outLanes = &result.SInt0;
      int* fromA = &a.SInt0;
      int* fromB = &b.SInt0;
      for (int lane = 0; lane < 4; ++lane)
      {
          bool takeB = (imm8 & (1 << lane)) != 0;
          outLanes[lane] = takeB ? fromB[lane] : fromA[lane];
      }
      return result;
  }
  // _mm_blendv_pd
  /// <summary>
  /// Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask".
  /// Lane j of the result is taken from "b" when the 64-bit mask lane j is negative (sign bit set), otherwise from "a".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="mask">Mask; only the sign bit of each 64-bit lane matters</param>
  /// <returns>Blended vector</returns>
  [DebuggerStepThrough]
  public static v128 blendv_pd(v128 a, v128 b, v128 mask)
  {
      v128 result = default(v128);
      double* outLanes = &result.Double0;
      double* fromA = &a.Double0;
      double* fromB = &b.Double0;
      long* selector = &mask.SLong0;
      for (int lane = 0; lane < 2; ++lane)
      {
          outLanes[lane] = selector[lane] < 0 ? fromB[lane] : fromA[lane];
      }
      return result;
  }
  // _mm_blendv_ps
  /// <summary>
  /// Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask".
  /// Lane j of the result is taken from "b" when the 32-bit mask lane j is negative (sign bit set), otherwise from "a".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="mask">Mask; only the sign bit of each 32-bit lane matters</param>
  /// <returns>Blended vector</returns>
  [DebuggerStepThrough]
  public static v128 blendv_ps(v128 a, v128 b, v128 mask)
  {
      v128 dst = default(v128);
      // Copy the lanes as raw 32-bit integers instead of floats so that every bit pattern
      // (including NaN encodings) is transferred verbatim. blend_ps does the same because
      // float copies were seen to mishandle such values on Mono.
      int* dptr = &dst.SInt0;
      int* aptr = &a.SInt0;
      int* bptr = &b.SInt0;
      int* mptr = &mask.SInt0;
      for (int j = 0; j <= 3; j++)
      {
          // A negative 32-bit lane means its most significant (sign) bit is set.
          dptr[j] = mptr[j] < 0 ? bptr[j] : aptr[j];
      }
      return dst;
  }
  // _mm_blendv_epi8
  /// <summary>
  /// Blend packed 8-bit integers from "a" and "b" using "mask".
  /// Byte j of the result is taken from "b" when mask byte j is negative (sign bit set), otherwise from "a".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="mask">Mask; only the sign bit of each byte matters</param>
  /// <returns>Blended vector</returns>
  [DebuggerStepThrough]
  public static v128 blendv_epi8(v128 a, v128 b, v128 mask)
  {
      v128 result = default(v128);
      byte* outBytes = &result.Byte0;
      byte* fromA = &a.Byte0;
      byte* fromB = &b.Byte0;
      sbyte* selector = &mask.SByte0;
      for (int k = 0; k < 16; ++k)
      {
          outBytes[k] = selector[k] < 0 ? fromB[k] : fromA[k];
      }
      return result;
  }
  // _mm_blend_epi16
  /// <summary>
  /// Blend packed 16-bit integers from "a" and "b" using control mask "imm8".
  /// Lane j of the result is taken from "b" when bit j of "imm8" is set, otherwise from "a".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask (bits 0 to 7 are used)</param>
  /// <returns>Blended vector</returns>
  [DebuggerStepThrough]
  public static v128 blend_epi16(v128 a, v128 b, int imm8)
  {
      v128 result = default(v128);
      short* outLanes = &result.SShort0;
      short* fromA = &a.SShort0;
      short* fromB = &b.SShort0;
      for (int lane = 0; lane < 8; ++lane)
      {
          bool takeB = ((imm8 >> lane) & 1) != 0;
          outLanes[lane] = takeB ? fromB[lane] : fromA[lane];
      }
      return result;
  }
  // _mm_dp_pd
  /// <summary>
  /// Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b"
  /// (bits 4 and 5 of "imm8" enable the products of lanes 0 and 1), sum the two products,
  /// and conditionally store the sum in each lane of "dst" (bits 0 and 1 of "imm8"); unselected lanes are 0.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask: bits 4-5 select the products, bits 0-1 select the destination lanes</param>
  /// <returns>Vector holding the dot product in the selected lanes</returns>
  [DebuggerStepThrough]
  public static v128 dp_pd(v128 a, v128 b, int imm8)
  {
      double product0 = 0.0;
      if ((imm8 & 0x10) != 0)
      {
          product0 = a.Double0 * b.Double0;
      }

      double product1 = 0.0;
      if ((imm8 & 0x20) != 0)
      {
          product1 = a.Double1 * b.Double1;
      }

      double total = product0 + product1;

      v128 result = default(v128);
      result.Double0 = (imm8 & 1) == 0 ? 0.0 : total;
      result.Double1 = (imm8 & 2) == 0 ? 0.0 : total;
      return result;
  }
  // _mm_dp_ps
  /// <summary>
  /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b"
  /// using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst"
  /// using the low 4 bits of "imm8"; unselected destination lanes are 0.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask: bits 4-7 select the products, bits 0-3 select the destination lanes</param>
  /// <returns>Vector holding the dot product in the selected lanes</returns>
  [DebuggerStepThrough]
  public static v128 dp_ps(v128 a, v128 b, int imm8)
  {
      float t0 = (imm8 & 0x10) != 0 ? a.Float0 * b.Float0 : 0.0f;
      float t1 = (imm8 & 0x20) != 0 ? a.Float1 * b.Float1 : 0.0f;
      float t2 = (imm8 & 0x40) != 0 ? a.Float2 * b.Float2 : 0.0f;
      float t3 = (imm8 & 0x80) != 0 ? a.Float3 * b.Float3 : 0.0f;

      // The DPPS instruction adds the products pairwise: (t0 + t1) + (t2 + t3).
      // Floating-point addition is not associative, so the same grouping is used here
      // to keep the rounding of this reference implementation in line with hardware.
      float lowPair = t0 + t1;
      float highPair = t2 + t3;
      float sum = lowPair + highPair;

      v128 dst = default(v128);
      dst.Float0 = (imm8 & 1) != 0 ? sum : 0.0f;
      dst.Float1 = (imm8 & 2) != 0 ? sum : 0.0f;
      dst.Float2 = (imm8 & 4) != 0 ? sum : 0.0f;
      dst.Float3 = (imm8 & 8) != 0 ? sum : 0.0f;
      return dst;
  }
  // _mm_extract_ps
  /// <summary>
  /// Extract a single-precision (32-bit) floating-point element from "a", selected with the low 2 bits of "imm8".
  /// The element's raw bits are returned as an integer.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Lane selector (only the low 2 bits are used)</param>
  /// <returns>Bit pattern of the selected lane as an integer</returns>
  [DebuggerStepThrough]
  public static int extract_ps(v128 a, int imm8)
  {
      int lane = imm8 & 0x3;
      return *(&a.SInt0 + lane);
  }
  // unity extension
  /// <summary>
  /// Extract a single-precision (32-bit) floating-point element from "a", selected with the low 2 bits of "imm8",
  /// and return it as a float.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Lane selector (only the low 2 bits are used)</param>
  /// <returns>The selected lane as a float</returns>
  [DebuggerStepThrough]
  public static float extractf_ps(v128 a, int imm8)
  {
      int lane = imm8 & 0x3;
      return *(&a.Float0 + lane);
  }
  // _mm_extract_epi8
  /// <summary>
  /// Extract an 8-bit integer from "a", selected with the low 4 bits of "imm8".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Byte selector (only the low 4 bits are used)</param>
  /// <returns>The selected byte</returns>
  [DebuggerStepThrough]
  public static byte extract_epi8(v128 a, int imm8)
  {
      int index = imm8 & 0xf;
      return *(&a.Byte0 + index);
  }
  // _mm_extract_epi32
  /// <summary>
  /// Extract a 32-bit integer from "a", selected with the low 2 bits of "imm8".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Lane selector (only the low 2 bits are used)</param>
  /// <returns>The selected 32-bit integer</returns>
  [DebuggerStepThrough]
  public static int extract_epi32(v128 a, int imm8)
  {
      int lane = imm8 & 0x3;
      return *(&a.SInt0 + lane);
  }
  // _mm_extract_epi64
  /// <summary>
  /// Extract a 64-bit integer from "a", selected with the lowest bit of "imm8".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Lane selector (only the lowest bit is used)</param>
  /// <returns>The selected 64-bit integer</returns>
  [DebuggerStepThrough]
  public static long extract_epi64(v128 a, int imm8)
  {
      int lane = imm8 & 0x1;
      return *(&a.SLong0 + lane);
  }
  // _mm_insert_ps
  /// <summary>
  /// Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp"
  /// using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8"
  /// (elements are zeroed out when the corresponding bit is set).
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask: bits 6-7 select the source lane of "b", bits 4-5 select the destination lane, bits 0-3 zero the matching result lanes</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 insert_ps(v128 a, v128 b, int imm8)
  {
      v128 dst = a;
      // Move the lane as a raw 32-bit integer rather than a float so that every bit pattern
      // (including NaN encodings) is transferred verbatim. blend_ps does the same because
      // float copies were seen to mishandle such values on Mono.
      int* dptr = &dst.SInt0;
      int* bptr = &b.SInt0;
      dptr[(imm8 >> 4) & 3] = bptr[(imm8 >> 6) & 3];
      for (int i = 0; i < 4; ++i)
      {
          if (0 != (imm8 & (1 << i)))
          {
              // All-zero bits are the encoding of +0.0f.
              dptr[i] = 0;
          }
      }
      return dst;
  }
  // _mm_insert_epi8
  /// <summary>
  /// Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the byte position given by the low 4 bits of "imm8".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="i">8-bit integer to insert</param>
  /// <param name="imm8">Byte position (only the low 4 bits are used)</param>
  /// <returns>Copy of "a" with one byte replaced</returns>
  [DebuggerStepThrough]
  public static v128 insert_epi8(v128 a, byte i, int imm8)
  {
      v128 result = a;
      byte* bytes = &result.Byte0;
      int position = imm8 & 0xf;
      bytes[position] = i;
      return result;
  }
  // _mm_insert_epi32
  /// <summary>
  /// Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the lane given by the low 2 bits of "imm8".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="i">32-bit integer to insert</param>
  /// <param name="imm8">Lane position (only the low 2 bits are used)</param>
  /// <returns>Copy of "a" with one 32-bit lane replaced</returns>
  [DebuggerStepThrough]
  public static v128 insert_epi32(v128 a, int i, int imm8)
  {
      v128 result = a;
      int* lanes = &result.SInt0;
      int position = imm8 & 0x3;
      lanes[position] = i;
      return result;
  }
  // _mm_insert_epi64
  /// <summary>
  /// Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the lane given by the lowest bit of "imm8".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="i">64-bit integer to insert</param>
  /// <param name="imm8">Lane position (only the lowest bit is used)</param>
  /// <returns>Copy of "a" with one 64-bit lane replaced</returns>
  [DebuggerStepThrough]
  public static v128 insert_epi64(v128 a, long i, int imm8)
  {
      v128 result = a;
      long* lanes = &result.SLong0;
      int position = imm8 & 0x1;
      lanes[position] = i;
      return result;
  }
  // _mm_max_epi8
  /// <summary>
  /// Compare packed signed 8-bit integers in "a" and "b", and store the per-lane maximum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane maxima</returns>
  [DebuggerStepThrough]
  public static v128 max_epi8(v128 a, v128 b)
  {
      v128 result = default(v128);
      sbyte* outLanes = &result.SByte0;
      sbyte* lhs = &a.SByte0;
      sbyte* rhs = &b.SByte0;
      for (int k = 0; k < 16; ++k)
      {
          sbyte x = lhs[k];
          sbyte y = rhs[k];
          outLanes[k] = x > y ? x : y;
      }
      return result;
  }
  // _mm_max_epi32
  /// <summary>
  /// Compare packed signed 32-bit integers in "a" and "b", and store the per-lane maximum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane maxima</returns>
  [DebuggerStepThrough]
  public static v128 max_epi32(v128 a, v128 b)
  {
      v128 result = default(v128);
      int* outLanes = &result.SInt0;
      int* lhs = &a.SInt0;
      int* rhs = &b.SInt0;
      for (int k = 0; k < 4; ++k)
      {
          int x = lhs[k];
          int y = rhs[k];
          outLanes[k] = x > y ? x : y;
      }
      return result;
  }
  // _mm_max_epu32
  /// <summary>
  /// Compare packed unsigned 32-bit integers in "a" and "b", and store the per-lane maximum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane maxima</returns>
  [DebuggerStepThrough]
  public static v128 max_epu32(v128 a, v128 b)
  {
      v128 result = default(v128);
      uint* outLanes = &result.UInt0;
      uint* lhs = &a.UInt0;
      uint* rhs = &b.UInt0;
      for (int k = 0; k < 4; ++k)
      {
          uint x = lhs[k];
          uint y = rhs[k];
          outLanes[k] = x > y ? x : y;
      }
      return result;
  }
  // _mm_max_epu16
  /// <summary>
  /// Compare packed unsigned 16-bit integers in "a" and "b", and store the per-lane maximum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane maxima</returns>
  [DebuggerStepThrough]
  public static v128 max_epu16(v128 a, v128 b)
  {
      v128 result = default(v128);
      ushort* outLanes = &result.UShort0;
      ushort* lhs = &a.UShort0;
      ushort* rhs = &b.UShort0;
      for (int k = 0; k < 8; ++k)
      {
          ushort x = lhs[k];
          ushort y = rhs[k];
          outLanes[k] = x > y ? x : y;
      }
      return result;
  }
  // _mm_min_epi8
  /// <summary>
  /// Compare packed signed 8-bit integers in "a" and "b", and store the per-lane minimum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane minima</returns>
  [DebuggerStepThrough]
  public static v128 min_epi8(v128 a, v128 b)
  {
      v128 result = default(v128);
      sbyte* outLanes = &result.SByte0;
      sbyte* lhs = &a.SByte0;
      sbyte* rhs = &b.SByte0;
      for (int k = 0; k < 16; ++k)
      {
          sbyte x = lhs[k];
          sbyte y = rhs[k];
          outLanes[k] = x < y ? x : y;
      }
      return result;
  }
  // _mm_min_epi32
  /// <summary>
  /// Compare packed signed 32-bit integers in "a" and "b", and store the per-lane minimum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane minima</returns>
  [DebuggerStepThrough]
  public static v128 min_epi32(v128 a, v128 b)
  {
      v128 result = default(v128);
      int* outLanes = &result.SInt0;
      int* lhs = &a.SInt0;
      int* rhs = &b.SInt0;
      for (int k = 0; k < 4; ++k)
      {
          int x = lhs[k];
          int y = rhs[k];
          outLanes[k] = x < y ? x : y;
      }
      return result;
  }
  // _mm_min_epu32
  /// <summary>
  /// Compare packed unsigned 32-bit integers in "a" and "b", and store the per-lane minimum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane minima</returns>
  [DebuggerStepThrough]
  public static v128 min_epu32(v128 a, v128 b)
  {
      v128 result = default(v128);
      uint* outLanes = &result.UInt0;
      uint* lhs = &a.UInt0;
      uint* rhs = &b.UInt0;
      for (int k = 0; k < 4; ++k)
      {
          uint x = lhs[k];
          uint y = rhs[k];
          outLanes[k] = x < y ? x : y;
      }
      return result;
  }
  // _mm_min_epu16
  /// <summary>
  /// Compare packed unsigned 16-bit integers in "a" and "b", and store the per-lane minimum in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane minima</returns>
  [DebuggerStepThrough]
  public static v128 min_epu16(v128 a, v128 b)
  {
      v128 result = default(v128);
      ushort* outLanes = &result.UShort0;
      ushort* lhs = &a.UShort0;
      ushort* rhs = &b.UShort0;
      for (int k = 0; k < 8; ++k)
      {
          ushort x = lhs[k];
          ushort y = rhs[k];
          outLanes[k] = x < y ? x : y;
      }
      return result;
  }
  // _mm_packus_epi32
  /// <summary>
  /// Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation.
  /// The four lanes of "a" fill result lanes 0-3 and the four lanes of "b" fill result lanes 4-7.
  /// </summary>
  /// <param name="a">Vector a (source of the low four results)</param>
  /// <param name="b">Vector b (source of the high four results)</param>
  /// <returns>Vector of eight saturated 16-bit values</returns>
  [DebuggerStepThrough]
  public static v128 packus_epi32(v128 a, v128 b)
  {
      v128 result = default(v128);

      // Low half: the saturated lanes of "a".
      result.UShort0 = Saturate_To_UnsignedInt16(a.SInt0);
      result.UShort1 = Saturate_To_UnsignedInt16(a.SInt1);
      result.UShort2 = Saturate_To_UnsignedInt16(a.SInt2);
      result.UShort3 = Saturate_To_UnsignedInt16(a.SInt3);

      // High half: the saturated lanes of "b".
      result.UShort4 = Saturate_To_UnsignedInt16(b.SInt0);
      result.UShort5 = Saturate_To_UnsignedInt16(b.SInt1);
      result.UShort6 = Saturate_To_UnsignedInt16(b.SInt2);
      result.UShort7 = Saturate_To_UnsignedInt16(b.SInt3);

      return result;
  }
  // _mm_cmpeq_epi64
  /// <summary>
  /// Compare packed 64-bit integers in "a" and "b" for equality.
  /// Each result lane is all ones (-1) when the lanes are equal and zero otherwise.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of per-lane comparison masks</returns>
  [DebuggerStepThrough]
  public static v128 cmpeq_epi64(v128 a, v128 b)
  {
      long lowMask = 0L;
      if (a.SLong0 == b.SLong0)
      {
          lowMask = -1L;
      }

      long highMask = 0L;
      if (a.SLong1 == b.SLong1)
      {
          highMask = -1L;
      }

      v128 result = default(v128);
      result.SLong0 = lowMask;
      result.SLong1 = highMask;
      return result;
  }
  // _mm_cvtepi8_epi16
  /// <summary>
  /// Sign extend the eight lowest packed 8-bit integers in "a" to packed 16-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of eight sign-extended 16-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepi8_epi16(v128 a)
  {
      v128 result = default(v128);
      short* widened = &result.SShort0;
      sbyte* narrow = &a.SByte0;
      for (int k = 0; k < 8; ++k)
      {
          widened[k] = (short)narrow[k];
      }
      return result;
  }
  // _mm_cvtepi8_epi32
  /// <summary>
  /// Sign extend the four lowest packed 8-bit integers in "a" to packed 32-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of four sign-extended 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepi8_epi32(v128 a)
  {
      v128 result = default(v128);
      int* widened = &result.SInt0;
      sbyte* narrow = &a.SByte0;
      for (int k = 0; k < 4; ++k)
      {
          widened[k] = (int)narrow[k];
      }
      return result;
  }
  // _mm_cvtepi8_epi64
  /// <summary>
  /// Sign extend the two lowest packed 8-bit integers in "a" to packed 64-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of two sign-extended 64-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepi8_epi64(v128 a)
  {
      v128 result = default(v128);
      long* widened = &result.SLong0;
      sbyte* narrow = &a.SByte0;
      for (int k = 0; k < 2; ++k)
      {
          widened[k] = (long)narrow[k];
      }
      return result;
  }
  // _mm_cvtepi16_epi32
  /// <summary>
  /// Sign extend the four lowest packed 16-bit integers in "a" to packed 32-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of four sign-extended 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepi16_epi32(v128 a)
  {
      v128 result = default(v128);
      int* widened = &result.SInt0;
      short* narrow = &a.SShort0;
      for (int k = 0; k < 4; ++k)
      {
          widened[k] = (int)narrow[k];
      }
      return result;
  }
  // _mm_cvtepi16_epi64
  /// <summary>
  /// Sign extend the two lowest packed 16-bit integers in "a" to packed 64-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of two sign-extended 64-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepi16_epi64(v128 a)
  {
      v128 result = default(v128);
      long* widened = &result.SLong0;
      short* narrow = &a.SShort0;
      for (int k = 0; k < 2; ++k)
      {
          widened[k] = (long)narrow[k];
      }
      return result;
  }
  // _mm_cvtepi32_epi64
  /// <summary>
  /// Sign extend the two lowest packed 32-bit integers in "a" to packed 64-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of two sign-extended 64-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepi32_epi64(v128 a)
  {
      v128 result = default(v128);
      long* widened = &result.SLong0;
      int* narrow = &a.SInt0;
      for (int k = 0; k < 2; ++k)
      {
          widened[k] = (long)narrow[k];
      }
      return result;
  }
  // _mm_cvtepu8_epi16
  /// <summary>
  /// Zero extend the eight lowest packed unsigned 8-bit integers in "a" to packed 16-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of eight zero-extended 16-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepu8_epi16(v128 a)
  {
      v128 result = default(v128);
      short* widened = &result.SShort0;
      byte* narrow = &a.Byte0;
      for (int k = 0; k < 8; ++k)
      {
          widened[k] = (short)narrow[k];
      }
      return result;
  }
  // _mm_cvtepu8_epi32
  /// <summary>
  /// Zero extend the four lowest packed unsigned 8-bit integers in "a" to packed 32-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of four zero-extended 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepu8_epi32(v128 a)
  {
      v128 result = default(v128);
      int* widened = &result.SInt0;
      byte* narrow = &a.Byte0;
      for (int k = 0; k < 4; ++k)
      {
          widened[k] = (int)narrow[k];
      }
      return result;
  }
  // _mm_cvtepu8_epi64
  /// <summary>
  /// Zero extend the two lowest packed unsigned 8-bit integers in "a" to packed 64-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of two zero-extended 64-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepu8_epi64(v128 a)
  {
      v128 result = default(v128);
      long* widened = &result.SLong0;
      byte* narrow = &a.Byte0;
      for (int k = 0; k < 2; ++k)
      {
          widened[k] = (long)narrow[k];
      }
      return result;
  }
  // _mm_cvtepu16_epi32
  /// <summary>
  /// Zero extend the four lowest packed unsigned 16-bit integers in "a" to packed 32-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of four zero-extended 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepu16_epi32(v128 a)
  {
      v128 result = default(v128);
      int* widened = &result.SInt0;
      ushort* narrow = &a.UShort0;
      for (int k = 0; k < 4; ++k)
      {
          widened[k] = (int)narrow[k];
      }
      return result;
  }
  // _mm_cvtepu16_epi64
  /// <summary>
  /// Zero extend the two lowest packed unsigned 16-bit integers in "a" to packed 64-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of two zero-extended 64-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepu16_epi64(v128 a)
  {
      v128 result = default(v128);
      long* widened = &result.SLong0;
      ushort* narrow = &a.UShort0;
      for (int k = 0; k < 2; ++k)
      {
          widened[k] = (long)narrow[k];
      }
      return result;
  }
  // _mm_cvtepu32_epi64
  /// <summary>
  /// Zero extend the two lowest packed unsigned 32-bit integers in "a" to packed 64-bit integers.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector of two zero-extended 64-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 cvtepu32_epi64(v128 a)
  {
      v128 result = default(v128);
      long* widened = &result.SLong0;
      uint* narrow = &a.UInt0;
      for (int k = 0; k < 2; ++k)
      {
          widened[k] = (long)narrow[k];
      }
      return result;
  }
  // _mm_mul_epi32
  /// <summary>
  /// Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b"
  /// (32-bit lanes 0 and 2), and store the signed 64-bit products in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of two signed 64-bit products</returns>
  [DebuggerStepThrough]
  public static v128 mul_epi32(v128 a, v128 b)
  {
      // Widen one operand first so the multiplication is carried out in 64 bits.
      long lowProduct = (long)a.SInt0 * b.SInt0;
      long highProduct = (long)a.SInt2 * b.SInt2;

      v128 result = default(v128);
      result.SLong0 = lowProduct;
      result.SLong1 = highProduct;
      return result;
  }
  // _mm_mullo_epi32
  /// <summary>
  /// Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers,
  /// and store the low 32 bits of the intermediate integers in "dst".
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector of four truncated 32-bit products</returns>
  [DebuggerStepThrough]
  public static v128 mullo_epi32(v128 a, v128 b)
  {
      v128 dst = default(v128);
      int* dptr = &dst.SInt0;
      int* aptr = &a.SInt0;
      int* bptr = &b.SInt0;
      for (int j = 0; j <= 3; j++)
      {
          // Wrap-around is the intended result (only the low 32 bits are kept), so force
          // unchecked arithmetic even when the assembly is compiled with overflow checking.
          dptr[j] = unchecked(aptr[j] * bptr[j]);
      }
      return dst;
  }
  // _mm_testz_si128
  /// <summary>
  /// Compute the bitwise AND of the 128 bits in "a" and "b" and return the "ZF" value:
  /// 1 if the AND result is zero, otherwise 0.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>ZF value</returns>
  [DebuggerStepThrough]
  public static int testz_si128(v128 a, v128 b)
  {
      long lowAnd = a.SLong0 & b.SLong0;
      long highAnd = a.SLong1 & b.SLong1;
      if (lowAnd != 0 || highAnd != 0)
      {
          return 0;
      }
      return 1;
  }
  // _mm_testc_si128
  /// <summary>
  /// Compute the bitwise NOT of "a" ANDed with "b" over all 128 bits and return the "CF" value:
  /// 1 if that result is zero, otherwise 0.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>CF value</returns>
  [DebuggerStepThrough]
  public static int testc_si128(v128 a, v128 b)
  {
      long lowAndNot = ~a.SLong0 & b.SLong0;
      long highAndNot = ~a.SLong1 & b.SLong1;
      if (lowAndNot != 0 || highAndNot != 0)
      {
          return 0;
      }
      return 1;
  }
  // _mm_testnzc_si128
  /// <summary>
  /// Compute "ZF" (1 if the bitwise AND of "a" and "b" is zero) and "CF" (1 if the bitwise NOT of "a"
  /// ANDed with "b" is zero) over all 128 bits. Return 1 if both "ZF" and "CF" are zero, otherwise return 0.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Boolean result</returns>
  [DebuggerStepThrough]
  public static int testnzc_si128(v128 a, v128 b)
  {
      bool andIsZero = (a.SLong0 & b.SLong0) == 0 && (a.SLong1 & b.SLong1) == 0;
      bool andNotIsZero = (~a.SLong0 & b.SLong0) == 0 && (~a.SLong1 & b.SLong1) == 0;
      if (andIsZero || andNotIsZero)
      {
          return 0;
      }
      return 1;
  }
  // _mm_test_all_zeros
  /// <summary>Returns 1 when the bitwise AND of "a" and "mask" is zero over all 128 bits, otherwise 0. Forwards to <see cref="testz_si128"/>.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="mask">Mask</param>
  /// <returns>Boolean result</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static int test_all_zeros(v128 a, v128 mask) => testz_si128(a, mask);
  // _mm_test_mix_ones_zeros
  /// <summary>Returns 1 when neither the AND of "a" and "mask" nor the AND of NOT "a" and "mask" is zero over all 128 bits (both "ZF" and "CF" clear), otherwise 0. Forwards to <see cref="testnzc_si128"/>.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="mask">Mask</param>
  /// <returns>Boolean result</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static int test_mix_ones_zeroes(v128 a, v128 mask) => testnzc_si128(a, mask);
  // _mm_test_all_ones
  /// <summary>Computes the bitwise NOT of "a", ANDs it with a 128-bit vector containing all 1's, and returns 1 if the result is zero, otherwise 0.</summary>
  /// <param name="a">Vector a</param>
  /// <returns>Boolean result</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static int test_all_ones(v128 a)
  {
      // Comparing "a" with itself lane-wise is used to build the all-ones mask.
      v128 allOnes = Sse2.cmpeq_epi32(a, a);
      return testc_si128(a, allOnes);
  }
  // Wrapper for C# reference mode to handle FROUND_xxx.
  //
  // Rounds "d" to an integral value. The low three bits of "roundingMode" select
  // the behaviour:
  //   0 -> Math.Round (nearest), 1 -> Math.Floor, 2 -> Math.Ceiling, 3 -> Math.Truncate,
  //   4..7 -> use the rounding control bits currently held in MXCSR instead.
  // Any higher bits of "roundingMode" are ignored.
  //
  // Rounding up always goes through the same branch, so a negative input that
  // rounds up to zero yields negative zero regardless of whether "round up" was
  // requested explicitly or picked up from MXCSR.
  private static double RoundDImpl(double d, int roundingMode)
  {
      int mode = roundingMode & 7;
      if (mode > 3)
      {
          // Resolve the effective mode from the current MXCSR rounding control.
          switch (MXCSR & MXCSRBits.RoundingControlMask)
          {
              case MXCSRBits.RoundToNearest: mode = 0; break;
              case MXCSRBits.RoundDown: mode = 1; break;
              case MXCSRBits.RoundUp: mode = 2; break;
              default: mode = 3; break;
          }
      }

      switch (mode)
      {
          case 0: return Math.Round(d);
          case 1: return Math.Floor(d);
          case 2:
          {
              double r = Math.Ceiling(d);
              if (r == 0.0 && d < 0.0)
              {
                  // Emulate intel's ceil rounding to zero leaving the data at negative zero
                  return new v128(0x8000_0000_0000_0000).Double0;
              }
              return r;
          }
          default: return Math.Truncate(d);
      }
  }
  // _mm_round_pd
  /// <summary>Rounds both packed double-precision (64-bit) floating-point elements of "a" according to the "rounding" parameter and returns them as packed double-precision elements.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="rounding">Rounding mode</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 round_pd(v128 a, int rounding)
  {
      double lower = RoundDImpl(a.Double0, rounding);
      double upper = RoundDImpl(a.Double1, rounding);

      v128 result = default(v128);
      result.Double0 = lower;
      result.Double1 = upper;
      return result;
  }
  // _mm_floor_pd
  /// <summary>Rounds the packed double-precision (64-bit) floating-point elements of "a" down to integer values by calling <see cref="round_pd"/> with <c>RoundingMode.FROUND_FLOOR</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 floor_pd(v128 a) => round_pd(a, (int)RoundingMode.FROUND_FLOOR);
  // _mm_ceil_pd
  /// <summary>Rounds the packed double-precision (64-bit) floating-point elements of "a" up to integer values by calling <see cref="round_pd"/> with <c>RoundingMode.FROUND_CEIL</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 ceil_pd(v128 a) => round_pd(a, (int)RoundingMode.FROUND_CEIL);
  // _mm_round_ps
  /// <summary>Rounds the four packed single-precision (32-bit) floating-point elements of "a" according to the "rounding" parameter and returns them as packed single-precision elements.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="rounding">Rounding mode</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 round_ps(v128 a, int rounding)
  {
      // Each lane is rounded through the double-precision helper and narrowed back to float.
      float lane0 = (float)RoundDImpl(a.Float0, rounding);
      float lane1 = (float)RoundDImpl(a.Float1, rounding);
      float lane2 = (float)RoundDImpl(a.Float2, rounding);
      float lane3 = (float)RoundDImpl(a.Float3, rounding);

      v128 result = default(v128);
      result.Float0 = lane0;
      result.Float1 = lane1;
      result.Float2 = lane2;
      result.Float3 = lane3;
      return result;
  }
  // _mm_floor_ps
  /// <summary>Rounds the packed single-precision (32-bit) floating-point elements of "a" down to integer values by calling <see cref="round_ps"/> with <c>RoundingMode.FROUND_FLOOR</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 floor_ps(v128 a) => round_ps(a, (int)RoundingMode.FROUND_FLOOR);
  // _mm_ceil_ps
  /// <summary>Rounds the packed single-precision (32-bit) floating-point elements of "a" up to integer values by calling <see cref="round_ps"/> with <c>RoundingMode.FROUND_CEIL</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 ceil_ps(v128 a) => round_ps(a, (int)RoundingMode.FROUND_CEIL);
  // _mm_round_sd
  /// <summary>Rounds the lower double-precision (64-bit) floating-point element of "b" according to the "rounding" parameter and places it in the lower element of the result; the upper element of the result is copied from "a".</summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="rounding">Rounding mode</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 round_sd(v128 a, v128 b, int rounding)
  {
      double roundedLower = RoundDImpl(b.Double0, rounding);

      v128 result = default(v128);
      result.Double0 = roundedLower;
      result.Double1 = a.Double1;
      return result;
  }
  // _mm_floor_sd
  /// <summary>Rounds the lower double-precision (64-bit) floating-point element of "b" down to an integer value and copies the upper element from "a", by calling <see cref="round_sd"/> with <c>RoundingMode.FROUND_FLOOR</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 floor_sd(v128 a, v128 b) => round_sd(a, b, (int)RoundingMode.FROUND_FLOOR);
  // _mm_ceil_sd
  /// <summary>Rounds the lower double-precision (64-bit) floating-point element of "b" up to an integer value and copies the upper element from "a", by calling <see cref="round_sd"/> with <c>RoundingMode.FROUND_CEIL</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 ceil_sd(v128 a, v128 b) => round_sd(a, b, (int)RoundingMode.FROUND_CEIL);
  // _mm_round_ss
  /// <summary>Rounds the lower single-precision (32-bit) floating-point element of "b" according to the "rounding" parameter and places it in the lower element of the result; the remaining contents of the result are copied from "a".</summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="rounding">Rounding mode</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 round_ss(v128 a, v128 b, int rounding)
  {
      // Rounded through the double-precision helper, then narrowed back to float.
      float roundedLower = (float)RoundDImpl(b.Float0, rounding);

      v128 result = a;
      result.Float0 = roundedLower;
      return result;
  }
  // _mm_floor_ss
  /// <summary>Rounds the lower single-precision (32-bit) floating-point element of "b" down to an integer value and copies the upper 3 packed elements from "a", by calling <see cref="round_ss"/> with <c>RoundingMode.FROUND_FLOOR</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 floor_ss(v128 a, v128 b) => round_ss(a, b, (int)RoundingMode.FROUND_FLOOR);
  // _mm_ceil_ss
  /// <summary>Rounds the lower single-precision (32-bit) floating-point element of "b" up to an integer value and copies the upper 3 packed elements from "a", by calling <see cref="round_ss"/> with <c>RoundingMode.FROUND_CEIL</c>.</summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.X64_SSE4)]
  public static v128 ceil_ss(v128 a, v128 b) => round_ss(a, b, (int)RoundingMode.FROUND_CEIL);
  // _mm_minpos_epu16
  /// <summary>Finds the smallest of the eight packed unsigned 16-bit integers in "a". The result holds that minimum in its first 16-bit element and the index of its first occurrence in the second 16-bit element; all remaining bits are zero.</summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 minpos_epu16(v128 a)
  {
      ushort* lanes = &a.UShort0;
      ushort smallest = lanes[0];
      int position = 0;

      int lane = 1;
      while (lane < 8)
      {
          ushort candidate = lanes[lane];
          // Strict comparison keeps the earliest index when the minimum repeats.
          if (candidate < smallest)
          {
              smallest = candidate;
              position = lane;
          }
          lane++;
      }

      v128 result = default(v128);
      result.UShort0 = smallest;
      result.UShort1 = (ushort)position;
      return result;
  }
  // _mm_mpsadbw_epu8
  /// <summary>Computes eight sums of absolute differences (SADs) between quadruplets of unsigned 8-bit integers in "a" and one quadruplet from "b", and stores the eight 16-bit results in the returned vector.</summary>
  /// <remarks>The quadruplet from "b" starts at byte offset <c>(imm8 &amp; 3) * 4</c>. The eight quadruplets from "a" are overlapping windows of four sequential bytes, the first starting at byte offset <c>((imm8 &gt;&gt; 2) &amp; 1) * 4</c> and each subsequent one starting one byte later.</remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Offset</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 mpsadbw_epu8(v128 a, v128 b, int imm8)
  {
      int aOffset = ((imm8 >> 2) & 1) * 4;
      int bOffset = (imm8 & 3) * 4;

      v128 result = default(v128);
      ushort* sums = &result.UShort0;
      byte* window = &a.Byte0 + aOffset;
      byte* reference = &b.Byte0 + bOffset;

      for (int lane = 0; lane < 8; lane++)
      {
          // Accumulate |a[lane + k] - b[k]| over the four bytes of the quadruplet.
          int sad = 0;
          for (int k = 0; k < 4; k++)
          {
              sad += Math.Abs(window[lane + k] - reference[k]);
          }
          sums[lane] = (ushort)sad;
      }
      return result;
  }
  /// <summary>Helper macro that packs the index-parameter value for insert_ps: "srcField" shifted left by 6, "dstField" shifted left by 4, OR-ed together with "zeroMask".</summary>
  /// <param name="srcField">Source field</param>
  /// <param name="dstField">Destination field</param>
  /// <param name="zeroMask">Zero mask</param>
  /// <returns>Integer</returns>
  [DebuggerStepThrough]
  public static int MK_INSERTPS_NDX(int srcField, int dstField, int zeroMask)
  {
      int sourceBits = srcField << 6;
      int destinationBits = dstField << 4;
      return sourceBits | destinationBits | zeroMask;
  }
  1036. }
  1037. }
  1038. }