説明なし
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

Avx.cs 154KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485
  1. using System;
  2. using System.Diagnostics;
  3. namespace Unity.Burst.Intrinsics
  4. {
  5. public unsafe static partial class X86
  6. {
  7. /// <summary>
  8. /// AVX intrinsics
  9. /// </summary>
  10. public static class Avx
  11. {
  /// <summary>
  /// Evaluates to true at compile time if AVX intrinsics are supported.
  /// </summary>
  /// <remarks>
  /// The getter defined here unconditionally returns false.
  /// </remarks>
  public static bool IsAvxSupported
  {
      get
      {
          return false;
      }
  }
  /// <summary>
  /// Compare predicates for scalar and packed compare intrinsic functions.
  /// </summary>
  /// <remarks>
  /// The members cover the contiguous values 0x00 through 0x1F. As the per-member
  /// descriptions show, the name suffix encodes the comparison flavour:
  /// O/U = ordered/unordered, Q/S = nonsignaling/signaling.
  /// </remarks>
  public enum CMP
  {
      /// <summary>
      /// Equal (ordered, nonsignaling)
      /// </summary>
      EQ_OQ = 0x00,
      /// <summary>
      /// Less-than (ordered, signaling)
      /// </summary>
      LT_OS = 0x01,
      /// <summary>
      /// Less-than-or-equal (ordered, signaling)
      /// </summary>
      LE_OS = 0x02,
      /// <summary>
      /// Unordered (nonsignaling)
      /// </summary>
      UNORD_Q = 0x03,
      /// <summary>
      /// Not-equal (unordered, nonsignaling)
      /// </summary>
      NEQ_UQ = 0x04,
      /// <summary>
      /// Not-less-than (unordered, signaling)
      /// </summary>
      NLT_US = 0x05,
      /// <summary>
      /// Not-less-than-or-equal (unordered, signaling)
      /// </summary>
      NLE_US = 0x06,
      /// <summary>
      /// Ordered (nonsignaling)
      /// </summary>
      ORD_Q = 0x07,
      /// <summary>
      /// Equal (unordered, nonsignaling)
      /// </summary>
      EQ_UQ = 0x08,
      /// <summary>
      /// Not-greater-than-or-equal (unordered, signaling)
      /// </summary>
      NGE_US = 0x09,
      /// <summary>
      /// Not-greater-than (unordered, signaling)
      /// </summary>
      NGT_US = 0x0A,
      /// <summary>
      /// False (ordered, nonsignaling)
      /// </summary>
      FALSE_OQ = 0x0B,
      /// <summary>
      /// Not-equal (ordered, nonsignaling)
      /// </summary>
      NEQ_OQ = 0x0C,
      /// <summary>
      /// Greater-than-or-equal (ordered, signaling)
      /// </summary>
      GE_OS = 0x0D,
      /// <summary>
      /// Greater-than (ordered, signaling)
      /// </summary>
      GT_OS = 0x0E,
      /// <summary>
      /// True (unordered, nonsignaling)
      /// </summary>
      TRUE_UQ = 0x0F,
      /// <summary>
      /// Equal (ordered, signaling)
      /// </summary>
      EQ_OS = 0x10,
      /// <summary>
      /// Less-than (ordered, nonsignaling)
      /// </summary>
      LT_OQ = 0x11,
      /// <summary>
      /// Less-than-or-equal (ordered, nonsignaling)
      /// </summary>
      LE_OQ = 0x12,
      /// <summary>
      /// Unordered (signaling)
      /// </summary>
      UNORD_S = 0x13,
      /// <summary>
      /// Not-equal (unordered, signaling)
      /// </summary>
      NEQ_US = 0x14,
      /// <summary>
      /// Not-less-than (unordered, nonsignaling)
      /// </summary>
      NLT_UQ = 0x15,
      /// <summary>
      /// Not-less-than-or-equal (unordered, nonsignaling)
      /// </summary>
      NLE_UQ = 0x16,
      /// <summary>
      /// Ordered (signaling)
      /// </summary>
      ORD_S = 0x17,
      /// <summary>
      /// Equal (unordered, signaling)
      /// </summary>
      EQ_US = 0x18,
      /// <summary>
      /// Not-greater-than-or-equal (unordered, nonsignaling)
      /// </summary>
      NGE_UQ = 0x19,
      /// <summary>
      /// Not-greater-than (unordered, nonsignaling)
      /// </summary>
      NGT_UQ = 0x1A,
      /// <summary>
      /// False (ordered, signaling)
      /// </summary>
      FALSE_OS = 0x1B,
      /// <summary>
      /// Not-equal (ordered, signaling)
      /// </summary>
      NEQ_OS = 0x1C,
      /// <summary>
      /// Greater-than-or-equal (ordered, nonsignaling)
      /// </summary>
      GE_OQ = 0x1D,
      /// <summary>
      /// Greater-than (ordered, nonsignaling)
      /// </summary>
      GT_OQ = 0x1E,
      /// <summary>
      /// True (unordered, signaling)
      /// </summary>
      TRUE_US = 0x1F,
  }
  /// <summary>
  /// Add packed double-precision (64-bit) floating-point elements in a and b, and return the sums.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse2.add_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector holding the element-wise sums</returns>
  [DebuggerStepThrough]
  public static v256 mm256_add_pd(v256 a, v256 b)
  {
      var lower = Sse2.add_pd(a.Lo128, b.Lo128);
      var upper = Sse2.add_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Add packed single-precision (32-bit) floating-point elements in a and b, and return the sums.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse.add_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector holding the element-wise sums</returns>
  [DebuggerStepThrough]
  public static v256 mm256_add_ps(v256 a, v256 b)
  {
      var lower = Sse.add_ps(a.Lo128, b.Lo128);
      var upper = Sse.add_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Alternately add and subtract packed double-precision (64-bit) floating-point elements in a to/from packed elements in b, and return the results.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse3.addsub_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_addsub_pd(v256 a, v256 b)
  {
      var lower = Sse3.addsub_pd(a.Lo128, b.Lo128);
      var upper = Sse3.addsub_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Alternately add and subtract packed single-precision (32-bit) floating-point elements in a to/from packed elements in b, and return the results.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse3.addsub_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_addsub_ps(v256 a, v256 b)
  {
      var lower = Sse3.addsub_ps(a.Lo128, b.Lo128);
      var upper = Sse3.addsub_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and return the results.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse2.and_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_and_pd(v256 a, v256 b)
  {
      var lower = Sse2.and_pd(a.Lo128, b.Lo128);
      var upper = Sse2.and_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and return the results.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse.and_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_and_ps(v256 a, v256 b)
  {
      var lower = Sse.and_ps(a.Lo128, b.Lo128);
      var upper = Sse.and_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in a, then AND with b, and return the results.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse2.andnot_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a (the operand that is inverted)</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_andnot_pd(v256 a, v256 b)
  {
      var lower = Sse2.andnot_pd(a.Lo128, b.Lo128);
      var upper = Sse2.andnot_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in a, then AND with b, and return the results.
  /// </summary>
  /// <remarks>
  /// Implemented by applying Sse.andnot_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a (the operand that is inverted)</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_andnot_ps(v256 a, v256 b)
  {
      var lower = Sse.andnot_ps(a.Lo128, b.Lo128);
      var upper = Sse.andnot_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and return the results.
  /// </summary>
  /// <remarks>
  /// **** VBLENDPD ymm1, ymm2, ymm3/v256, imm8
  /// Double-precision floating-point values from the second source operand are
  /// conditionally merged with values from the first source operand. Immediate
  /// bits [3:0] select, per element, whether the destination value is copied from
  /// the second source (bit is "1") or from the first source (bit is "0").
  /// This implementation hands bits [1:0] to Sse4_1.blend_pd for the low half and
  /// imm8 shifted right by two for the high half.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_blend_pd(v256 a, v256 b, int imm8)
  {
      int lowerControl = imm8 & 0x3;
      // NOTE(review): unlike the low half, this is not masked; presumably
      // Sse4_1.blend_pd ignores bits above [1:0] - confirm.
      int upperControl = imm8 >> 2;
      var lower = Sse4_1.blend_pd(a.Lo128, b.Lo128, lowerControl);
      var upper = Sse4_1.blend_pd(a.Hi128, b.Hi128, upperControl);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and return the results.
  /// </summary>
  /// <remarks>
  /// **** VBLENDPS ymm1, ymm2, ymm3/v256, imm8
  /// Single-precision floating-point values from the second source operand are
  /// conditionally merged with values from the first source operand. Immediate
  /// bits [7:0] select, per element, whether the destination value is copied from
  /// the second source (bit is "1") or from the first source (bit is "0").
  /// This implementation hands bits [3:0] to Sse4_1.blend_ps for the low half and
  /// imm8 shifted right by four for the high half.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control mask</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_blend_ps(v256 a, v256 b, int imm8)
  {
      int lowerControl = imm8 & 0xf;
      // NOTE(review): unlike the low half, this is not masked; presumably
      // Sse4_1.blend_ps ignores bits above [3:0] - confirm.
      int upperControl = imm8 >> 4;
      var lower = Sse4_1.blend_ps(a.Lo128, b.Lo128, lowerControl);
      var upper = Sse4_1.blend_ps(a.Hi128, b.Hi128, upperControl);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and return the results.
  /// </summary>
  /// <remarks>
  /// **** VBLENDVPD ymm1, ymm2, ymm3/v256, ymm4
  /// Conditionally copy each quadword data element of double-precision
  /// floating-point value from the second source operand (third operand) and the
  /// first source operand (second operand) depending on mask bits defined in the
  /// mask register operand (fourth operand).
  /// Implemented by applying Sse4_1.blendv_pd to the low and high 128-bit halves
  /// of a, b and mask independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="mask">Mask</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_blendv_pd(v256 a, v256 b, v256 mask)
  {
      var lower = Sse4_1.blendv_pd(a.Lo128, b.Lo128, mask.Lo128);
      var upper = Sse4_1.blendv_pd(a.Hi128, b.Hi128, mask.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and return the results.
  /// </summary>
  /// <remarks>
  /// Blend Packed Single Precision Floating-Point Values
  /// **** VBLENDVPS ymm1, ymm2, ymm3/v256, ymm4
  /// Conditionally copy each dword data element of single-precision
  /// floating-point value from the second source operand (third operand) and the
  /// first source operand (second operand) depending on mask bits defined in the
  /// mask register operand (fourth operand).
  /// Implemented by applying Sse4_1.blendv_ps to the low and high 128-bit halves
  /// of a, b and mask independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="mask">Mask</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_blendv_ps(v256 a, v256 b, v256 mask)
  {
      var lower = Sse4_1.blendv_ps(a.Lo128, b.Lo128, mask.Lo128);
      var upper = Sse4_1.blendv_ps(a.Hi128, b.Hi128, mask.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and return the quotients.
  /// </summary>
  /// <remarks>
  /// **** VDIVPD ymm1, ymm2, ymm3/v256
  /// Performs an SIMD divide of the four packed double-precision floating-point
  /// values in the first source operand by the four packed double-precision
  /// floating-point values in the second source operand.
  /// Implemented by applying Sse2.div_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a (dividend)</param>
  /// <param name="b">Vector b (divisor)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_div_pd(v256 a, v256 b)
  {
      var lower = Sse2.div_pd(a.Lo128, b.Lo128);
      var upper = Sse2.div_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and return the quotients.
  /// </summary>
  /// <remarks>
  /// Divide Packed Single-Precision Floating-Point Values
  /// **** VDIVPS ymm1, ymm2, ymm3/v256
  /// Performs an SIMD divide of the eight packed single-precision
  /// floating-point values in the first source operand by the eight packed
  /// single-precision floating-point values in the second source operand.
  /// Implemented by applying Sse.div_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a (dividend)</param>
  /// <param name="b">Vector b (divisor)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_div_ps(v256 a, v256 b)
  {
      var lower = Sse.div_ps(a.Lo128, b.Lo128);
      var upper = Sse.div_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Conditionally multiply the packed single-precision (32-bit)
  /// floating-point elements in a and b using the high 4 bits in
  /// imm8, sum the four products, and conditionally store the sum
  /// in the result using the low 4 bits of imm8.
  /// </summary>
  /// <remarks>
  /// **** VDPPS ymm1, ymm2, ymm3/v256, imm8
  /// Multiplies the packed single-precision floating-point values in the
  /// first source operand with the packed single-precision floats in the
  /// second source. Each of the four resulting single-precision values is
  /// conditionally summed depending on a mask extracted from the high 4 bits
  /// of the immediate operand. This sum is broadcast to each of 4 positions
  /// in the destination if the corresponding bit of the mask selected from
  /// the low 4 bits of the immediate operand is "1". If the corresponding
  /// low bit 0-3 of the mask is zero, the destination is set to zero.
  /// The process is replicated for the high elements of the destination:
  /// the same imm8 is passed to Sse4_1.dp_ps for both 128-bit halves.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">imm8</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_dp_ps(v256 a, v256 b, int imm8)
  {
      var lower = Sse4_1.dp_ps(a.Lo128, b.Lo128, imm8);
      var upper = Sse4_1.dp_ps(a.Hi128, b.Hi128, imm8);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results.
  /// </summary>
  /// <remarks>
  /// **** VHADDPD ymm1, ymm2, ymm3/v256
  /// Adds pairs of adjacent double-precision floating-point values in the
  /// first source operand and second source operand.
  /// Implemented by applying Sse3.hadd_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_hadd_pd(v256 a, v256 b)
  {
      var lower = Sse3.hadd_pd(a.Lo128, b.Lo128);
      var upper = Sse3.hadd_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results.
  /// </summary>
  /// <remarks>
  /// **** VHADDPS ymm1, ymm2, ymm3/v256
  /// Adds pairs of adjacent single-precision floating-point values in the
  /// first source operand and second source operand.
  /// Implemented by applying Sse3.hadd_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_hadd_ps(v256 a, v256 b)
  {
      var lower = Sse3.hadd_ps(a.Lo128, b.Lo128);
      var upper = Sse3.hadd_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results.
  /// </summary>
  /// <remarks>
  /// **** VHSUBPD ymm1, ymm2, ymm3/v256
  /// Subtracts pairs of adjacent double-precision floating-point values in
  /// the first source operand and second source operand.
  /// Implemented by applying Sse3.hsub_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_hsub_pd(v256 a, v256 b)
  {
      var lower = Sse3.hsub_pd(a.Lo128, b.Lo128);
      var upper = Sse3.hsub_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results.
  /// </summary>
  /// <remarks>
  /// **** VHSUBPS ymm1, ymm2, ymm3/v256
  /// Subtracts pairs of adjacent single-precision floating-point values in
  /// the first source operand and second source operand.
  /// Implemented by applying Sse3.hsub_ps to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_hsub_ps(v256 a, v256 b)
  {
      var lower = Sse3.hsub_ps(a.Lo128, b.Lo128);
      var upper = Sse3.hsub_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compare packed double-precision (64-bit) floating-point elements in a and b, and return the packed maximum values.
  /// </summary>
  /// <remarks>
  /// **** VMAXPD ymm1, ymm2, ymm3/v256
  /// Performs an SIMD compare of the packed double-precision floating-point
  /// values in the first source operand and the second source operand and
  /// returns the maximum value for each pair of values.
  /// Implemented by applying Sse2.max_pd to the low and high 128-bit halves independently.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_max_pd(v256 a, v256 b)
  {
      var lower = Sse2.max_pd(a.Lo128, b.Lo128);
      var upper = Sse2.max_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
  /// </summary>
  /// <remarks>
  /// **** VMAXPS ymm1, ymm2, ymm3/v256
  /// The reference implementation runs the 128-bit SSE maximum on the low
  /// halves and on the high halves of the operands and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector of element-wise maxima</returns>
  [DebuggerStepThrough]
  public static v256 mm256_max_ps(v256 a, v256 b)
  {
      v128 lower = Sse.max_ps(a.Lo128, b.Lo128);
      v128 upper = Sse.max_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
  /// </summary>
  /// <remarks>
  /// **** VMINPD ymm1, ymm2, ymm3/v256
  /// The reference implementation runs the 128-bit SSE2 minimum on the low
  /// halves and on the high halves of the operands and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector of element-wise minima</returns>
  [DebuggerStepThrough]
  public static v256 mm256_min_pd(v256 a, v256 b)
  {
      v128 lower = Sse2.min_pd(a.Lo128, b.Lo128);
      v128 upper = Sse2.min_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
  /// </summary>
  /// <remarks>
  /// **** VMINPS ymm1, ymm2, ymm3/v256
  /// The reference implementation runs the 128-bit SSE minimum on the low
  /// halves and on the high halves of the operands and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector of element-wise minima</returns>
  [DebuggerStepThrough]
  public static v256 mm256_min_ps(v256 a, v256 b)
  {
      v128 lower = Sse.min_ps(a.Lo128, b.Lo128);
      v128 upper = Sse.min_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VMULPD ymm1, ymm2, ymm3/v256
  /// The reference implementation multiplies the low halves and the high
  /// halves of the operands with the 128-bit SSE2 multiply and joins the
  /// two products.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector of element-wise products</returns>
  [DebuggerStepThrough]
  public static v256 mm256_mul_pd(v256 a, v256 b)
  {
      v128 lower = Sse2.mul_pd(a.Lo128, b.Lo128);
      v128 upper = Sse2.mul_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VMULPS ymm1, ymm2, ymm3/v256
  /// The reference implementation multiplies the low halves and the high
  /// halves of the operands with the 128-bit SSE multiply and joins the
  /// two single-precision products.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector of element-wise products</returns>
  [DebuggerStepThrough]
  public static v256 mm256_mul_ps(v256 a, v256 b)
  {
      v128 lower = Sse.mul_ps(a.Lo128, b.Lo128);
      v128 upper = Sse.mul_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VORPD ymm1, ymm2, ymm3/v256
  /// The reference implementation ORs the low halves and the high halves
  /// of the operands with the 128-bit SSE2 operation and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector holding the bitwise OR</returns>
  [DebuggerStepThrough]
  public static v256 mm256_or_pd(v256 a, v256 b)
  {
      v128 lower = Sse2.or_pd(a.Lo128, b.Lo128);
      v128 upper = Sse2.or_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VORPS ymm1, ymm2, ymm3/v256
  /// The reference implementation ORs the low halves and the high halves
  /// of the operands with the 128-bit SSE operation and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector holding the bitwise OR</returns>
  [DebuggerStepThrough]
  public static v256 mm256_or_ps(v256 a, v256 b)
  {
      v128 lower = Sse.or_ps(a.Lo128, b.Lo128);
      v128 upper = Sse.or_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VSHUFPD ymm1, ymm2, ymm3/v256, imm8
  /// Within each 128-bit lane, the low result element is taken from a and
  /// the high result element from b. The reference implementation passes
  /// the two lowest bits of imm8 to the 128-bit shuffle of the low lane and
  /// imm8 shifted right by two to the 128-bit shuffle of the high lane.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Shuffle control</param>
  /// <returns>Shuffled vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_shuffle_pd(v256 a, v256 b, int imm8)
  {
      // Split the control: low two bits for the low lane, the rest for the high lane.
      int lowControl = imm8 & 3;
      int highControl = imm8 >> 2;

      v128 lower = Sse2.shuffle_pd(a.Lo128, b.Lo128, lowControl);
      v128 upper = Sse2.shuffle_pd(a.Hi128, b.Hi128, highControl);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Shuffle single-precision (32-bit) floating-point elements in a and b within 128-bit lanes using the control in imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VSHUFPS ymm1, ymm2, ymm3/v256, imm8
  /// Within each 128-bit lane, the low two result elements are selected
  /// from a and the high two from b. The reference implementation applies
  /// the 128-bit SSE shuffle with the same imm8 to the low lane and to the
  /// high lane.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Shuffle control, shared by both lanes</param>
  /// <returns>Shuffled vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_shuffle_ps(v256 a, v256 b, int imm8)
  {
      v128 lower = Sse.shuffle_ps(a.Lo128, b.Lo128, imm8);
      v128 upper = Sse.shuffle_ps(a.Hi128, b.Hi128, imm8);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VSUBPD ymm1, ymm2, ymm3/v256
  /// The reference implementation subtracts the low halves and the high
  /// halves with the 128-bit SSE2 subtract and joins the two differences.
  /// </remarks>
  /// <param name="a">Minuend vector</param>
  /// <param name="b">Subtrahend vector</param>
  /// <returns>Vector of element-wise differences (a - b)</returns>
  [DebuggerStepThrough]
  public static v256 mm256_sub_pd(v256 a, v256 b)
  {
      v128 lower = Sse2.sub_pd(a.Lo128, b.Lo128);
      v128 upper = Sse2.sub_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VSUBPS ymm1, ymm2, ymm3/v256
  /// The reference implementation subtracts the low halves and the high
  /// halves with the 128-bit SSE subtract and joins the two differences.
  /// </remarks>
  /// <param name="a">Minuend vector</param>
  /// <param name="b">Subtrahend vector</param>
  /// <returns>Vector of element-wise differences (a - b)</returns>
  [DebuggerStepThrough]
  public static v256 mm256_sub_ps(v256 a, v256 b)
  {
      v128 lower = Sse.sub_ps(a.Lo128, b.Lo128);
      v128 upper = Sse.sub_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VXORPD ymm1, ymm2, ymm3/v256
  /// The reference implementation XORs the low halves and the high halves
  /// of the operands with the 128-bit SSE2 operation and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector holding the bitwise XOR</returns>
  [DebuggerStepThrough]
  public static v256 mm256_xor_pd(v256 a, v256 b)
  {
      v128 lower = Sse2.xor_pd(a.Lo128, b.Lo128);
      v128 upper = Sse2.xor_pd(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VXORPS ymm1, ymm2, ymm3/v256
  /// The reference implementation XORs the low halves and the high halves
  /// of the operands with the 128-bit SSE operation and joins the results.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <returns>Vector holding the bitwise XOR</returns>
  [DebuggerStepThrough]
  public static v256 mm256_xor_ps(v256 a, v256 b)
  {
      v128 lower = Sse.xor_ps(a.Lo128, b.Lo128);
      v128 upper = Sse.xor_ps(a.Hi128, b.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCMPPD xmm1, xmm2, xmm3/v128, imm8
  /// Compares the two packed double-precision values of a against those of
  /// b. Only the low five bits of imm8 are used to pick the predicate.
  /// The reference implementation builds every predicate from the SSE2
  /// compare primitives: some map directly, the remaining ones combine a
  /// base comparison with the ordered or unordered mask. Quiet and
  /// signaling variants of a predicate share the same expression here.
  /// For 128-bit intrinsic function with compare predicate values in range 0-7
  /// compiler may generate SSE2 instructions if it is warranted for performance
  /// reasons.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Comparison predicate (a CMP value)</param>
  /// <returns>Per-element comparison mask</returns>
  [DebuggerStepThrough]
  public static v128 cmp_pd(v128 a, v128 b, int imm8)
  {
      CMP predicate = (CMP)(imm8 & 0x1F);
      switch (predicate)
      {
          // Predicates that map straight onto an SSE2 compare.
          case CMP.EQ_OQ:
              return Sse2.cmpeq_pd(a, b);
          case CMP.LT_OS:
              return Sse2.cmplt_pd(a, b);
          case CMP.LE_OS:
              return Sse2.cmple_pd(a, b);
          case CMP.UNORD_Q:
          case CMP.UNORD_S:
              return Sse2.cmpunord_pd(a, b);
          case CMP.NEQ_UQ:
          case CMP.NEQ_US:
              return Sse2.cmpneq_pd(a, b);
          case CMP.NLT_US:
              return Sse2.cmpnlt_pd(a, b);
          case CMP.NLE_US:
              return Sse2.cmpnle_pd(a, b);
          case CMP.ORD_Q:
          case CMP.ORD_S:
              return Sse2.cmpord_pd(a, b);

          // Base comparison OR-ed with the unordered mask.
          case CMP.EQ_UQ:
          case CMP.EQ_US:
              return Sse2.or_pd(Sse2.cmpeq_pd(a, b), Sse2.cmpunord_pd(a, b));
          case CMP.NGE_UQ:
          case CMP.NGE_US:
              return Sse2.or_pd(Sse2.cmpnge_pd(a, b), Sse2.cmpunord_pd(a, b));
          case CMP.NGT_US:
          case CMP.NGT_UQ:
              return Sse2.or_pd(Sse2.cmpngt_pd(a, b), Sse2.cmpunord_pd(a, b));
          case CMP.NLT_UQ:
              return Sse2.or_pd(Sse2.cmpnlt_pd(a, b), Sse2.cmpunord_pd(a, b));
          case CMP.NLE_UQ:
              return Sse2.or_pd(Sse2.cmpnle_pd(a, b), Sse2.cmpunord_pd(a, b));

          // Base comparison AND-ed with the ordered mask.
          case CMP.NEQ_OQ:
          case CMP.NEQ_OS:
              return Sse2.and_pd(Sse2.cmpneq_pd(a, b), Sse2.cmpord_pd(a, b));
          case CMP.GE_OS:
          case CMP.GE_OQ:
              return Sse2.and_pd(Sse2.cmpge_pd(a, b), Sse2.cmpord_pd(a, b));
          case CMP.GT_OS:
          case CMP.GT_OQ:
              return Sse2.and_pd(Sse2.cmpgt_pd(a, b), Sse2.cmpord_pd(a, b));
          case CMP.EQ_OS:
              return Sse2.and_pd(Sse2.cmpeq_pd(a, b), Sse2.cmpord_pd(a, b));
          case CMP.LT_OQ:
              return Sse2.and_pd(Sse2.cmplt_pd(a, b), Sse2.cmpord_pd(a, b));
          case CMP.LE_OQ:
              return Sse2.and_pd(Sse2.cmple_pd(a, b), Sse2.cmpord_pd(a, b));

          // Constant-false predicates yield an all-zero vector.
          case CMP.FALSE_OQ:
          case CMP.FALSE_OS:
              return default;

          // Constant-true and any predicate not listed above yield all bits set.
          case CMP.TRUE_UQ:
          default:
              return new v128(-1);
      }
  }
  /// <summary>
  /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCMPPD ymm1, ymm2, ymm3/v256, imm8
  /// The reference implementation evaluates the 128-bit cmp_pd with the
  /// same predicate on the low halves and on the high halves of the
  /// operands and joins the two masks.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Comparison predicate</param>
  /// <returns>Per-element comparison mask</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cmp_pd(v256 a, v256 b, int imm8)
  {
      v128 lowerMask = cmp_pd(a.Lo128, b.Lo128, imm8);
      v128 upperMask = cmp_pd(a.Hi128, b.Hi128, imm8);
      return new v256(lowerMask, upperMask);
  }
  /// <summary>
  /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCMPPS xmm1, xmm2, xmm3/v128, imm8
  /// Compares the packed single-precision values of a against those of b.
  /// Only the low five bits of imm8 are used to pick the predicate.
  /// The reference implementation builds every predicate from the SSE
  /// compare primitives: some map directly, the remaining ones combine a
  /// base comparison with the ordered or unordered mask. Quiet and
  /// signaling variants of a predicate share the same expression here.
  /// For 128-bit intrinsic function with compare predicate values in range 0-7
  /// compiler may generate SSE2 instructions if it is warranted for performance
  /// reasons.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Comparison predicate (a CMP value)</param>
  /// <returns>Per-element comparison mask</returns>
  [DebuggerStepThrough]
  public static v128 cmp_ps(v128 a, v128 b, int imm8)
  {
      CMP predicate = (CMP)(imm8 & 0x1F);
      switch (predicate)
      {
          // Predicates that map straight onto an SSE compare.
          case CMP.EQ_OQ:
              return Sse.cmpeq_ps(a, b);
          case CMP.LT_OS:
              return Sse.cmplt_ps(a, b);
          case CMP.LE_OS:
              return Sse.cmple_ps(a, b);
          case CMP.UNORD_Q:
          case CMP.UNORD_S:
              return Sse.cmpunord_ps(a, b);
          case CMP.NEQ_UQ:
          case CMP.NEQ_US:
              return Sse.cmpneq_ps(a, b);
          case CMP.NLT_US:
              return Sse.cmpnlt_ps(a, b);
          case CMP.NLE_US:
              return Sse.cmpnle_ps(a, b);
          case CMP.ORD_Q:
          case CMP.ORD_S:
              return Sse.cmpord_ps(a, b);

          // Base comparison OR-ed with the unordered mask.
          case CMP.EQ_UQ:
          case CMP.EQ_US:
              return Sse.or_ps(Sse.cmpeq_ps(a, b), Sse.cmpunord_ps(a, b));
          case CMP.NGE_UQ:
          case CMP.NGE_US:
              return Sse.or_ps(Sse.cmpnge_ps(a, b), Sse.cmpunord_ps(a, b));
          case CMP.NGT_US:
          case CMP.NGT_UQ:
              return Sse.or_ps(Sse.cmpngt_ps(a, b), Sse.cmpunord_ps(a, b));
          case CMP.NLT_UQ:
              return Sse.or_ps(Sse.cmpnlt_ps(a, b), Sse.cmpunord_ps(a, b));
          case CMP.NLE_UQ:
              return Sse.or_ps(Sse.cmpnle_ps(a, b), Sse.cmpunord_ps(a, b));

          // Base comparison AND-ed with the ordered mask.
          case CMP.NEQ_OQ:
          case CMP.NEQ_OS:
              return Sse.and_ps(Sse.cmpneq_ps(a, b), Sse.cmpord_ps(a, b));
          case CMP.GE_OS:
          case CMP.GE_OQ:
              return Sse.and_ps(Sse.cmpge_ps(a, b), Sse.cmpord_ps(a, b));
          case CMP.GT_OS:
          case CMP.GT_OQ:
              return Sse.and_ps(Sse.cmpgt_ps(a, b), Sse.cmpord_ps(a, b));
          case CMP.EQ_OS:
              return Sse.and_ps(Sse.cmpeq_ps(a, b), Sse.cmpord_ps(a, b));
          case CMP.LT_OQ:
              return Sse.and_ps(Sse.cmplt_ps(a, b), Sse.cmpord_ps(a, b));
          case CMP.LE_OQ:
              return Sse.and_ps(Sse.cmple_ps(a, b), Sse.cmpord_ps(a, b));

          // Constant-false predicates yield an all-zero vector.
          case CMP.FALSE_OQ:
          case CMP.FALSE_OS:
              return default;

          // Constant-true and any predicate not listed above yield all bits set.
          case CMP.TRUE_UQ:
          default:
              return new v128(-1);
      }
  }
  /// <summary>
  /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCMPPS ymm1, ymm2, ymm3/v256, imm8
  /// The reference implementation evaluates the 128-bit cmp_ps with the
  /// same predicate on the low halves and on the high halves of the
  /// operands and joins the two masks.
  /// </remarks>
  /// <param name="a">First source vector</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Comparison predicate</param>
  /// <returns>Per-element comparison mask</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cmp_ps(v256 a, v256 b, int imm8)
  {
      v128 lowerMask = cmp_ps(a.Lo128, b.Lo128, imm8);
      v128 upperMask = cmp_ps(a.Hi128, b.Hi128, imm8);
      return new v256(lowerMask, upperMask);
  }
  /// <summary>
  /// Compare the lower double-precision (64-bit) floating-point
  /// element in a and b based on the comparison operand specified by
  /// imm8, store the result in the lower element of dst, and copy
  /// the upper element from a to the upper element of dst.
  /// </summary>
  /// <remarks>
  /// **** VCMPSD xmm1, xmm2, xmm3/m64, imm8
  /// The reference implementation runs the packed cmp_pd comparison and
  /// keeps only its low 64-bit result; the upper 64 bits are taken
  /// unchanged from a.
  /// For compare predicate values in range 0-7 compiler may generate SSE2
  /// instructions if it is warranted for performance reasons.
  /// </remarks>
  /// <param name="a">First source vector; also supplies the upper element</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Comparison predicate</param>
  /// <returns>Vector with the comparison mask in the low element</returns>
  [DebuggerStepThrough]
  public static v128 cmp_sd(v128 a, v128 b, int imm8)
  {
      // Only the low lane of the packed comparison is kept.
      ulong lowMask = cmp_pd(a, b, imm8).ULong0;
      return new v128(lowMask, a.ULong1);
  }
  /// <summary>
  /// Compare the lower single-precision (32-bit) floating-point
  /// element in a and b based on the comparison operand specified by
  /// imm8, store the result in the lower element of dst, and copy
  /// the upper 3 packed elements from a to the upper elements of
  /// dst.
  /// </summary>
  /// <remarks>
  /// **** VCMPSS xmm1, xmm2, xmm3/m32, imm8
  /// The reference implementation runs the packed cmp_ps comparison and
  /// keeps only its low 32-bit result; the upper three 32-bit elements are
  /// taken unchanged from a.
  /// For compare predicate values in range 0-7 compiler may generate SSE2
  /// instructions if it is warranted for performance reasons.
  /// </remarks>
  /// <param name="a">First source vector; also supplies the upper elements</param>
  /// <param name="b">Second source vector</param>
  /// <param name="imm8">Comparison predicate</param>
  /// <returns>Vector with the comparison mask in the low element</returns>
  [DebuggerStepThrough]
  public static v128 cmp_ss(v128 a, v128 b, int imm8)
  {
      // Only the low lane of the packed comparison is kept.
      uint lowMask = cmp_ps(a, b, imm8).UInt0;
      return new v128(lowMask, a.UInt1, a.UInt2, a.UInt3);
  }
  /// <summary>
  /// Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTDQ2PD ymm1, xmm2/v128
  /// Converts the four packed signed doubleword integers of the source to
  /// four packed double-precision floating-point values.
  /// </remarks>
  /// <param name="a">Vector of four signed 32-bit integers</param>
  /// <returns>Vector of four doubles</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cvtepi32_pd(v128 a)
  {
      // Widen each signed 32-bit element to double, preserving element order.
      double e0 = a.SInt0;
      double e1 = a.SInt1;
      double e2 = a.SInt2;
      double e3 = a.SInt3;
      return new v256(e0, e1, e2, e3);
  }
  /// <summary>
  /// Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTDQ2PS ymm1, ymm2/v256
  /// Converts eight packed signed doubleword integers to eight packed
  /// single-precision floating-point values. The reference implementation
  /// converts the low and high 128-bit halves separately with the SSE2
  /// conversion and joins the results.
  /// </remarks>
  /// <param name="a">Vector of eight signed 32-bit integers</param>
  /// <returns>Vector of eight floats</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cvtepi32_ps(v256 a)
  {
      v128 lower = Sse2.cvtepi32_ps(a.Lo128);
      v128 upper = Sse2.cvtepi32_ps(a.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Convert packed double-precision (64-bit) floating-point
  /// elements in a to packed single-precision (32-bit)
  /// floating-point elements, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTPD2PS xmm1, ymm2/v256
  /// Converts four packed doubles to four packed floats. The reference
  /// implementation converts each 128-bit half with the SSE2 conversion
  /// and gathers the two low floats of each result into one vector.
  /// </remarks>
  /// <param name="a">Vector of four doubles</param>
  /// <returns>Vector of four floats</returns>
  [DebuggerStepThrough]
  public static v128 mm256_cvtpd_ps(v256 a)
  {
      // Each conversion yields its two floats in the low elements.
      v128 fromLower = Sse2.cvtpd_ps(a.Lo128);
      v128 fromUpper = Sse2.cvtpd_ps(a.Hi128);
      return new v128(fromLower.Float0, fromLower.Float1, fromUpper.Float0, fromUpper.Float1);
  }
  /// <summary>
  /// Convert packed single-precision (32-bit) floating-point
  /// elements in a to packed 32-bit integers, and store the results
  /// in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTPS2DQ ymm1, ymm2/v256
  /// Converts eight packed floats to eight signed doubleword integers.
  /// The reference implementation converts the low and high 128-bit halves
  /// separately with the SSE2 conversion and joins the results.
  /// </remarks>
  /// <param name="a">Vector of eight floats</param>
  /// <returns>Vector of eight signed 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cvtps_epi32(v256 a)
  {
      v128 lower = Sse2.cvtps_epi32(a.Lo128);
      v128 upper = Sse2.cvtps_epi32(a.Hi128);
      return new v256(lower, upper);
  }
  /// <summary>
  /// Convert packed single-precision (32-bit) floating-point
  /// elements in a to packed double-precision (64-bit)
  /// floating-point elements, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTPS2PD ymm1, xmm2/v128
  /// Converts four packed floats to four packed doubles, preserving
  /// element order.
  /// </remarks>
  /// <param name="a">Vector of four floats</param>
  /// <returns>Vector of four doubles</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cvtps_pd(v128 a)
  {
      // The normal Burst IR does fine here: a plain float-to-double widening per element.
      double e0 = a.Float0;
      double e1 = a.Float1;
      double e2 = a.Float2;
      double e3 = a.Float3;
      return new v256(e0, e1, e2, e3);
  }
  /// <summary>
  /// Convert packed double-precision (64-bit) floating-point
  /// elements in a to packed 32-bit integers with truncation, and
  /// store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTTPD2DQ xmm1, ymm2/v256
  /// Converts four packed double-precision floating-point values in the source
  /// operand to four packed signed doubleword integers in the destination.
  /// When a conversion is inexact, a truncated (round toward zero) value is
  /// returned. If the value is NaN or the truncated result does not fit in a
  /// signed doubleword integer, the indefinite integer value (80000000H) is
  /// returned. The reference implementation checks the range explicitly
  /// rather than relying on a raw C# cast, whose result for such inputs is
  /// not specified by the language (and which throws in a checked context).
  /// </remarks>
  /// <param name="a">Vector of four doubles</param>
  /// <returns>Vector of four signed 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v128 mm256_cvttpd_epi32(v256 a)
  {
      return new v128(
          TruncateDoubleToInt32(a.Double0),
          TruncateDoubleToInt32(a.Double1),
          TruncateDoubleToInt32(a.Double2),
          TruncateDoubleToInt32(a.Double3));
  }

  /// <summary>
  /// Truncates a double toward zero to a 32-bit integer, returning the
  /// indefinite integer value (80000000H) for NaN and out-of-range inputs.
  /// </summary>
  [DebuggerStepThrough]
  private static int TruncateDoubleToInt32(double value)
  {
      // Every value strictly between -2^31 - 1 and 2^31 truncates to a valid int.
      // NaN fails both comparisons and therefore takes the fallback path.
      if (value > -2147483649.0 && value < 2147483648.0)
      {
          return (int)value;
      }

      return int.MinValue;
  }
  /// <summary>
  /// Convert packed double-precision(64-bit) floating-point elements
  /// in a to packed 32-bit integers, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTPD2DQ xmm1, ymm2/v256
  /// Converts four packed doubles to four packed signed doubleword
  /// integers. The reference implementation converts the doubles two at a
  /// time with the SSE2 conversion and gathers the two low integers of
  /// each result into one vector.
  /// </remarks>
  /// <param name="a">Vector of four doubles</param>
  /// <returns>Vector of four signed 32-bit integers</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v128 mm256_cvtpd_epi32(v256 a)
  {
      // Repack the four doubles as two 128-bit pairs for the SSE2 conversion.
      v128 firstPair = new v128(a.Double0, a.Double1);
      v128 secondPair = new v128(a.Double2, a.Double3);

      v128 convertedFirst = Sse2.cvtpd_epi32(firstPair);
      v128 convertedSecond = Sse2.cvtpd_epi32(secondPair);

      return new v128(convertedFirst.SInt0, convertedFirst.SInt1, convertedSecond.SInt0, convertedSecond.SInt1);
  }
  /// <summary>
  /// Convert packed single-precision (32-bit) floating-point
  /// elements in a to packed 32-bit integers with truncation, and
  /// store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VCVTTPS2DQ ymm1, ymm2/v256
  /// Converts eight packed single-precision floating-point values in the source
  /// operand to eight signed doubleword integers in the destination.
  /// When a conversion is inexact, a truncated (round toward zero) value is
  /// returned. If a converted result is larger than the maximum signed doubleword
  /// integer, the floating-point invalid exception is raised, and if this
  /// exception is masked, the indefinite integer value (80000000H) is returned.
  /// The reference implementation converts the low and high 128-bit halves
  /// separately with the SSE2 truncating conversion and joins the results.
  /// </remarks>
  /// <param name="a">Vector of eight floats</param>
  /// <returns>Vector of eight signed 32-bit integers</returns>
  [DebuggerStepThrough]
  public static v256 mm256_cvttps_epi32(v256 a)
  {
      v128 lower = Sse2.cvttps_epi32(a.Lo128);
      v128 upper = Sse2.cvttps_epi32(a.Hi128);
      return new v256(lower, upper);
  }
  /*
   * Scalar single-precision extraction: reads the lowest float of a
   * 256-bit vector as an ordinary float value.
   */
  /// <summary>
  /// Copy the lower single-precision (32-bit) floating-point element of a to dst.
  /// </summary>
  /// <remarks>
  /// Identical in HPC# to accessing Float0, kept for compatibility with existing code while porting.
  /// </remarks>
  /// <param name="a">Source vector</param>
  /// <returns>The lowest float element of a</returns>
  [DebuggerStepThrough]
  public static float mm256_cvtss_f32(v256 a) => a.Float0; // Burst IR is fine here.
  /// <summary>
  /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
  /// </summary>
  /// <remarks>
  /// **** VEXTRACTF128 xmm1/v128, ymm2, imm8
  /// Only bit 0 of imm8 takes part in the selection: 0 picks the low 128
  /// bits, 1 picks the high 128 bits. All other bits of imm8 are ignored.
  /// </remarks>
  /// <param name="a">Source vector</param>
  /// <param name="imm8">Half selector (bit 0 only)</param>
  /// <returns>The selected 128-bit half of a</returns>
  [DebuggerStepThrough]
  public static v128 mm256_extractf128_ps(v256 a, int imm8)
  {
      // Mask to bit 0 so that even non-zero immediates (e.g. 2) still select the low half.
      return (imm8 & 1) != 0 ? a.Hi128 : a.Lo128;
  }
  /// <summary>
  /// Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
  /// </summary>
  /// <remarks>
  /// **** VEXTRACTF128 xmm1/v128, ymm2, imm8
  /// Only bit 0 of imm8 takes part in the selection: 0 picks the low half, 1 picks
  /// the high half. Higher bits are ignored, the same way mm256_insertf128_ps
  /// treats its imm8.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Selector; only the lowest bit is used</param>
  /// <returns>The low (bit 0 clear) or high (bit 0 set) 128-bit half of a</returns>
  [DebuggerStepThrough]
  public static v128 mm256_extractf128_pd(v256 a, int imm8)
  {
      // Mask to imm8[0] so that e.g. imm8 == 2 selects the low half, as the instruction does.
      return (imm8 & 1) != 0 ? a.Hi128 : a.Lo128;
  }
  /// <summary>
  /// Extract 128 bits (composed of integer data) from a, selected with imm8, and store the result in dst.
  /// </summary>
  /// <remarks>
  /// **** VEXTRACTF128 xmm1/v128, ymm2, imm8
  /// Only bit 0 of imm8 takes part in the selection: 0 picks the low half, 1 picks
  /// the high half. Higher bits are ignored, the same way mm256_insertf128_ps
  /// treats its imm8.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Selector; only the lowest bit is used</param>
  /// <returns>The low (bit 0 clear) or high (bit 0 set) 128-bit half of a</returns>
  [DebuggerStepThrough]
  public static v128 mm256_extractf128_si256(v256 a, int imm8)
  {
      // Mask to imm8[0] so that e.g. imm8 == 2 selects the low half, as the instruction does.
      return (imm8 & 1) != 0 ? a.Hi128 : a.Lo128;
  }
  /// <summary>
  /// Counterpart of the intrinsic that zeros the contents of all YMM registers.
  /// </summary>
  /// <remarks>
  /// **** VZEROALL
  /// The managed implementation has an empty body.
  /// </remarks>
  [DebuggerStepThrough]
  public static void mm256_zeroall()
  {
      // Intentionally empty: there is nothing to do on the C# side.
  }
  /// <summary>
  /// Counterpart of the intrinsic that zeros the upper 128 bits of all YMM registers
  /// while leaving the lower 128 bits unmodified.
  /// </summary>
  /// <remarks>
  /// **** VZEROUPPER
  /// The managed implementation has an empty body.
  /// </remarks>
  [DebuggerStepThrough]
  public static void mm256_zeroupper()
  {
      // Intentionally empty: there is nothing to do on the C# side.
  }
  /// <summary>
  /// Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPS xmm1, xmm2, xmm3/v128
  /// Result element i is the element of a whose index is given by the low two
  /// bits of the i-th 32-bit element of b.
  /// </remarks>
  /// <param name="a">Vector holding the source elements</param>
  /// <param name="b">Vector holding one selector per 32-bit element</param>
  /// <returns>Vector with the selected elements</returns>
  [DebuggerStepThrough]
  public static v128 permutevar_ps(v128 a, v128 b)
  {
      v128 result = default;
      uint* source = &a.UInt0;
      int* selectors = &b.SInt0;
      uint* target = &result.UInt0;
      for (int lane = 0; lane != 4; lane++)
      {
          // Only the low two bits of each selector choose the source element.
          target[lane] = source[selectors[lane] & 3];
      }
      return result;
  }
  /// <summary>
  /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPS ymm1, ymm2, ymm3/v256
  /// Implemented as two independent permutevar_ps calls, one per 128-bit half,
  /// so elements never cross between the halves.
  /// </remarks>
  /// <param name="a">Vector holding the source elements</param>
  /// <param name="b">Vector holding one selector per 32-bit element</param>
  /// <returns>Vector with the selected elements</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permutevar_ps(v256 a, v256 b)
  {
      v128 lowHalf = permutevar_ps(a.Lo128, b.Lo128);
      v128 highHalf = permutevar_ps(a.Hi128, b.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPS xmm1, xmm2/v128, imm8
  /// Forwards a and imm8 unchanged to Sse2.shuffle_epi32.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Shuffle control passed to Sse2.shuffle_epi32</param>
  /// <returns>Shuffled vector</returns>
  [DebuggerStepThrough]
  public static v128 permute_ps(v128 a, int imm8)
  {
      v128 shuffled = Sse2.shuffle_epi32(a, imm8);
      return shuffled;
  }
  /// <summary>
  /// Shuffle single-precision (32-bit) floating-point elements in a
  /// within 128-bit lanes using the control in imm8, and store the
  /// results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPS ymm1, ymm2/v256, imm8
  /// The same imm8 is applied to both 128-bit halves through permute_ps.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Shuffle control, shared by both halves</param>
  /// <returns>Shuffled vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permute_ps(v256 a, int imm8)
  {
      v128 lowHalf = permute_ps(a.Lo128, imm8);
      v128 highHalf = permute_ps(a.Hi128, imm8);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPD xmm1, xmm2, xmm3/v128
  /// For each 64-bit element of b, bit 1 (value 2) decides whether the low (0)
  /// or the high (1) double of a is copied to the matching result element.
  /// </remarks>
  /// <param name="a">Vector holding the source elements</param>
  /// <param name="b">Vector holding one selector per 64-bit element</param>
  /// <returns>Vector with the selected elements</returns>
  [DebuggerStepThrough]
  public static v128 permutevar_pd(v128 a, v128 b)
  {
      // Bit 1 of each control element is the selector; reduce it to an index of 0 or 1.
      int firstIndex = (b.SLong0 & 2) != 0 ? 1 : 0;
      int secondIndex = (b.SLong1 & 2) != 0 ? 1 : 0;

      v128 result = default;
      double* source = &a.Double0;
      double* target = &result.Double0;
      target[0] = source[firstIndex];
      target[1] = source[secondIndex];
      return result;
  }
  /// <summary>
  /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPD ymm1, ymm2, ymm3/v256
  /// For each 64-bit element of b, bit 1 (value 2) picks the low or high double
  /// of the 128-bit half that the result element belongs to.
  /// </remarks>
  /// <param name="a">Vector holding the source elements</param>
  /// <param name="b">Vector holding one selector per 64-bit element</param>
  /// <returns>Vector with the selected elements</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permutevar_pd(v256 a, v256 b)
  {
      // Bit 1 of each control element is the selector; reduce it to an index of 0 or 1.
      int index0 = (b.SLong0 & 2) != 0 ? 1 : 0;
      int index1 = (b.SLong1 & 2) != 0 ? 1 : 0;
      int index2 = (b.SLong2 & 2) != 0 ? 1 : 0;
      int index3 = (b.SLong3 & 2) != 0 ? 1 : 0;

      v256 result = default;
      double* source = &a.Double0;
      double* target = &result.Double0;
      target[0] = source[index0];
      target[1] = source[index1];
      // Elements 2 and 3 read from the upper half, hence the offset of 2.
      target[2] = source[2 + index2];
      target[3] = source[2 + index3];
      return result;
  }
  1273. /*
  1274. * Permute Double-Precision Floating-Point Values
  1275. */
  /// <summary>
  /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPD ymm1, ymm2/v256, imm8
  /// The low half is permuted with imm8 &amp; 3 and the high half with imm8 >> 2,
  /// both through permute_pd.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Shuffle control</param>
  /// <returns>Shuffled vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permute_pd(v256 a, int imm8)
  {
      int lowControl = imm8 & 3;
      int highControl = imm8 >> 2;
      v128 lowHalf = permute_pd(a.Lo128, lowControl);
      v128 highHalf = permute_pd(a.Hi128, highControl);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERMILPD xmm1, xmm2/v128, imm8
  /// Bit 0 of imm8 selects the source double for result element 0 and bit 1
  /// selects the source double for result element 1.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="imm8">Shuffle control; only the low two bits are read</param>
  /// <returns>Shuffled vector</returns>
  [DebuggerStepThrough]
  public static v128 permute_pd(v128 a, int imm8)
  {
      int firstIndex = imm8 & 1;
      int secondIndex = (imm8 >> 1) & 1;

      v128 result = default;
      double* source = &a.Double0;
      double* target = &result.Double0;
      target[0] = source[firstIndex];
      target[1] = source[secondIndex];
      return result;
  }
  /// <summary>
  /// Pick one 128-bit half out of two 256-bit sources according to a 4-bit control field.
  /// </summary>
  /// <remarks>
  /// Bits [1:0] of control choose the half: 0 = src1 low, 1 = src1 high,
  /// 2 = src2 low, 3 = src2 high. Bit 3 of control overrides the choice and
  /// yields an all-zero result. Bit 2 and all bits above bit 3 are ignored.
  /// </remarks>
  /// <param name="src1">First source vector</param>
  /// <param name="src2">Second source vector</param>
  /// <param name="control">Control field; only bits 0, 1 and 3 are read</param>
  /// <returns>The selected 128-bit half, or zero when bit 3 of control is set</returns>
  private static v128 Select4(v256 src1, v256 src2, int control)
  {
      // The zeroing bit takes precedence over the lane selection.
      if ((control & 8) != 0)
      {
          return default(v128);
      }

      switch (control & 3)
      {
          case 0: return src1.Lo128;
          case 1: return src1.Hi128;
          case 2: return src2.Lo128;
          default: return src2.Hi128;
      }
  }
  /// <summary>
  /// Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERM2F128 ymm1, ymm2, ymm3/v256, imm8
  /// The low half of the result is chosen by Select4 from imm8, and the high
  /// half by Select4 from imm8 shifted right by four bits.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control byte: low nibble for the low half, high nibble for the high half</param>
  /// <returns>Vector built from the two selected halves</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permute2f128_ps(v256 a, v256 b, int imm8)
  {
      int highControl = imm8 >> 4;
      v128 lowHalf = Select4(a, b, imm8);
      v128 highHalf = Select4(a, b, highControl);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERM2F128 ymm1, ymm2, ymm3/v256, imm8
  /// Delegates to mm256_permute2f128_ps with the same arguments.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control byte</param>
  /// <returns>Vector built from the two selected halves</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permute2f128_pd(v256 a, v256 b, int imm8)
  {
      v256 permuted = mm256_permute2f128_ps(a, b, imm8);
      return permuted;
  }
  /// <summary>
  /// Shuffle 128-bits (composed of integer data) selected by imm8 from a and b, and store the results in dst.
  /// </summary>
  /// <remarks>
  /// **** VPERM2F128 ymm1, ymm2, ymm3/v256, imm8
  /// Delegates to mm256_permute2f128_ps with the same arguments.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <param name="imm8">Control byte</param>
  /// <returns>Vector built from the two selected halves</returns>
  [DebuggerStepThrough]
  public static v256 mm256_permute2f128_si256(v256 a, v256 b, int imm8)
  {
      v256 permuted = mm256_permute2f128_ps(a, b, imm8);
      return permuted;
  }
  /// <summary>
  /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements of dst.
  /// </summary>
  /// <remarks>
  /// **** VBROADCASTSS ymm1, m32
  /// The value is read as a raw 32-bit unsigned integer and handed to the
  /// v256(uint) constructor.
  /// </remarks>
  /// <param name="ptr">Pointer to the 32-bit element to read</param>
  /// <returns>Vector constructed from the 32-bit value</returns>
  [DebuggerStepThrough]
  public static v256 mm256_broadcast_ss(void* ptr)
  {
      uint bits = *(uint*)ptr;
      return new v256(bits);
  }
  /// <summary>
  /// Broadcast a single-precision (32-bit) floating-point element from memory to all elements of dst.
  /// </summary>
  /// <remarks>
  /// **** VBROADCASTSS xmm1, m32
  /// The value is read as a raw 32-bit unsigned integer and handed to the
  /// v128(uint) constructor.
  /// </remarks>
  /// <param name="ptr">Pointer to the 32-bit element to read</param>
  /// <returns>Vector constructed from the 32-bit value</returns>
  [DebuggerStepThrough]
  public static v128 broadcast_ss(void* ptr)
  {
      uint bits = *(uint*)ptr;
      return new v128(bits);
  }
  /// <summary>
  /// Broadcast a double-precision (64-bit) floating-point element from memory to all elements of dst.
  /// </summary>
  /// <remarks>
  /// **** VBROADCASTSD ymm1, m64
  /// The value is read as a double and handed to the v256(double) constructor.
  /// </remarks>
  /// <param name="ptr">Pointer to the 64-bit element to read</param>
  /// <returns>Vector constructed from the double value</returns>
  [DebuggerStepThrough]
  public static v256 mm256_broadcast_sd(void* ptr)
  {
      double value = *(double*)ptr;
      return new v256(value);
  }
  /// <summary>
  /// Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of dst.
  /// </summary>
  /// <remarks>
  /// **** VBROADCASTF128 ymm1, v128
  /// The 128 bits are loaded once with Sse.loadu_ps and used for both halves
  /// of the result.
  /// </remarks>
  /// <param name="ptr">Pointer to the 128 bits to read</param>
  /// <returns>Vector whose low and high halves both hold the loaded value</returns>
  [DebuggerStepThrough]
  public static v256 mm256_broadcast_ps(void* ptr)
  {
      v128 loaded = Sse.loadu_ps(ptr);
      return new v256(loaded, loaded);
  }
  /// <summary>
  /// Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of dst.
  /// </summary>
  /// <remarks>
  /// **** VBROADCASTF128 ymm1, v128
  /// Delegates to mm256_broadcast_ps with the same pointer.
  /// </remarks>
  /// <param name="ptr">Pointer to the 128 bits to read</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_broadcast_pd(void* ptr)
  {
      v256 broadcast = mm256_broadcast_ps(ptr);
      return broadcast;
  }
  /// <summary>
  /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
  /// </summary>
  /// <remarks>
  /// **** VINSERTF128 ymm1, ymm2, xmm3/v128, imm8
  /// Bit 0 of imm8 decides which half of a is replaced by b: 0 replaces the low
  /// half, 1 replaces the high half. The other half is taken from a unchanged.
  /// </remarks>
  /// <param name="a">Vector supplying the half that is kept</param>
  /// <param name="b">128-bit vector to insert</param>
  /// <param name="imm8">Selector; only the lowest bit is used</param>
  /// <returns>Copy of a with one half replaced by b</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insertf128_ps(v256 a, v128 b, int imm8)
  {
      bool replaceHighHalf = (imm8 & 1) != 0;
      return replaceHighHalf
          ? new v256(a.Lo128, b)
          : new v256(b, a.Hi128);
  }
  /// <summary>
  /// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
  /// </summary>
  /// <remarks>
  /// **** VINSERTF128 ymm1, ymm2, xmm3/v128, imm8
  /// Delegates to mm256_insertf128_ps with the same arguments.
  /// </remarks>
  /// <param name="a">Vector supplying the half that is kept</param>
  /// <param name="b">128-bit vector to insert</param>
  /// <param name="imm8">Selector</param>
  /// <returns>Copy of a with one half replaced by b</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insertf128_pd(v256 a, v128 b, int imm8)
  {
      v256 combined = mm256_insertf128_ps(a, b, imm8);
      return combined;
  }
  /// <summary>
  /// Copy a to dst, then insert 128 bits of integer data from b into dst at the location specified by imm8.
  /// </summary>
  /// <remarks>
  /// **** VINSERTF128 ymm1, ymm2, xmm3/v128, imm8
  /// Delegates to mm256_insertf128_ps with the same arguments.
  /// </remarks>
  /// <param name="a">Vector supplying the half that is kept</param>
  /// <param name="b">128-bit vector to insert</param>
  /// <param name="imm8">Selector</param>
  /// <returns>Copy of a with one half replaced by b</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insertf128_si256(v256 a, v128 b, int imm8)
  {
      v256 combined = mm256_insertf128_ps(a, b, imm8);
      return combined;
  }
  /// <summary>
  /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS ymm1, v256
  /// The managed implementation is a plain dereference of ptr as a v256 and
  /// performs no alignment check.
  /// </remarks>
  /// <param name="ptr">Pointer to the 256 bits to read</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_load_ps(void* ptr)
  {
      v256* source = (v256*)ptr;
      return *source;
  }
  /// <summary>
  /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from val into memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS v256, ymm1
  /// Burst only generates unaligned stores.
  /// </remarks>
  /// <param name="ptr">Pointer to the destination memory</param>
  /// <param name="val">Vector to write</param>
  [DebuggerStepThrough]
  public static void mm256_store_ps(void* ptr, v256 val)
  {
      v256* destination = (v256*)ptr;
      *destination = val;
  }
  /// <summary>
  /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS ymm1, v256
  /// Delegates to mm256_load_ps; the bits are loaded without interpretation.
  /// </remarks>
  /// <param name="ptr">Pointer to the 256 bits to read</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_load_pd(void* ptr)
  {
      v256 loaded = mm256_load_ps(ptr);
      return loaded;
  }
  /// <summary>
  /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS v256, ymm1
  /// Burst only generates unaligned stores.
  /// </remarks>
  /// <param name="ptr">Pointer to the destination memory</param>
  /// <param name="a">Vector to write</param>
  [DebuggerStepThrough]
  public static void mm256_store_pd(void* ptr, v256 a)
  {
      // Same raw 256-bit write that mm256_store_ps performs.
      v256* destination = (v256*)ptr;
      *destination = a;
  }
  /// <summary>
  /// Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS ymm1, v256
  /// Delegates to mm256_load_ps; the bits are loaded without interpretation.
  /// </remarks>
  /// <param name="ptr">Pointer to the 256 bits to read</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_loadu_pd(void* ptr)
  {
      v256 loaded = mm256_load_ps(ptr);
      return loaded;
  }
  /// <summary>
  /// Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS v256, ymm1
  /// Burst only generates unaligned stores.
  /// </remarks>
  /// <param name="ptr">Pointer to the destination memory</param>
  /// <param name="a">Vector to write</param>
  [DebuggerStepThrough]
  public static void mm256_storeu_pd(void* ptr, v256 a)
  {
      // Same raw 256-bit write that mm256_store_ps performs.
      v256* destination = (v256*)ptr;
      *destination = a;
  }
  /// <summary>
  /// Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS ymm1, v256
  /// Delegates to mm256_load_ps.
  /// </remarks>
  /// <param name="ptr">Pointer to the 256 bits to read</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_loadu_ps(void* ptr)
  {
      v256 loaded = mm256_load_ps(ptr);
      return loaded;
  }
  /// <summary>
  /// Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory
  /// </summary>
  /// <remarks>
  /// **** VMOVUPS v256, ymm1
  /// Burst only generates unaligned stores.
  /// </remarks>
  /// <param name="ptr">Pointer to the destination memory</param>
  /// <param name="a">Vector to write</param>
  [DebuggerStepThrough]
  public static void mm256_storeu_ps(void* ptr, v256 a)
  {
      // Same raw 256-bit write that mm256_store_ps performs.
      v256* destination = (v256*)ptr;
      *destination = a;
  }
  /// <summary>
  /// Load 256-bits (composed of 8 packed 32-bit integer elements) from memory
  /// </summary>
  /// <remarks>
  /// **** VMOVDQU ymm1, v256
  /// Delegates to mm256_load_ps; the bits are loaded without interpretation.
  /// </remarks>
  /// <param name="ptr">Pointer to the 256 bits to read</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_load_si256(void* ptr)
  {
      v256 loaded = mm256_load_ps(ptr);
      return loaded;
  }
  /// <summary>
  /// Store 256-bits (composed of 8 packed 32-bit integer elements) from v into memory
  /// </summary>
  /// <remarks>
  /// **** VMOVDQU v256, ymm1
  /// Burst only generates unaligned stores.
  /// </remarks>
  /// <param name="ptr">Pointer to the destination memory</param>
  /// <param name="v">Vector to write</param>
  [DebuggerStepThrough]
  public static void mm256_store_si256(void* ptr, v256 v)
  {
      // Same raw 256-bit write that mm256_store_ps performs.
      v256* destination = (v256*)ptr;
      *destination = v;
  }
  /// <summary>
  /// Load 256-bits (composed of 8 packed 32-bit integer elements) from memory
  /// </summary>
  /// <remarks>
  /// **** VMOVDQU ymm1, v256
  /// Delegates to mm256_load_ps; the bits are loaded without interpretation.
  /// </remarks>
  /// <param name="ptr">Pointer to the 256 bits to read</param>
  /// <returns>The loaded vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_loadu_si256(void* ptr)
  {
      v256 loaded = mm256_load_ps(ptr);
      return loaded;
  }
  /// <summary>
  /// Store 256-bits (composed of 8 packed 32-bit integer elements) from v into memory
  /// </summary>
  /// <remarks>
  /// **** VMOVDQU v256, ymm1
  /// Burst only generates unaligned stores.
  /// </remarks>
  /// <param name="ptr">Pointer to the destination memory</param>
  /// <param name="v">Vector to write</param>
  [DebuggerStepThrough]
  public static void mm256_storeu_si256(void* ptr, v256 v)
  {
      // Same raw 256-bit write that mm256_store_ps performs.
      v256* destination = (v256*)ptr;
      *destination = v;
  }
  /// <summary>
  /// Load two 128-bit values (composed of 4 packed single-precision
  /// (32-bit) floating-point elements) from memory, and combine them
  /// into a 256-bit value in dst. hiaddr and loaddr do not need to
  /// be aligned on any particular boundary.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// Both halves are read with Sse.loadu_ps and combined with mm256_set_m128.
  /// </remarks>
  /// <param name="hiaddr">Pointer to the 128 bits that become the high half</param>
  /// <param name="loaddr">Pointer to the 128 bits that become the low half</param>
  /// <returns>Vector combining both loaded halves</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_loadu2_m128(void* hiaddr, void* loaddr)
  {
      // The high half is read first, then the low half.
      v128 upper = Sse.loadu_ps(hiaddr);
      v128 lower = Sse.loadu_ps(loaddr);
      return mm256_set_m128(upper, lower);
  }
  /// <summary>
  /// Load two 128-bit values (composed of 2 packed double-precision
  /// (64-bit) floating-point elements) from memory, and combine them
  /// into a 256-bit value in dst. hiaddr and loaddr do not need to
  /// be aligned on any particular boundary.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// Delegates to mm256_loadu2_m128 with the same pointers.
  /// </remarks>
  /// <param name="hiaddr">Pointer to the 128 bits that become the high half</param>
  /// <param name="loaddr">Pointer to the 128 bits that become the low half</param>
  /// <returns>Vector combining both loaded halves</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_loadu2_m128d(void* hiaddr, void* loaddr)
  {
      v256 combined = mm256_loadu2_m128(hiaddr, loaddr);
      return combined;
  }
  /// <summary>
  /// Load two 128-bit values (composed of integer data) from memory,
  /// and combine them into a 256-bit value in dst. hiaddr and loaddr
  /// do not need to be aligned on any particular boundary.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// Delegates to mm256_loadu2_m128 with the same pointers.
  /// </remarks>
  /// <param name="hiaddr">Pointer to the 128 bits that become the high half</param>
  /// <param name="loaddr">Pointer to the 128 bits that become the low half</param>
  /// <returns>Vector combining both loaded halves</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_loadu2_m128i(void* hiaddr, void* loaddr)
  {
      v256 combined = mm256_loadu2_m128(hiaddr, loaddr);
      return combined;
  }
  /// <summary>
  /// Build a 256-bit vector from two 128-bit halves.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// Note the argument order: the high half comes first, while the v256
  /// constructor used here takes the low half first.
  /// </remarks>
  /// <param name="hi">High half of the vector</param>
  /// <param name="lo">Low half of the vector</param>
  /// <returns>Vector with lo as its low half and hi as its high half</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_m128(v128 hi, v128 lo)
  {
      v256 combined = new v256(lo, hi);
      return combined;
  }
  /// <summary>
  /// Store the high and low 128-bit halves (each composed of 4
  /// packed single-precision (32-bit) floating-point elements) from
  /// val into two different 128-bit memory locations. hiaddr and
  /// loaddr do not need to be aligned on any particular boundary.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// The high half is written first, then the low half, both with Sse.storeu_ps.
  /// </remarks>
  /// <param name="hiaddr">Destination of the high half</param>
  /// <param name="loaddr">Destination of the low half</param>
  /// <param name="val">Vector whose halves are stored</param>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static void mm256_storeu2_m128(void* hiaddr, void* loaddr, v256 val)
  {
      v128 upper = val.Hi128;
      v128 lower = val.Lo128;
      // Keep the write order: high half first, low half second.
      Sse.storeu_ps(hiaddr, upper);
      Sse.storeu_ps(loaddr, lower);
  }
  /// <summary>
  /// Store the high and low 128-bit halves (each composed of 2
  /// packed double-precision (64-bit) floating-point elements) from
  /// val into two different 128-bit memory locations. hiaddr and
  /// loaddr do not need to be aligned on any particular boundary.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// The high half is written first, then the low half, both with Sse.storeu_ps.
  /// </remarks>
  /// <param name="hiaddr">Destination of the high half</param>
  /// <param name="loaddr">Destination of the low half</param>
  /// <param name="val">Vector whose halves are stored</param>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static void mm256_storeu2_m128d(void* hiaddr, void* loaddr, v256 val)
  {
      v128 upper = val.Hi128;
      v128 lower = val.Lo128;
      // Keep the write order: high half first, low half second.
      Sse.storeu_ps(hiaddr, upper);
      Sse.storeu_ps(loaddr, lower);
  }
  /// <summary>
  /// Store the high and low 128-bit halves (each composed of integer
  /// data) from val into two different 128-bit memory locations. hiaddr
  /// and loaddr do not need to be aligned on any particular boundary.
  /// </summary>
  /// <remarks>
  /// This is a composite function which can generate more than one instruction.
  /// The high half is written first, then the low half, both with Sse.storeu_ps.
  /// </remarks>
  /// <param name="hiaddr">Destination of the high half</param>
  /// <param name="loaddr">Destination of the low half</param>
  /// <param name="val">Vector whose halves are stored</param>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static void mm256_storeu2_m128i(void* hiaddr, void* loaddr, v256 val)
  {
      v128 upper = val.Hi128;
      v128 lower = val.Lo128;
      // Keep the write order: high half first, low half second.
      Sse.storeu_ps(hiaddr, upper);
      Sse.storeu_ps(loaddr, lower);
  }
  /// <summary>
  /// Load packed double-precision (64-bit) floating-point elements
  /// from memory into dst using mask (elements are zeroed out when
  /// the high bit of the corresponding element is not set).
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPD xmm1, xmm2, v128
  /// A 64-bit element is read from memory only when the matching mask element
  /// is negative as a signed 64-bit integer (its high bit is set).
  /// </remarks>
  /// <param name="mem_addr">Address of the first 64-bit element</param>
  /// <param name="mask">Mask with one 64-bit element per loaded element</param>
  /// <returns>Vector holding the loaded elements, zero where masked off</returns>
  [DebuggerStepThrough]
  public static v128 maskload_pd(void* mem_addr, v128 mask)
  {
      ulong* source = (ulong*)mem_addr;
      v128 loaded = default;
      // Memory is touched only for elements whose mask sign bit is set.
      if (mask.SLong0 < 0)
      {
          loaded.ULong0 = source[0];
      }
      if (mask.SLong1 < 0)
      {
          loaded.ULong1 = source[1];
      }
      return loaded;
  }
  /// <summary>
  /// Load packed double-precision (64-bit) floating-point elements
  /// from memory into dst using mask (elements are zeroed out when
  /// the high bit of the corresponding element is not set).
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPD ymm1, ymm2, v256
  /// Implemented as two maskload_pd calls: the low half at mem_addr and the
  /// high half 16 bytes further on.
  /// </remarks>
  /// <param name="mem_addr">Address of the first 64-bit element</param>
  /// <param name="mask">Mask with one 64-bit element per loaded element</param>
  /// <returns>Vector holding the loaded elements, zero where masked off</returns>
  [DebuggerStepThrough]
  public static v256 mm256_maskload_pd(void* mem_addr, v256 mask)
  {
      byte* upperAddress = ((byte*)mem_addr) + 16;
      v128 lowHalf = maskload_pd(mem_addr, mask.Lo128);
      v128 highHalf = maskload_pd(upperAddress, mask.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Store packed double-precision (64-bit) floating-point elements from a into memory using mask.
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPD v128, xmm1, xmm2
  /// A 64-bit element is written only when the matching mask element is
  /// negative as a signed 64-bit integer (its high bit is set).
  /// </remarks>
  /// <param name="mem_addr">Address of the first 64-bit destination element</param>
  /// <param name="mask">Mask with one 64-bit element per stored element</param>
  /// <param name="a">Vector holding the elements to store</param>
  [DebuggerStepThrough]
  public static void maskstore_pd(void* mem_addr, v128 mask, v128 a)
  {
      ulong* destination = (ulong*)mem_addr;
      bool writeFirst = mask.SLong0 < 0;
      bool writeSecond = mask.SLong1 < 0;
      // Memory is touched only for elements whose mask sign bit is set.
      if (writeFirst)
      {
          destination[0] = a.ULong0;
      }
      if (writeSecond)
      {
          destination[1] = a.ULong1;
      }
  }
  /// <summary>
  /// Store packed double-precision (64-bit) floating-point elements from a into memory using mask.
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPD v256, ymm1, ymm2
  /// Implemented as two maskstore_pd calls: the low half at mem_addr and the
  /// high half 16 bytes further on.
  /// </remarks>
  /// <param name="mem_addr">Address of the first 64-bit destination element</param>
  /// <param name="mask">Mask with one 64-bit element per stored element</param>
  /// <param name="a">Vector holding the elements to store</param>
  [DebuggerStepThrough]
  public static void mm256_maskstore_pd(void* mem_addr, v256 mask, v256 a)
  {
      byte* basePtr = (byte*)mem_addr;
      maskstore_pd(basePtr, mask.Lo128, a.Lo128);
      maskstore_pd(basePtr + 16, mask.Hi128, a.Hi128);
  }
  /// <summary>
  /// Masked load of four packed single-precision (32-bit) floating-point
  /// elements from memory. An element is loaded only when the high bit of
  /// the corresponding mask element is set; every other element of the
  /// result is zero.
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPS xmm1, xmm2, v128
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="mask">Mask</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 maskload_ps(void* mem_addr, v128 mask)
  {
      uint* lanes = (uint*)mem_addr;

      // Start from an all-zero vector so that masked-off lanes stay zero.
      v128 loaded = default;

      // Memory is only dereferenced for lanes whose mask high bit is set.
      if (mask.SInt0 < 0)
      {
          loaded.UInt0 = lanes[0];
      }

      if (mask.SInt1 < 0)
      {
          loaded.UInt1 = lanes[1];
      }

      if (mask.SInt2 < 0)
      {
          loaded.UInt2 = lanes[2];
      }

      if (mask.SInt3 < 0)
      {
          loaded.UInt3 = lanes[3];
      }

      return loaded;
  }
  /// <summary>
  /// Masked load of eight packed single-precision (32-bit) floating-point
  /// elements from memory. An element is loaded only when the high bit of
  /// the corresponding mask element is set; every other element of the
  /// result is zero.
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPS ymm1, ymm2, v256
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="mask">Mask</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_maskload_ps(void* mem_addr, v256 mask)
  {
      // The upper 128-bit half starts 16 bytes past the base address.
      byte* basePtr = (byte*)mem_addr;
      v128 lowHalf = maskload_ps(basePtr, mask.Lo128);
      v128 highHalf = maskload_ps(basePtr + 16, mask.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Masked store of four packed single-precision (32-bit) floating-point
  /// elements from a into memory. Only elements whose mask element has its
  /// high bit set are written; the other memory locations are left untouched.
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPS v128, xmm1, xmm2
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="mask">Mask</param>
  /// <param name="a">Vector a</param>
  [DebuggerStepThrough]
  public static void maskstore_ps(void* mem_addr, v128 mask, v128 a)
  {
      uint* lanes = (uint*)mem_addr;

      // A negative signed view of the mask lane means its high bit is set.
      if (mask.SInt0 < 0)
      {
          lanes[0] = a.UInt0;
      }

      if (mask.SInt1 < 0)
      {
          lanes[1] = a.UInt1;
      }

      if (mask.SInt2 < 0)
      {
          lanes[2] = a.UInt2;
      }

      if (mask.SInt3 < 0)
      {
          lanes[3] = a.UInt3;
      }
  }
  /// <summary>
  /// Masked store of eight packed single-precision (32-bit) floating-point
  /// elements from a into memory, handled as two 128-bit masked stores.
  /// </summary>
  /// <remarks>
  /// **** VMASKMOVPS v256, ymm1, ymm2
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="mask">Mask</param>
  /// <param name="a">Vector a</param>
  [DebuggerStepThrough]
  public static void mm256_maskstore_ps(void* mem_addr, v256 mask, v256 a)
  {
      byte* basePtr = (byte*)mem_addr;

      // Low half first, then the high half 16 bytes further on.
      maskstore_ps(basePtr, mask.Lo128, a.Lo128);
      maskstore_ps(basePtr + 16, mask.Hi128, a.Hi128);
  }
  1933. /*
  1934. * Replicate Single-Precision Floating-Point Values
  1935. * Duplicates odd-indexed single-precision floating-point values from the
  1936. * source operand
  1937. */
  /// <summary>
  /// Duplicate the odd-indexed single-precision (32-bit) floating-point
  /// elements of a: each pair of result elements (2k, 2k+1) receives
  /// element 2k+1 of a.
  /// </summary>
  /// <remarks>
  /// **** VMOVSHDUP ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_movehdup_ps(v256 a)
  {
      uint odd1 = a.UInt1;
      uint odd3 = a.UInt3;
      uint odd5 = a.UInt5;
      uint odd7 = a.UInt7;
      return new v256(odd1, odd1, odd3, odd3, odd5, odd5, odd7, odd7);
  }
  /// <summary>
  /// Duplicate the even-indexed single-precision (32-bit) floating-point
  /// elements of a: each pair of result elements (2k, 2k+1) receives
  /// element 2k of a.
  /// </summary>
  /// <remarks>
  /// **** VMOVSLDUP ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_moveldup_ps(v256 a)
  {
      uint even0 = a.UInt0;
      uint even2 = a.UInt2;
      uint even4 = a.UInt4;
      uint even6 = a.UInt6;
      return new v256(even0, even0, even2, even2, even4, even4, even6, even6);
  }
  /// <summary>
  /// Duplicate the even-indexed double-precision (64-bit) floating-point
  /// elements of a: result elements 0 and 1 receive element 0, result
  /// elements 2 and 3 receive element 2.
  /// </summary>
  /// <remarks>
  /// **** VMOVDDUP ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_movedup_pd(v256 a)
  {
      double even0 = a.Double0;
      double even2 = a.Double2;
      return new v256(even0, even0, even2, even2);
  }
  /// <summary>
  /// Load 256 bits of integer data from unaligned memory.
  /// This intrinsic may perform better than mm256_loadu_si256 when
  /// the data crosses a cache line boundary. This implementation is a
  /// plain read of a v256 through the pointer.
  /// </summary>
  /// <remarks>
  /// **** VLDDQU ymm1, v256
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_lddqu_si256(void* mem_addr)
  {
      v256* source = (v256*)mem_addr;
      return *source;
  }
  1992. /*
  1993. * Store Packed Integers Using Non-Temporal Hint
  1994. * **** VMOVNTDQ v256, ymm1
  1995. * Moves the packed integers in the source operand to the destination using a
  1996. * non-temporal hint to prevent caching of the data during the write to memory
  1997. */
  /// <summary>
  /// Store 256 bits of integer data from a into memory using a
  /// non-temporal memory hint. mem_addr must be aligned on a 32-byte
  /// boundary or a general-protection exception may be generated.
  /// This implementation is a plain write of a v256 through the pointer.
  /// </summary>
  /// <remarks>
  /// **** VMOVNTDQ v256, ymm1
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="a">Vector a</param>
  [DebuggerStepThrough]
  public static void mm256_stream_si256(void* mem_addr, v256 a)
  {
      v256* destination = (v256*)mem_addr;
      *destination = a;
  }
  /// <summary>
  /// Store 256 bits (4 packed double-precision (64-bit) floating-point
  /// elements) from a into memory using a non-temporal memory hint.
  /// mem_addr must be aligned on a 32-byte boundary or a
  /// general-protection exception may be generated.
  /// This implementation is a plain write of a v256 through the pointer.
  /// </summary>
  /// <remarks>
  /// **** VMOVNTPD v256, ymm1
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="a">Vector a</param>
  [DebuggerStepThrough]
  public static void mm256_stream_pd(void* mem_addr, v256 a)
  {
      v256* destination = (v256*)mem_addr;
      *destination = a;
  }
  /// <summary>
  /// Store 256 bits (8 packed single-precision (32-bit) floating-point
  /// elements) from a into memory using a non-temporal memory hint.
  /// mem_addr must be aligned on a 32-byte boundary or a
  /// general-protection exception may be generated.
  /// This implementation is a plain write of a v256 through the pointer.
  /// </summary>
  /// <remarks>
  /// **** VMOVNTPS v256, ymm1
  /// </remarks>
  /// <param name="mem_addr">Memory address</param>
  /// <param name="a">Vector a</param>
  [DebuggerStepThrough]
  public static void mm256_stream_ps(void* mem_addr, v256 a)
  {
      v256* destination = (v256*)mem_addr;
      *destination = a;
  }
  /// <summary>
  /// Compute the approximate reciprocal of each packed single-precision
  /// (32-bit) floating-point element in a. The maximum relative error
  /// for this approximation is less than 1.5*2^-12.
  /// </summary>
  /// <remarks>
  /// **** VRCPPS ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_rcp_ps(v256 a)
  {
      // Each 128-bit half is handled by the SSE implementation.
      v128 lowHalf = Sse.rcp_ps(a.Lo128);
      v128 highHalf = Sse.rcp_ps(a.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Compute the approximate reciprocal square root of each packed
  /// single-precision (32-bit) floating-point element in a. The maximum
  /// relative error for this approximation is less than 1.5*2^-12.
  /// </summary>
  /// <remarks>
  /// **** VRSQRTPS ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_rsqrt_ps(v256 a)
  {
      // Each 128-bit half is handled by the SSE implementation.
      v128 lowHalf = Sse.rsqrt_ps(a.Lo128);
      v128 highHalf = Sse.rsqrt_ps(a.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Compute the square root of each packed double-precision (64-bit)
  /// floating-point element in a.
  /// </summary>
  /// <remarks>
  /// **** VSQRTPD ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_sqrt_pd(v256 a)
  {
      // Each 128-bit half is handled by the SSE2 implementation.
      v128 lowHalf = Sse2.sqrt_pd(a.Lo128);
      v128 highHalf = Sse2.sqrt_pd(a.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Compute the square root of each packed single-precision (32-bit)
  /// floating-point element in a.
  /// </summary>
  /// <remarks>
  /// **** VSQRTPS ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_sqrt_ps(v256 a)
  {
      // Each 128-bit half is handled by the SSE implementation.
      v128 lowHalf = Sse.sqrt_ps(a.Lo128);
      v128 highHalf = Sse.sqrt_ps(a.Hi128);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Round each packed double-precision (64-bit) floating-point element
  /// in a using the rounding parameter, and return the results as packed
  /// double-precision floating-point elements.
  /// </summary>
  /// <remarks>
  /// **** VROUNDPD ymm1,ymm2/v256,imm8
  /// Rounding is done according to the rounding parameter, which can be one of:
  /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
  /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
  /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
  /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
  /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="rounding">Rounding mode</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_round_pd(v256 a, int rounding)
  {
      // Each 128-bit half is rounded by the SSE4.1 implementation.
      v128 lowHalf = Sse4_1.round_pd(a.Lo128, rounding);
      v128 highHalf = Sse4_1.round_pd(a.Hi128, rounding);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Round each packed double-precision (64-bit) floating-point element
  /// in val up to an integer value, and return the results as packed
  /// double-precision floating-point elements.
  /// </summary>
  /// <param name="val">Value</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_ceil_pd(v256 val)
  {
      int ceilMode = (int)RoundingMode.FROUND_CEIL;
      return mm256_round_pd(val, ceilMode);
  }
  /// <summary>
  /// Round each packed double-precision (64-bit) floating-point element
  /// in val down to an integer value, and return the results as packed
  /// double-precision floating-point elements.
  /// </summary>
  /// <param name="val">Value</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_floor_pd(v256 val)
  {
      int floorMode = (int)RoundingMode.FROUND_FLOOR;
      return mm256_round_pd(val, floorMode);
  }
  /// <summary>
  /// Round each packed single-precision (32-bit) floating-point element
  /// in a using the rounding parameter, and return the results as packed
  /// single-precision floating-point elements.
  /// </summary>
  /// <remarks>
  /// **** VROUNDPS ymm1,ymm2/v256,imm8
  /// Round the eight single-precision floating-point values in the source
  /// operand by the rounding mode specified in the immediate operand and place
  /// the result in the destination. The rounding process rounds the input to an
  /// integral value and returns the result as a single-precision floating-point
  /// value. The Precision Floating Point Exception is signaled according to the
  /// immediate operand. If any source operand is an SNaN then it will be
  /// converted to a QNaN.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="rounding">Rounding mode</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_round_ps(v256 a, int rounding)
  {
      // Each 128-bit half is rounded by the SSE4.1 implementation.
      v128 lowHalf = Sse4_1.round_ps(a.Lo128, rounding);
      v128 highHalf = Sse4_1.round_ps(a.Hi128, rounding);
      return new v256(lowHalf, highHalf);
  }
  /// <summary>
  /// Round each packed single-precision (32-bit) floating-point element
  /// in val up to an integer value, and return the results as packed
  /// single-precision floating-point elements.
  /// </summary>
  /// <param name="val">Value</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_ceil_ps(v256 val)
  {
      int ceilMode = (int)RoundingMode.FROUND_CEIL;
      return mm256_round_ps(val, ceilMode);
  }
  /// <summary>
  /// Round each packed single-precision (32-bit) floating-point element
  /// in val down to an integer value, and return the results as packed
  /// single-precision floating-point elements.
  /// </summary>
  /// <param name="val">Value</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_floor_ps(v256 val)
  {
      int floorMode = (int)RoundingMode.FROUND_FLOOR;
      return mm256_round_ps(val, floorMode);
  }
  /// <summary>
  /// Unpack and interleave double-precision (64-bit) floating-point
  /// elements from the high half of each 128-bit lane in a and b.
  /// </summary>
  /// <remarks>
  /// **** VUNPCKHPD ymm1,ymm2,ymm3/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_unpackhi_pd(v256 a, v256 b)
  {
      // The operation is per 128-bit lane, so each lane goes through SSE2.
      v128 lowLane = Sse2.unpackhi_pd(a.Lo128, b.Lo128);
      v128 highLane = Sse2.unpackhi_pd(a.Hi128, b.Hi128);
      return new v256(lowLane, highLane);
  }
  /// <summary>
  /// Unpack and interleave double-precision (64-bit) floating-point
  /// elements from the low half of each 128-bit lane in a and b.
  /// </summary>
  /// <remarks>
  /// **** VUNPCKLPD ymm1,ymm2,ymm3/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_unpacklo_pd(v256 a, v256 b)
  {
      // The operation is per 128-bit lane, so each lane goes through SSE2.
      v128 lowLane = Sse2.unpacklo_pd(a.Lo128, b.Lo128);
      v128 highLane = Sse2.unpacklo_pd(a.Hi128, b.Hi128);
      return new v256(lowLane, highLane);
  }
  /// <summary>
  /// Unpack and interleave single-precision (32-bit) floating-point
  /// elements from the high half of each 128-bit lane in a and b.
  /// </summary>
  /// <remarks>
  /// **** VUNPCKHPS ymm1,ymm2,ymm3/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_unpackhi_ps(v256 a, v256 b)
  {
      // The operation is per 128-bit lane, so each lane goes through SSE.
      v128 lowLane = Sse.unpackhi_ps(a.Lo128, b.Lo128);
      v128 highLane = Sse.unpackhi_ps(a.Hi128, b.Hi128);
      return new v256(lowLane, highLane);
  }
  /// <summary>
  /// Unpack and interleave single-precision (32-bit) floating-point
  /// elements from the low half of each 128-bit lane in a and b.
  /// </summary>
  /// <remarks>
  /// **** VUNPCKLPS ymm1,ymm2,ymm3/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_unpacklo_ps(v256 a, v256 b)
  {
      // The operation is per 128-bit lane, so each lane goes through SSE.
      v128 lowLane = Sse.unpacklo_ps(a.Lo128, b.Lo128);
      v128 highLane = Sse.unpacklo_ps(a.Hi128, b.Hi128);
      return new v256(lowLane, highLane);
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing integer data)
  /// in a and b, and set ZF to 1 if the result is zero, otherwise
  /// set ZF to 0. Compute the bitwise NOT of a and then AND with b,
  /// and set CF to 1 if the result is zero, otherwise set CF to 0.
  /// Return the ZF value.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>ZF value</returns>
  [DebuggerStepThrough]
  public static int mm256_testz_si256(v256 a, v256 b)
  {
      // The 256-bit result is the AND of the two 128-bit ZF results.
      int lowZf = Sse4_1.testz_si128(a.Lo128, b.Lo128);
      int highZf = Sse4_1.testz_si128(a.Hi128, b.Hi128);
      return lowZf & highZf;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing integer data)
  /// in a and b, and set ZF to 1 if the result is zero, otherwise
  /// set ZF to 0. Compute the bitwise NOT of a and then AND with b,
  /// and set CF to 1 if the result is zero, otherwise set CF to 0.
  /// Return the CF value.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>CF value</returns>
  [DebuggerStepThrough]
  public static int mm256_testc_si256(v256 a, v256 b)
  {
      // The 256-bit result is the AND of the two 128-bit CF results.
      int lowCf = Sse4_1.testc_si128(a.Lo128, b.Lo128);
      int highCf = Sse4_1.testc_si128(a.Hi128, b.Hi128);
      return lowCf & highCf;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing integer data)
  /// in a and b, and set ZF to 1 if the result is zero, otherwise
  /// set ZF to 0. Compute the bitwise NOT of a and then AND with b,
  /// and set CF to 1 if the result is zero, otherwise set CF to 0.
  /// Return 1 if both the ZF and CF values are zero, otherwise
  /// return 0.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Integer</returns>
  [DebuggerStepThrough]
  public static int mm256_testnzc_si256(v256 a, v256 b)
  {
      // Combine ZF and CF, then invert the combined flag.
      int eitherFlag = mm256_testz_si256(a, b) | mm256_testc_si256(a, b);
      return 1 - eitherFlag;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing
  /// double-precision (64-bit) floating-point elements) in a and b,
  /// producing an intermediate 256-bit value, and set ZF to 1 if the
  /// sign bit of each 64-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 64-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the ZF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPD ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>ZF value</returns>
  [DebuggerStepThrough]
  public static int mm256_testz_pd(v256 a, v256 b)
  {
      const ulong SignBit = 0x8000_0000_0000_0000;
      ulong* lhs = &a.ULong0;
      ulong* rhs = &b.ULong0;

      // OR together (a AND b) for all four lanes; a sign bit survives in the
      // accumulator exactly when at least one lane has it set.
      ulong combined = 0;
      for (int lane = 0; lane < 4; lane++)
      {
          combined |= lhs[lane] & rhs[lane];
      }

      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing
  /// double-precision (64-bit) floating-point elements) in a and b,
  /// producing an intermediate 256-bit value, and set ZF to 1 if the
  /// sign bit of each 64-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 64-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the CF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPD ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>CF value</returns>
  [DebuggerStepThrough]
  public static int mm256_testc_pd(v256 a, v256 b)
  {
      const ulong SignBit = 0x8000_0000_0000_0000;
      ulong* lhs = &a.ULong0;
      ulong* rhs = &b.ULong0;

      // OR together ((NOT a) AND b) for all four lanes; a sign bit survives
      // in the accumulator exactly when at least one lane has it set.
      ulong combined = 0;
      for (int lane = 0; lane < 4; lane++)
      {
          combined |= ~lhs[lane] & rhs[lane];
      }

      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing
  /// double-precision (64-bit) floating-point elements) in a and b,
  /// producing an intermediate 256-bit value, and set ZF to 1 if the
  /// sign bit of each 64-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 64-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return 1 if both the ZF
  /// and CF values are zero, otherwise return 0.
  /// </summary>
  /// <remarks>
  /// **** VTESTPD ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Integer</returns>
  [DebuggerStepThrough]
  public static int mm256_testnzc_pd(v256 a, v256 b)
  {
      int zf = mm256_testz_pd(a, b);
      int cf = mm256_testc_pd(a, b);

      // Both flags are 0 or 1, so the result is 1 only when neither is set.
      return (zf == 0 && cf == 0) ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 128 bits (representing
  /// double-precision (64-bit) floating-point elements) in a and b,
  /// producing an intermediate 128-bit value, and set ZF to 1 if the
  /// sign bit of each 64-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 64-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the ZF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPD xmm1, xmm2/v128
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>ZF value</returns>
  [DebuggerStepThrough]
  public static int testz_pd(v128 a, v128 b)
  {
      const ulong SignBit = 0x8000_0000_0000_0000;

      // OR the per-lane (a AND b) values so one test covers both sign bits.
      ulong combined = (a.ULong0 & b.ULong0) | (a.ULong1 & b.ULong1);
      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 128 bits (representing
  /// double-precision (64-bit) floating-point elements) in a and b,
  /// producing an intermediate 128-bit value, and set ZF to 1 if the
  /// sign bit of each 64-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 64-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the CF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPD xmm1, xmm2/v128
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>CF value</returns>
  [DebuggerStepThrough]
  public static int testc_pd(v128 a, v128 b)
  {
      const ulong SignBit = 0x8000_0000_0000_0000;

      // OR the per-lane ((NOT a) AND b) values so one test covers both sign bits.
      ulong combined = (~a.ULong0 & b.ULong0) | (~a.ULong1 & b.ULong1);
      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 128 bits (representing
  /// double-precision (64-bit) floating-point elements) in a and b,
  /// producing an intermediate 128-bit value, and set ZF to 1 if the
  /// sign bit of each 64-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 64-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return 1 if both the ZF
  /// and CF values are zero, otherwise return 0.
  /// </summary>
  /// <remarks>
  /// **** VTESTPD xmm1, xmm2/v128
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Integer</returns>
  [DebuggerStepThrough]
  public static int testnzc_pd(v128 a, v128 b)
  {
      int zf = testz_pd(a, b);
      int cf = testc_pd(a, b);

      // Both flags are 0 or 1, so the result is 1 only when neither is set.
      return (zf == 0 && cf == 0) ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing
  /// single-precision (32-bit) floating-point elements) in a and b,
  /// producing an intermediate 256-bit value, and set ZF to 1 if the
  /// sign bit of each 32-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 32-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the ZF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPS ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>ZF value</returns>
  [DebuggerStepThrough]
  public static int mm256_testz_ps(v256 a, v256 b)
  {
      const uint SignBit = 0x8000_0000;
      uint* lhs = &a.UInt0;
      uint* rhs = &b.UInt0;

      // OR together (a AND b) for all eight lanes; a sign bit survives in
      // the accumulator exactly when at least one lane has it set.
      uint combined = 0;
      for (int lane = 0; lane < 8; lane++)
      {
          combined |= lhs[lane] & rhs[lane];
      }

      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing
  /// single-precision (32-bit) floating-point elements) in a and b,
  /// producing an intermediate 256-bit value, and set ZF to 1 if the
  /// sign bit of each 32-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 32-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the CF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPS ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>CF value</returns>
  [DebuggerStepThrough]
  public static int mm256_testc_ps(v256 a, v256 b)
  {
      const uint SignBit = 0x8000_0000;
      uint* lhs = &a.UInt0;
      uint* rhs = &b.UInt0;

      // OR together ((NOT a) AND b) for all eight lanes; a sign bit survives
      // in the accumulator exactly when at least one lane has it set.
      uint combined = 0;
      for (int lane = 0; lane < 8; lane++)
      {
          combined |= ~lhs[lane] & rhs[lane];
      }

      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 256 bits (representing
  /// single-precision (32-bit) floating-point elements) in a and b,
  /// producing an intermediate 256-bit value, and set ZF to 1 if the
  /// sign bit of each 32-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 32-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return 1 if both the ZF
  /// and CF values are zero, otherwise return 0.
  /// </summary>
  /// <remarks>
  /// **** VTESTPS ymm1, ymm2/v256
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>Integer</returns>
  [DebuggerStepThrough]
  public static int mm256_testnzc_ps(v256 a, v256 b)
  {
      int zf = mm256_testz_ps(a, b);
      int cf = mm256_testc_ps(a, b);

      // Both flags are 0 or 1, so the result is 1 only when neither is set.
      return (zf == 0 && cf == 0) ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 128 bits (representing
  /// single-precision (32-bit) floating-point elements) in a and b,
  /// producing an intermediate 128-bit value, and set ZF to 1 if the
  /// sign bit of each 32-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 32-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the ZF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPS xmm1, xmm2/v128
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>ZF value</returns>
  [DebuggerStepThrough]
  public static int testz_ps(v128 a, v128 b)
  {
      const uint SignBit = 0x8000_0000;

      // OR the per-lane (a AND b) values so one test covers all four sign bits.
      uint combined = (a.UInt0 & b.UInt0)
                    | (a.UInt1 & b.UInt1)
                    | (a.UInt2 & b.UInt2)
                    | (a.UInt3 & b.UInt3);
      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Compute the bitwise AND of 128 bits (representing
  /// single-precision (32-bit) floating-point elements) in a and b,
  /// producing an intermediate 128-bit value, and set ZF to 1 if the
  /// sign bit of each 32-bit element in the intermediate value is
  /// zero, otherwise set ZF to 0. Compute the bitwise NOT of a and
  /// then AND with b, producing an intermediate value, and set CF to
  /// 1 if the sign bit of each 32-bit element in the intermediate
  /// value is zero, otherwise set CF to 0. Return the CF value.
  /// </summary>
  /// <remarks>
  /// **** VTESTPS xmm1, xmm2/v128
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>CF value</returns>
  [DebuggerStepThrough]
  public static int testc_ps(v128 a, v128 b)
  {
      const uint SignBit = 0x8000_0000;

      // OR the per-lane ((NOT a) AND b) values so one test covers all four sign bits.
      uint combined = (~a.UInt0 & b.UInt0)
                    | (~a.UInt1 & b.UInt1)
                    | (~a.UInt2 & b.UInt2)
                    | (~a.UInt3 & b.UInt3);
      return (combined & SignBit) == 0 ? 1 : 0;
  }
  /// <summary>
  /// Combines the two VTESTPS flag results for 128-bit operands.
  /// ZF is taken from <c>testz_ps(a, b)</c> (sign bits of a AND b) and
  /// CF from <c>testc_ps(a, b)</c> (sign bits of (NOT a) AND b).
  /// Returns 1 when both flags are zero, otherwise 0.
  /// </summary>
  /// <remarks>
  /// **** VTESTPS xmm1, xmm2/v128
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="b">Vector b</param>
  /// <returns>1 if ZF and CF are both 0, otherwise 0</returns>
  [DebuggerStepThrough]
  public static int testnzc_ps(v128 a, v128 b)
  {
      // Both helpers return exactly 0 or 1, so an explicit comparison
      // is equivalent to the arithmetic form 1 - (zf | cf).
      int zf = testz_ps(a, b);
      int cf = testc_ps(a, b);
      return (zf == 0 && cf == 0) ? 1 : 0;
  }
  /// <summary>
  /// Build a bit mask from the most significant bit of each packed
  /// double-precision (64-bit) floating-point element in a.
  /// </summary>
  /// <remarks>
  /// **** VMOVMSKPD r32, ymm2
  /// Implemented by taking the Sse2.movemask_pd result of each 128-bit
  /// half; the high half's mask is shifted left by 2 and merged with the
  /// low half's mask, giving a 4-bit mask.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Integer mask</returns>
  [DebuggerStepThrough]
  public static int mm256_movemask_pd(v256 a)
  {
      int lowMask = Sse2.movemask_pd(a.Lo128);
      int highMask = Sse2.movemask_pd(a.Hi128);
      return lowMask | (highMask << 2);
  }
  /// <summary>
  /// Build a bit mask from the most significant bit of each packed
  /// single-precision (32-bit) floating-point element in a.
  /// </summary>
  /// <remarks>
  /// **** VMOVMSKPS r32, ymm2
  /// Implemented by taking the Sse.movemask_ps result of each 128-bit
  /// half; the high half's mask is shifted left by 4 and merged with the
  /// low half's mask, giving an 8-bit mask.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <returns>Integer mask</returns>
  [DebuggerStepThrough]
  public static int mm256_movemask_ps(v256 a)
  {
      int lowMask = Sse.movemask_ps(a.Lo128);
      int highMask = Sse.movemask_ps(a.Hi128);
      return lowMask | (highMask << 4);
  }
  2662. // Normal IR is fine for this
  /// <summary>
  /// Return a 256-bit vector with every element zero (the default v256 value).
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setzero_pd() => default;
  /// <summary>
  /// Return a 256-bit vector with every element zero (the default v256 value).
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setzero_ps() => default;
  /// <summary>
  /// Return a 256-bit vector with every element zero (the default v256 value).
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setzero_si256() => default;
  /// <summary>
  /// Build a vector of four packed double-precision (64-bit) floating-point
  /// elements from the supplied values. The arguments are handed to the
  /// v256 constructor in the opposite order to the parameter list, so the
  /// last parameter (a) becomes the first constructor argument.
  /// </summary>
  /// <param name="d">Element d (last constructor argument)</param>
  /// <param name="c">Element c</param>
  /// <param name="b">Element b</param>
  /// <param name="a">Element a (first constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_pd(double d, double c, double b, double a) => new v256(a, b, c, d);
  /// <summary>
  /// Build a vector of eight packed single-precision (32-bit) floating-point
  /// elements from the supplied values. Parameters are listed from element 7
  /// down to element 0 and are passed to the v256 constructor starting with e0.
  /// </summary>
  /// <param name="e7">Element 7</param>
  /// <param name="e6">Element 6</param>
  /// <param name="e5">Element 5</param>
  /// <param name="e4">Element 4</param>
  /// <param name="e3">Element 3</param>
  /// <param name="e2">Element 2</param>
  /// <param name="e1">Element 1</param>
  /// <param name="e0">Element 0</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) =>
      new v256(
          e0, e1, e2, e3,
          e4, e5, e6, e7);
  /// <summary>
  /// Build a vector of 32 packed byte elements from the supplied values.
  /// Parameters are listed from element 31 down to element 0 and are passed
  /// to the v256 constructor starting with e0_.
  /// </summary>
  /// <param name="e31_">Element 31</param>
  /// <param name="e30_">Element 30</param>
  /// <param name="e29_">Element 29</param>
  /// <param name="e28_">Element 28</param>
  /// <param name="e27_">Element 27</param>
  /// <param name="e26_">Element 26</param>
  /// <param name="e25_">Element 25</param>
  /// <param name="e24_">Element 24</param>
  /// <param name="e23_">Element 23</param>
  /// <param name="e22_">Element 22</param>
  /// <param name="e21_">Element 21</param>
  /// <param name="e20_">Element 20</param>
  /// <param name="e19_">Element 19</param>
  /// <param name="e18_">Element 18</param>
  /// <param name="e17_">Element 17</param>
  /// <param name="e16_">Element 16</param>
  /// <param name="e15_">Element 15</param>
  /// <param name="e14_">Element 14</param>
  /// <param name="e13_">Element 13</param>
  /// <param name="e12_">Element 12</param>
  /// <param name="e11_">Element 11</param>
  /// <param name="e10_">Element 10</param>
  /// <param name="e9_">Element 9</param>
  /// <param name="e8_">Element 8</param>
  /// <param name="e7_">Element 7</param>
  /// <param name="e6_">Element 6</param>
  /// <param name="e5_">Element 5</param>
  /// <param name="e4_">Element 4</param>
  /// <param name="e3_">Element 3</param>
  /// <param name="e2_">Element 2</param>
  /// <param name="e1_">Element 1</param>
  /// <param name="e0_">Element 0</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_epi8(
      byte e31_, byte e30_, byte e29_, byte e28_, byte e27_, byte e26_, byte e25_, byte e24_,
      byte e23_, byte e22_, byte e21_, byte e20_, byte e19_, byte e18_, byte e17_, byte e16_,
      byte e15_, byte e14_, byte e13_, byte e12_, byte e11_, byte e10_, byte e9_, byte e8_,
      byte e7_, byte e6_, byte e5_, byte e4_, byte e3_, byte e2_, byte e1_, byte e0_) =>
      new v256(
          e0_, e1_, e2_, e3_,
          e4_, e5_, e6_, e7_,
          e8_, e9_, e10_, e11_,
          e12_, e13_, e14_, e15_,
          e16_, e17_, e18_, e19_,
          e20_, e21_, e22_, e23_,
          e24_, e25_, e26_, e27_,
          e28_, e29_, e30_, e31_);
  /// <summary>
  /// Build a vector of 16 packed short elements from the supplied values.
  /// Parameters are listed from element 15 down to element 0 and are passed
  /// to the v256 constructor starting with e0_.
  /// </summary>
  /// <param name="e15_">Element 15</param>
  /// <param name="e14_">Element 14</param>
  /// <param name="e13_">Element 13</param>
  /// <param name="e12_">Element 12</param>
  /// <param name="e11_">Element 11</param>
  /// <param name="e10_">Element 10</param>
  /// <param name="e9_">Element 9</param>
  /// <param name="e8_">Element 8</param>
  /// <param name="e7_">Element 7</param>
  /// <param name="e6_">Element 6</param>
  /// <param name="e5_">Element 5</param>
  /// <param name="e4_">Element 4</param>
  /// <param name="e3_">Element 3</param>
  /// <param name="e2_">Element 2</param>
  /// <param name="e1_">Element 1</param>
  /// <param name="e0_">Element 0</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_epi16(short e15_, short e14_, short e13_, short e12_, short e11_, short e10_, short e9_, short e8_, short e7_, short e6_, short e5_, short e4_, short e3_, short e2_, short e1_, short e0_) =>
      new v256(
          e0_, e1_, e2_, e3_,
          e4_, e5_, e6_, e7_,
          e8_, e9_, e10_, e11_,
          e12_, e13_, e14_, e15_);
  /// <summary>
  /// Build a vector of eight packed int elements from the supplied values.
  /// Parameters are listed from element 7 down to element 0 and are passed
  /// to the v256 constructor starting with e0.
  /// </summary>
  /// <param name="e7">Element 7</param>
  /// <param name="e6">Element 6</param>
  /// <param name="e5">Element 5</param>
  /// <param name="e4">Element 4</param>
  /// <param name="e3">Element 3</param>
  /// <param name="e2">Element 2</param>
  /// <param name="e1">Element 1</param>
  /// <param name="e0">Element 0</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) =>
      new v256(
          e0, e1, e2, e3,
          e4, e5, e6, e7);
  /// <summary>
  /// Build a vector of four packed 64-bit integers from the supplied values.
  /// Parameters are listed from element 3 down to element 0 and are passed
  /// to the v256 constructor starting with e0.
  /// </summary>
  /// <param name="e3">Element 3</param>
  /// <param name="e2">Element 2</param>
  /// <param name="e1">Element 1</param>
  /// <param name="e0">Element 0</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_epi64x(long e3, long e2, long e1, long e0) => new v256(e0, e1, e2, e3);
  /// <summary>
  /// Assemble a 256-bit vector from two 128-bit halves. The halves are
  /// passed to the v256 constructor as (lo, hi).
  /// </summary>
  /// <param name="hi">High half of the vector</param>
  /// <param name="lo">Low half of the vector</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_m128d(v128 hi, v128 lo) => new v256(lo, hi);
  /// <summary>
  /// Assemble a 256-bit vector from two 128-bit halves. The halves are
  /// passed to the v256 constructor as (lo, hi).
  /// </summary>
  /// <param name="hi">High half of the vector</param>
  /// <param name="lo">Low half of the vector</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set_m128i(v128 hi, v128 lo) => new v256(lo, hi);
  /// <summary>
  /// Build a vector of four packed double-precision (64-bit) floating-point
  /// elements from the supplied values in reverse order: the arguments are
  /// handed to the v256 constructor exactly as listed (d first, a last),
  /// which is the opposite of mm256_set_pd.
  /// </summary>
  /// <param name="d">Element d (first constructor argument)</param>
  /// <param name="c">Element c</param>
  /// <param name="b">Element b</param>
  /// <param name="a">Element a (last constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_pd(double d, double c, double b, double a) => new v256(d, c, b, a);
  /// <summary>
  /// Build a vector of eight packed single-precision (32-bit) floating-point
  /// elements from the supplied values in reverse order: the arguments are
  /// handed to the v256 constructor exactly as listed (e7 first, e0 last),
  /// which is the opposite of mm256_set_ps.
  /// </summary>
  /// <param name="e7">Element 7 (first constructor argument)</param>
  /// <param name="e6">Element 6</param>
  /// <param name="e5">Element 5</param>
  /// <param name="e4">Element 4</param>
  /// <param name="e3">Element 3</param>
  /// <param name="e2">Element 2</param>
  /// <param name="e1">Element 1</param>
  /// <param name="e0">Element 0 (last constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) =>
      new v256(
          e7, e6, e5, e4,
          e3, e2, e1, e0);
  /// <summary>
  /// Build a vector of 32 packed byte elements from the supplied values in
  /// reverse order: the arguments are handed to the v256 constructor exactly
  /// as listed (e31_ first, e0_ last), which is the opposite of mm256_set_epi8.
  /// </summary>
  /// <param name="e31_">Element 31 (first constructor argument)</param>
  /// <param name="e30_">Element 30</param>
  /// <param name="e29_">Element 29</param>
  /// <param name="e28_">Element 28</param>
  /// <param name="e27_">Element 27</param>
  /// <param name="e26_">Element 26</param>
  /// <param name="e25_">Element 25</param>
  /// <param name="e24_">Element 24</param>
  /// <param name="e23_">Element 23</param>
  /// <param name="e22_">Element 22</param>
  /// <param name="e21_">Element 21</param>
  /// <param name="e20_">Element 20</param>
  /// <param name="e19_">Element 19</param>
  /// <param name="e18_">Element 18</param>
  /// <param name="e17_">Element 17</param>
  /// <param name="e16_">Element 16</param>
  /// <param name="e15_">Element 15</param>
  /// <param name="e14_">Element 14</param>
  /// <param name="e13_">Element 13</param>
  /// <param name="e12_">Element 12</param>
  /// <param name="e11_">Element 11</param>
  /// <param name="e10_">Element 10</param>
  /// <param name="e9_">Element 9</param>
  /// <param name="e8_">Element 8</param>
  /// <param name="e7_">Element 7</param>
  /// <param name="e6_">Element 6</param>
  /// <param name="e5_">Element 5</param>
  /// <param name="e4_">Element 4</param>
  /// <param name="e3_">Element 3</param>
  /// <param name="e2_">Element 2</param>
  /// <param name="e1_">Element 1</param>
  /// <param name="e0_">Element 0 (last constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_epi8(
      byte e31_, byte e30_, byte e29_, byte e28_, byte e27_, byte e26_, byte e25_, byte e24_,
      byte e23_, byte e22_, byte e21_, byte e20_, byte e19_, byte e18_, byte e17_, byte e16_,
      byte e15_, byte e14_, byte e13_, byte e12_, byte e11_, byte e10_, byte e9_, byte e8_,
      byte e7_, byte e6_, byte e5_, byte e4_, byte e3_, byte e2_, byte e1_, byte e0_) =>
      new v256(
          e31_, e30_, e29_, e28_,
          e27_, e26_, e25_, e24_,
          e23_, e22_, e21_, e20_,
          e19_, e18_, e17_, e16_,
          e15_, e14_, e13_, e12_,
          e11_, e10_, e9_, e8_,
          e7_, e6_, e5_, e4_,
          e3_, e2_, e1_, e0_);
  /// <summary>
  /// Build a vector of 16 packed short elements from the supplied values in
  /// reverse order: the arguments are handed to the v256 constructor exactly
  /// as listed (e15_ first, e0_ last), which is the opposite of mm256_set_epi16.
  /// </summary>
  /// <param name="e15_">Element 15 (first constructor argument)</param>
  /// <param name="e14_">Element 14</param>
  /// <param name="e13_">Element 13</param>
  /// <param name="e12_">Element 12</param>
  /// <param name="e11_">Element 11</param>
  /// <param name="e10_">Element 10</param>
  /// <param name="e9_">Element 9</param>
  /// <param name="e8_">Element 8</param>
  /// <param name="e7_">Element 7</param>
  /// <param name="e6_">Element 6</param>
  /// <param name="e5_">Element 5</param>
  /// <param name="e4_">Element 4</param>
  /// <param name="e3_">Element 3</param>
  /// <param name="e2_">Element 2</param>
  /// <param name="e1_">Element 1</param>
  /// <param name="e0_">Element 0 (last constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_epi16(short e15_, short e14_, short e13_, short e12_, short e11_, short e10_, short e9_, short e8_, short e7_, short e6_, short e5_, short e4_, short e3_, short e2_, short e1_, short e0_) =>
      new v256(
          e15_, e14_, e13_, e12_,
          e11_, e10_, e9_, e8_,
          e7_, e6_, e5_, e4_,
          e3_, e2_, e1_, e0_);
  /// <summary>
  /// Build a vector of eight packed int elements from the supplied values in
  /// reverse order: the arguments are handed to the v256 constructor exactly
  /// as listed (e7 first, e0 last), which is the opposite of mm256_set_epi32.
  /// </summary>
  /// <param name="e7">Element 7 (first constructor argument)</param>
  /// <param name="e6">Element 6</param>
  /// <param name="e5">Element 5</param>
  /// <param name="e4">Element 4</param>
  /// <param name="e3">Element 3</param>
  /// <param name="e2">Element 2</param>
  /// <param name="e1">Element 1</param>
  /// <param name="e0">Element 0 (last constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) =>
      new v256(
          e7, e6, e5, e4,
          e3, e2, e1, e0);
  /// <summary>
  /// Build a vector of four packed 64-bit integers from the supplied values
  /// in reverse order: the arguments are handed to the v256 constructor
  /// exactly as listed (e3 first, e0 last), which is the opposite of
  /// mm256_set_epi64x.
  /// </summary>
  /// <param name="e3">Element 3 (first constructor argument)</param>
  /// <param name="e2">Element 2</param>
  /// <param name="e1">Element 1</param>
  /// <param name="e0">Element 0 (last constructor argument)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_epi64x(long e3, long e2, long e1, long e0) => new v256(e3, e2, e1, e0);
  /// <summary>
  /// Assemble a 256-bit vector from two 128-bit halves in reverse order.
  /// The arguments are passed to the v256 constructor as (hi, lo), i.e. the
  /// opposite of the (lo, hi) order used by mm256_set_m128i. As a result the
  /// parameter named <paramref name="hi"/> occupies the constructor's first
  /// slot and <paramref name="lo"/> its second slot.
  /// </summary>
  /// <param name="hi">First argument; passed as the first v256 constructor argument (the slot mm256_set_m128i uses for its low half)</param>
  /// <param name="lo">Second argument; passed as the second v256 constructor argument (the slot mm256_set_m128i uses for its high half)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_m128(v128 hi, v128 lo) => new v256(hi, lo);
  /// <summary>
  /// Assemble a 256-bit vector from two 128-bit halves in reverse order.
  /// The arguments are passed to the v256 constructor as (hi, lo), i.e. the
  /// opposite of the (lo, hi) order used by mm256_set_m128d. As a result the
  /// parameter named <paramref name="hi"/> occupies the constructor's first
  /// slot and <paramref name="lo"/> its second slot.
  /// </summary>
  /// <param name="hi">First argument; passed as the first v256 constructor argument (the slot mm256_set_m128d uses for its low half)</param>
  /// <param name="lo">Second argument; passed as the second v256 constructor argument (the slot mm256_set_m128d uses for its high half)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_m128d(v128 hi, v128 lo) => new v256(hi, lo);
  /// <summary>
  /// Assemble a 256-bit vector from two 128-bit halves in reverse order.
  /// The arguments are passed to the v256 constructor as (hi, lo), i.e. the
  /// opposite of the (lo, hi) order used by mm256_set_m128i. As a result the
  /// parameter named <paramref name="hi"/> occupies the constructor's first
  /// slot and <paramref name="lo"/> its second slot.
  /// </summary>
  /// <param name="hi">First argument; passed as the first v256 constructor argument (the slot mm256_set_m128i uses for its low half)</param>
  /// <param name="lo">Second argument; passed as the second v256 constructor argument (the slot mm256_set_m128i uses for its high half)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_setr_m128i(v128 hi, v128 lo) => new v256(hi, lo);
  /// <summary>
  /// Broadcast the double-precision (64-bit) floating-point value a to all
  /// elements of the result, via the single-double v256 constructor.
  /// </summary>
  /// <param name="a">Value</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set1_pd(double a) => new v256(a);
  /// <summary>
  /// Broadcast the single-precision (32-bit) floating-point value a to all
  /// elements of the result, via the single-float v256 constructor.
  /// </summary>
  /// <param name="a">Value</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set1_ps(float a) => new v256(a);
  /// <summary>
  /// Broadcast the 8-bit integer a to all elements of the result.
  /// This intrinsic may generate the vpbroadcastb instruction.
  /// </summary>
  /// <param name="a">8-bit integer</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set1_epi8(byte a) => new v256(a);
  /// <summary>
  /// Broadcast the 16-bit integer a to all elements of the result.
  /// This intrinsic may generate the vpbroadcastw instruction.
  /// </summary>
  /// <param name="a">16-bit integer</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set1_epi16(short a) => new v256(a);
  /// <summary>
  /// Broadcast the 32-bit integer a to all elements of the result.
  /// This intrinsic may generate the vpbroadcastd instruction.
  /// </summary>
  /// <param name="a">32-bit integer</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set1_epi32(int a) => new v256(a);
  /// <summary>
  /// Broadcast the 64-bit integer a to all elements of the result.
  /// This intrinsic may generate the vpbroadcastq instruction.
  /// </summary>
  /// <param name="a">64-bit integer</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_set1_epi64x(long a) => new v256(a);
  /// <summary>
  /// Provided for compatibility with C++ code only; v256 is untyped here,
  /// so the input is returned unchanged.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The same vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castpd_ps(v256 a) => a;
  /// <summary>
  /// Provided for compatibility with C++ code only; v256 is untyped here,
  /// so the input is returned unchanged.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The same vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castps_pd(v256 a) => a;
  /// <summary>
  /// Provided for compatibility with C++ code only; v256 is untyped here,
  /// so the input is returned unchanged.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The same vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castps_si256(v256 a) => a;
  /// <summary>
  /// Provided for compatibility with C++ code only; v256 is untyped here,
  /// so the input is returned unchanged.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The same vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castpd_si256(v256 a) => a;
  /// <summary>
  /// Provided for compatibility with C++ code only; v256 is untyped here,
  /// so the input is returned unchanged.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The same vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castsi256_ps(v256 a) => a;
  /// <summary>
  /// Provided for compatibility with C++ code only; v256 is untyped here,
  /// so the input is returned unchanged.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The same vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castsi256_pd(v256 a) => a;
  /// <summary>
  /// Provided for compatibility with C++ code only. Returns the Lo128
  /// half of the 256-bit input.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The low 128-bit half of a</returns>
  [DebuggerStepThrough]
  public static v128 mm256_castps256_ps128(v256 a) => a.Lo128;
  /// <summary>
  /// Provided for compatibility with C++ code only. Returns the Lo128
  /// half of the 256-bit input.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The low 128-bit half of a</returns>
  [DebuggerStepThrough]
  public static v128 mm256_castpd256_pd128(v256 a) => a.Lo128;
  /// <summary>
  /// Provided for compatibility with C++ code only. Returns the Lo128
  /// half of the 256-bit input.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>The low 128-bit half of a</returns>
  [DebuggerStepThrough]
  public static v128 mm256_castsi256_si128(v256 a) => a.Lo128;
  /// <summary>
  /// Provided for compatibility with C++ code only. Widens a 128-bit vector
  /// to 256 bits: a is the first v256 constructor argument and
  /// Sse.setzero_ps() is the second.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>256-bit vector built from a and a zero vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castps128_ps256(v128 a) => new v256(a, Sse.setzero_ps());
  /// <summary>
  /// Provided for compatibility with C++ code only. Widens a 128-bit vector
  /// to 256 bits: a is the first v256 constructor argument and
  /// Sse.setzero_ps() is the second.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>256-bit vector built from a and a zero vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castpd128_pd256(v128 a) => new v256(a, Sse.setzero_ps());
  /// <summary>
  /// Provided for compatibility with C++ code only. Widens a 128-bit vector
  /// to 256 bits: a is the first v256 constructor argument and
  /// Sse.setzero_ps() is the second.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>256-bit vector built from a and a zero vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_castsi128_si256(v128 a) => new v256(a, Sse.setzero_ps());
  /// <summary>
  /// Return a 128-bit vector whose contents callers must treat as undefined.
  /// This implementation returns the default v128 value.
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v128 undefined_ps() => default;
  /// <summary>
  /// Return a 128-bit vector whose contents callers must treat as undefined.
  /// Forwards to undefined_ps().
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v128 undefined_pd() => undefined_ps();
  /// <summary>
  /// Return a 128-bit vector whose contents callers must treat as undefined.
  /// Forwards to undefined_ps().
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v128 undefined_si128() => undefined_ps();
  /// <summary>
  /// Return a 256-bit vector whose contents callers must treat as undefined.
  /// This implementation returns the default v256 value.
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_undefined_ps() => default;
  /// <summary>
  /// Return a 256-bit vector whose contents callers must treat as undefined.
  /// Forwards to mm256_undefined_ps().
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_undefined_pd() => mm256_undefined_ps();
  /// <summary>
  /// Return a 256-bit vector whose contents callers must treat as undefined.
  /// Forwards to mm256_undefined_ps().
  /// </summary>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_undefined_si256() => mm256_undefined_ps();
  3170. // Zero-extended cast functions
  /// <summary>
  /// Zero-extending cast from v128 to v256: a becomes the low 128 bits and
  /// the upper 128 bits of the result are zeroed. This intrinsic is only
  /// used for compilation and does not generate any instructions, thus it
  /// has zero latency.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_zextps128_ps256(v128 a)
  {
      v128 zeroUpper = Sse.setzero_ps();
      return new v256(a, zeroUpper);
  }
  /// <summary>
  /// Zero-extending cast from v128 to v256: the upper 128 bits of the result
  /// are zeroed. Forwards to mm256_zextps128_ps256. This intrinsic is only
  /// used for compilation and does not generate any instructions, thus it
  /// has zero latency.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_zextpd128_pd256(v128 a) => mm256_zextps128_ps256(a);
  /// <summary>
  /// Zero-extending cast from v128 to v256: the upper 128 bits of the result
  /// are zeroed. Forwards to mm256_zextps128_ps256. This intrinsic is only
  /// used for compilation and does not generate any instructions, thus it
  /// has zero latency.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  [BurstTargetCpu(BurstTargetCpu.AVX)]
  public static v256 mm256_zextsi128_si256(v128 a) => mm256_zextps128_ps256(a);
  /// <summary>
  /// Return a copy of a in which the 8-bit element selected by index (which
  /// must be a constant) is replaced with i truncated to a byte.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="i">8-bit integer i</param>
  /// <param name="index">Location; only the low 5 bits are used (index AND 31)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insert_epi8(v256 a, int i, int index)
  {
      // 'a' is passed by value, so writing into it only affects the returned copy.
      (&a.Byte0)[index & 31] = (byte)i;
      return a;
  }
  /// <summary>
  /// Return a copy of a in which the 16-bit element selected by index (which
  /// must be a constant) is replaced with i truncated to a short.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="i">16-bit integer i</param>
  /// <param name="index">Location; only the low 4 bits are used (index AND 15)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insert_epi16(v256 a, int i, int index)
  {
      // 'a' is passed by value, so writing into it only affects the returned copy.
      (&a.SShort0)[index & 15] = (short)i;
      return a;
  }
  /// <summary>
  /// Return a copy of a in which the 32-bit element selected by index (which
  /// must be a constant) is replaced with i.
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="i">32-bit integer i</param>
  /// <param name="index">Location; only the low 3 bits are used (index AND 7)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insert_epi32(v256 a, int i, int index)
  {
      // 'a' is passed by value, so writing into it only affects the returned copy.
      (&a.SInt0)[index & 7] = i;
      return a;
  }
  /// <summary>
  /// Return a copy of a in which the 64-bit element selected by index (which
  /// must be a constant) is replaced with i.
  /// </summary>
  /// <remarks>
  /// This intrinsic requires a 64-bit processor.
  /// </remarks>
  /// <param name="a">Vector a</param>
  /// <param name="i">64-bit integer i</param>
  /// <param name="index">Location; only the low 2 bits are used (index AND 3)</param>
  /// <returns>Vector</returns>
  [DebuggerStepThrough]
  public static v256 mm256_insert_epi64(v256 a, long i, int index)
  {
      // 'a' is passed by value, so writing into it only affects the returned copy.
      (&a.SLong0)[index & 3] = i;
      return a;
  }
  /// <summary>
  /// Read the 32-bit integer element of a selected by index (which must be
  /// a constant).
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="index">Index; only the low 3 bits are used (index AND 7)</param>
  /// <returns>32-bit integer</returns>
  [DebuggerStepThrough]
  public static int mm256_extract_epi32(v256 a, int index)
  {
      int* lanes = &a.SInt0;
      int lane = index & 7;
      return lanes[lane];
  }
  /// <summary>
  /// Read the 64-bit integer element of a selected by index (which must be
  /// a constant).
  /// </summary>
  /// <param name="a">Vector a</param>
  /// <param name="index">Index; only the low 2 bits are used (index AND 3)</param>
  /// <returns>64-bit integer</returns>
  [DebuggerStepThrough]
  public static long mm256_extract_epi64(v256 a, int index)
  {
      long* lanes = &a.SLong0;
      int lane = index & 3;
      return lanes[lane];
  }
  3285. }
  3286. }
  3287. }