Нема описа
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707
  1. using System;
  2. using Unity.Collections.LowLevel.Unsafe;
  3. namespace Unity.Collections
  4. {
  /// <summary>
  /// Identifies why a format operation failed.
  /// </summary>
  public enum FormatError
  {
      /// <summary>
      /// The format operation succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The target storage does not have sufficient capacity.
      /// The write failed as a whole; the output was not truncated.
      /// </summary>
      Overflow = 1,

      /// <summary>
      /// The source format specifier is itself malformed, or format specifier
      /// tokens were found outside of accepted usage.
      /// The write failed.
      /// </summary>
      BadFormatSpecifier = 2,
  }
  /// <summary>
  /// Identifies why a parse operation failed.
  /// </summary>
  public enum ParseError
  {
      /// <summary>
      /// The parse succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The parsed text does not form a number.
      /// </summary>
      Syntax = 1,

      /// <summary>
      /// The number exceeds the range of the target type.
      /// The number was either truncated, or was not written at all.
      /// </summary>
      Overflow = 2,

      /// <summary>
      /// The number exceeds the precision of the target type.
      /// </summary>
      Underflow = 3,
  }
  /// <summary>
  /// Identifies why a copy operation failed.
  /// </summary>
  public enum CopyError
  {
      /// <summary>
      /// The copy succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The target storage does not have sufficient capacity.
      /// Unless the API comment states otherwise, assume the write was partially applied.
      /// </summary>
      Truncation = 1,
  }
  /// <summary>
  /// Identifies why a conversion between encodings failed.
  /// </summary>
  public enum ConversionError
  {
      /// <summary>
      /// The conversion succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The target storage does not have sufficient capacity.
      /// For copy operations, the value was either truncated into the target storage,
      /// or was not written at all.
      /// </summary>
      Overflow = 1,

      /// <summary>
      /// The bytes do not form a valid character.
      /// </summary>
      Encoding = 2,

      /// <summary>
      /// The rune is not a valid code point.
      /// </summary>
      CodePoint = 3,
  }
  87. /// <summary>
  88. /// Provides utility methods for UTF-8, UTF-16, UCS-4 (a.k.a. UTF-32), and WTF-8.
  89. /// </summary>
  90. [GenerateTestsForBurstCompatibility]
  91. public unsafe struct Unicode
  92. {
  /// <summary>
  /// A Unicode character represented by its code point.
  /// </summary>
  [GenerateTestsForBurstCompatibility]
  public struct Rune
  {
      /// <summary>
      /// The code point.
      /// </summary>
      /// <value>The code point.</value>
      public int value;

      /// <summary>
      /// Initializes and returns an instance of Rune.
      /// </summary>
      /// <remarks>No validation is performed; you are responsible for the code point being valid.</remarks>
      /// <param name="codepoint">The code point.</param>
      public Rune(int codepoint)
      {
          value = codepoint;
      }

      /// <summary>
      /// Converts a char to a rune holding the same numeric value.
      /// </summary>
      /// <remarks>Because a char is 16-bit, it can only represent the first 2^16 code points, not all 1.1 million.</remarks>
      /// <param name="codepoint">A code point.</param>
      /// <returns>A rune.</returns>
      public static implicit operator Rune(char codepoint) => new Rune(codepoint);

      /// <summary>
      /// Evaluates whether two runes hold the same code point.
      /// </summary>
      /// <param name="lhs">The left-hand side</param>
      /// <param name="rhs">The right-hand side</param>
      /// <returns>True if the left-hand side's value is equal to the right-hand side's value.</returns>
      public static bool operator ==(Rune lhs, Rune rhs) => lhs.value == rhs.value;

      /// <summary>
      /// Evaluates whether two runes hold different code points.
      /// </summary>
      /// <param name="lhs">The left-hand side</param>
      /// <param name="rhs">The right-hand side</param>
      /// <returns>True if the left-hand side's value is not equal to the right-hand side's value.</returns>
      public static bool operator !=(Rune lhs, Rune rhs) => lhs.value != rhs.value;

      /// <summary>
      /// Returns true if the value stored in this Rune is equal to an object.
      /// </summary>
      /// <remarks>Can only be equal if the object is itself a Rune.</remarks>
      /// <param name="obj">An object to compare with.</param>
      /// <returns>True if the object is a Rune holding the same value as this Rune.</returns>
      [ExcludeFromBurstCompatTesting("Takes managed object")]
      public override bool Equals(object obj)
      {
          return obj is Rune other && other.value == value;
      }

      /// <summary>
      /// A hash used for comparisons.
      /// </summary>
      /// <returns>The code point itself, which serves as the hash code.</returns>
      public override int GetHashCode() => value;

      /// <summary>
      /// Returns true if a rune is a numerical digit character.
      /// </summary>
      /// <param name="r">The rune.</param>
      /// <returns>True if the rune is one of the characters '0' through '9'.</returns>
      public static bool IsDigit(Rune r) => r.IsDigit();

      // True when the value is below 0x80.
      internal bool IsAscii() => value < 0x80;

      // True when the value is below 0x100.
      internal bool IsLatin1() => value < 0x100;

      // True when the value is one of the characters '0' through '9'.
      internal bool IsDigit() => '0' <= value && value <= '9';

      // True when the value is one of the whitespace code points enumerated below.
      internal bool IsWhiteSpace()
      {
          // Code point list taken from https://en.wikipedia.org/wiki/Whitespace_character#Unicode

          // CHARACTER TABULATION (U+0009), LINE FEED (U+000A), LINE TABULATION (U+000B),
          // FORM FEED (U+000C), CARRIAGE RETURN (U+000D)
          if (value >= 0x9 && value <= 0xD)
              return true;

          // EN QUAD (U+2000), EM QUAD (U+2001), EN SPACE (U+2002), EM SPACE (U+2003),
          // THREE-PER-EM SPACE (U+2004), FOUR-PER-EM SPACE (U+2005), SIX-PER-EM SPACE (U+2006),
          // FIGURE SPACE (U+2007), PUNCTUATION SPACE (U+2008), THIN SPACE (U+2009), HAIR SPACE (U+200A)
          if (value >= 0x2000 && value <= 0x200A)
              return true;

          switch (value)
          {
              case 0x20:   // SPACE
              case 0x85:   // NEXT LINE
              case 0xA0:   // NO-BREAK SPACE
              case 0x1680: // OGHAM SPACE MARK
              case 0x2028: // LINE SEPARATOR
              case 0x2029: // PARAGRAPH SEPARATOR
              case 0x202F: // NARROW NO-BREAK SPACE
              case 0x205F: // MEDIUM MATHEMATICAL SPACE
              case 0x3000: // IDEOGRAPHIC SPACE
                  return true;
              default:
                  return false;
          }
      }

      // Maps 'A'..'Z' to 'a'..'z'; every other value is returned unchanged.
      internal Rune ToLowerAscii()
      {
          bool isUpper = value >= 'A' && value <= 'Z';
          return new Rune(isUpper ? value + 0x20 : value);
      }

      // Maps 'a'..'z' to 'A'..'Z'; every other value is returned unchanged.
      internal Rune ToUpperAscii()
      {
          bool isLower = value >= 'a' && value <= 'z';
          return new Rune(isLower ? value - 0x20 : value);
      }

      /// <summary>
      /// Returns the number of bytes required to encode this rune as UTF-8.
      /// </summary>
      /// <returns>The number of bytes required to encode this rune as UTF-8. If the rune's codepoint
      /// is invalid, returns 4 (the maximum possible encoding length).</returns>
      public int LengthInUtf8Bytes()
      {
          // Negative values and everything above the 3-byte range report the maximum size.
          if (value < 0 || value > 0xFFFF)
              return 4;

          return value <= 0x7F ? 1
              : value <= 0x7FF ? 2
              : 3;
      }
  }
  /// <summary>The maximum value of a valid UNICODE code point</summary>
  public const int kMaximumValidCodePoint = 0x10FFFF;

  /// <summary>
  /// Returns true if a code point is valid.
  /// </summary>
  /// <remarks>
  /// Only the numeric range is checked: the value must lie between 0 and
  /// <see cref="kMaximumValidCodePoint"/> inclusive. Surrogate values (0xD800 through 0xDFFF)
  /// are not rejected here.
  /// </remarks>
  /// <param name="codepoint">A code point.</param>
  /// <returns>True if a code point is valid.</returns>
  public static bool IsValidCodePoint(int codepoint)
  {
      return codepoint >= 0 && codepoint <= kMaximumValidCodePoint;
  }
  /// <summary>
  /// Returns true if the byte is not a UTF-8 trailing (continuation) byte.
  /// </summary>
  /// <remarks>A trailing byte has the bit pattern 10xxxxxx; any other pattern makes this return true.</remarks>
  /// <param name="b">The byte.</param>
  /// <returns>True if the top two bits of the byte are anything other than binary 10.</returns>
  public static bool NotTrailer(byte b)
  {
      const int topTwoBits = 0xC0;
      const int trailerTag = 0x80;
      int tag = b & topTwoBits;
      return tag != trailerTag;
  }
  /// <summary>
  /// The Unicode replacement character (code point 0xFFFD).
  /// </summary>
  /// <remarks>This character is used to stand in for characters that can't be rendered.</remarks>
  /// <value>A rune whose value is 0xFFFD.</value>
  public static Rune ReplacementCharacter
  {
      get { return new Rune(0xFFFD); }
  }
  /// <summary>
  /// The null rune value.
  /// </summary>
  /// <remarks>In this package, the "bad rune" is used as a null character.</remarks>
  /// <value>A rune whose value is 0.</value>
  public static Rune BadRune
  {
      get { return new Rune(0); }
  }
  /// <summary>
  /// Reads a UTF-8 encoded character from a buffer.
  /// </summary>
  /// <remarks>
  /// A sequence is rejected with <see cref="ConversionError.Encoding"/> when the lead byte matches no
  /// 1-4 byte pattern, when a following byte is not a trailer, when the decoded value is smaller than
  /// the minimum for its length (overlong form), or when it fails <see cref="IsValidCodePoint"/>.
  /// </remarks>
  /// <param name="rune">Outputs the character read. If the read fails, outputs <see cref="ReplacementCharacter"/>.</param>
  /// <param name="buffer">The buffer of bytes to read.</param>
  /// <param name="index">Reference to a byte index into the buffer. If the read succeeds, index is incremented by the
  /// size in bytes of the character read. If index is already at or past capacity, index is left unchanged;
  /// on every other failure index is incremented by 1.</param>
  /// <param name="capacity">The size in bytes of the buffer. Used to check that the read is in bounds.</param>
  /// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns <see cref="ConversionError.Overflow"/> or <see cref="ConversionError.Encoding"/>.</returns>
  public static ConversionError Utf8ToUcs(out Rune rune, byte* buffer, ref int index, int capacity)
  {
      rune = ReplacementCharacter;

      // Nothing left to read: report overflow without moving the index.
      if (index + 1 > capacity)
          return ConversionError.Overflow;

      byte lead = buffer[index];

      // 0xxxxxxx: single byte, the value is the byte itself.
      if ((lead & 0b10000000) == 0)
      {
          rune.value = lead;
          index += 1;
          return ConversionError.None;
      }

      int length;   // total bytes in the sequence
      int code;     // payload bits accumulated so far
      int minimum;  // smallest value that legitimately needs this many bytes

      if ((lead & 0b11100000) == 0b11000000)        // 110xxxxx: 2 bytes
      {
          length = 2;
          code = lead & 0b00011111;
          minimum = 1 << 7;
      }
      else if ((lead & 0b11110000) == 0b11100000)   // 1110xxxx: 3 bytes
      {
          length = 3;
          code = lead & 0b00001111;
          minimum = 1 << 11;
      }
      else if ((lead & 0b11111000) == 0b11110000)   // 11110xxx: 4 bytes
      {
          length = 4;
          code = lead & 0b00000111;
          minimum = 1 << 16;
      }
      else
      {
          // Stray trailer or unsupported lead byte.
          index += 1;
          return ConversionError.Encoding;
      }

      // The whole sequence must fit before any following byte is touched.
      if (index + length > capacity)
      {
          index += 1;
          return ConversionError.Overflow;
      }

      bool allTrailers = true;
      for (int i = 1; i < length; ++i)
      {
          byte next = buffer[index + i];
          if (NotTrailer(next))
              allTrailers = false;
          code = (code << 6) | (next & 0b00111111);
      }

      if (code < minimum || !IsValidCodePoint(code) || !allTrailers)
      {
          index += 1;
          return ConversionError.Encoding;
      }

      rune.value = code;
      index += length;
      return ConversionError.None;
  }
  // Steps index backwards, one byte at a time, until it lands on a byte that is not a
  // UTF-8 trailer (10xxxxxx), and returns that position. The index is always decremented
  // before the first byte is inspected. If index is zero or negative when checked, the
  // method returns 0 and leaves index as it is.
  static int FindUtf8CharStartInReverse(byte* ptr, ref int index)
  {
      while (index > 0)
      {
          index -= 1;

          bool isTrailer = (ptr[index] & 0xC0) == 0x80;
          if (!isTrailer)
              return index;
      }

      return 0;
  }
  // Moves index backwards to the start of a preceding UTF-8 character and decodes the
  // character found there. On return index holds that start position. If the search ends
  // where it began (which happens when index was 0), outputs ReplacementCharacter and
  // returns Overflow.
  // NOTE(review): index is decremented once here and once more inside
  // FindUtf8CharStartInReverse before any byte is inspected - confirm callers rely on this.
  internal static ConversionError Utf8ToUcsReverse(out Rune rune, byte* buffer, ref int index, int capacity)
  {
      int startedAt = index;

      index -= 1;
      index = FindUtf8CharStartInReverse(buffer, ref index);

      if (index != startedAt)
      {
          // Decode through a scratch copy so the forward read does not advance the caller's index.
          int scratch = index;
          return Utf8ToUcs(out rune, buffer, ref scratch, capacity);
      }

      rune = ReplacementCharacter;
      return ConversionError.Overflow;
  }
  /// <summary>
  /// Returns true if a char is a Unicode leading surrogate.
  /// </summary>
  /// <param name="c">The char.</param>
  /// <returns>True if the char lies in the range 0xD800 through 0xDBFF.</returns>
  static bool IsLeadingSurrogate(char c)
  {
      // Every value in 0xD800..0xDBFF shares the same top six bits (110110).
      return (c & 0xFC00) == 0xD800;
  }
  /// <summary>
  /// Returns true if a char is a Unicode trailing surrogate.
  /// </summary>
  /// <param name="c">The char.</param>
  /// <returns>True if the char lies in the range 0xDC00 through 0xDFFF.</returns>
  static bool IsTrailingSurrogate(char c)
  {
      // Every value in 0xDC00..0xDFFF shares the same top six bits (110111).
      return (c & 0xFC00) == 0xDC00;
  }
  /// <summary>
  /// Reads a UTF-16 encoded character from a buffer.
  /// </summary>
  /// <remarks>
  /// A leading surrogate that is the last char in the buffer, or that is not followed by a trailing
  /// surrogate, is output as-is as a single-char rune.
  /// </remarks>
  /// <param name="rune">Outputs the character read. If the read fails, outputs <see cref="ReplacementCharacter"/>.</param>
  /// <param name="buffer">The buffer of chars to read.</param>
  /// <param name="index">Reference to a char index into the buffer. If the read succeeds, index is incremented by the
  /// size in chars of the character read. If the read fails, index is not incremented.</param>
  /// <param name="capacity">The size in chars of the buffer. Used to check that the read is in bounds.</param>
  /// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf16ToUcs(out Rune rune, char* buffer, ref int index, int capacity)
  {
      rune = ReplacementCharacter;

      if (index + 1 > capacity)
          return ConversionError.Overflow;

      char first = buffer[index];
      bool secondInBounds = !(index + 2 > capacity);

      if (IsLeadingSurrogate(first) && secondInBounds)
      {
          char second = buffer[index + 1];
          if (IsTrailingSurrogate(second))
          {
              // Combine the two 10-bit payloads and rebase above the 16-bit range.
              int high = first & 0x03FF;
              int low = second & 0x03FF;
              rune.value = ((high << 10) | low) + 0x10000;
              index += 2;
              return ConversionError.None;
          }
      }

      // Ordinary char, or a surrogate that cannot be paired: pass it through unchanged.
      rune.value = first;
      index += 1;
      return ConversionError.None;
  }
  // Reads one rune from a buffer of runes and advances index by 1. If index is already at
  // or past capacity, outputs ReplacementCharacter, leaves index unchanged and returns Overflow.
  internal static ConversionError UcsToUcs(out Rune rune, Rune* buffer, ref int index, int capacity)
  {
      if (index + 1 > capacity)
      {
          rune = ReplacementCharacter;
          return ConversionError.Overflow;
      }

      rune = buffer[index];
      index += 1;
      return ConversionError.None;
  }
  /// <summary>
  /// Writes a rune to a buffer as a UTF-8 encoded character.
  /// </summary>
  /// <param name="buffer">The buffer to write to.</param>
  /// <param name="index">Reference to a byte index into the buffer. If the write succeeds, index is incremented by the
  /// size in bytes of the character written. If the write fails, index is not incremented.</param>
  /// <param name="capacity">The size in bytes of the buffer. Used to check that the write is in bounds.</param>
  /// <param name="rune">The rune to encode.</param>
  /// <returns><see cref="ConversionError.None"/> if the write succeeds. Otherwise, returns <see cref="ConversionError.CodePoint"/>, <see cref="ConversionError.Overflow"/>, or <see cref="ConversionError.Encoding"/>.</returns>
  public static ConversionError UcsToUtf8(byte* buffer, ref int index, int capacity, Rune rune)
  {
      int cp = rune.value;

      if (!IsValidCodePoint(cp))
          return ConversionError.CodePoint;

      if (index + 1 > capacity)
          return ConversionError.Overflow;

      // Number of bytes this code point occupies in UTF-8.
      int needed;
      if (cp <= 0x7F)
          needed = 1;
      else if (cp <= 0x7FF)
          needed = 2;
      else if (cp <= 0xFFFF)
          needed = 3;
      else if (cp <= 0x1FFFFF)
          needed = 4;
      else
          return ConversionError.Encoding;

      // Nothing is written unless the whole sequence fits.
      if (index + needed > capacity)
          return ConversionError.Overflow;

      switch (needed)
      {
          case 1:
              buffer[index] = (byte)cp;
              break;
          case 2:
              buffer[index + 0] = (byte)(0xC0 | (cp >> 6));
              buffer[index + 1] = (byte)(0x80 | (cp & 0x3F));
              break;
          case 3:
              buffer[index + 0] = (byte)(0xE0 | (cp >> 12));
              buffer[index + 1] = (byte)(0x80 | ((cp >> 6) & 0x3F));
              buffer[index + 2] = (byte)(0x80 | (cp & 0x3F));
              break;
          default:
              buffer[index + 0] = (byte)(0xF0 | (cp >> 18));
              buffer[index + 1] = (byte)(0x80 | ((cp >> 12) & 0x3F));
              buffer[index + 2] = (byte)(0x80 | ((cp >> 6) & 0x3F));
              buffer[index + 3] = (byte)(0x80 | (cp & 0x3F));
              break;
      }

      index += needed;
      return ConversionError.None;
  }
  /// <summary>
  /// Writes a rune to a buffer as a UTF-16 encoded character.
  /// </summary>
  /// <param name="buffer">The buffer of chars to write to.</param>
  /// <param name="index">Reference to a char index into the buffer. If the write succeeds, index is incremented by the
  /// size in chars of the character written. If the write fails, index is not incremented.</param>
  /// <param name="capacity">The size in chars of the buffer. Used to check that the write is in bounds.</param>
  /// <param name="rune">The rune to encode.</param>
  /// <returns><see cref="ConversionError.None"/> if the write succeeds. Otherwise, returns <see cref="ConversionError.CodePoint"/>, <see cref="ConversionError.Overflow"/>, or <see cref="ConversionError.Encoding"/>.</returns>
  public static ConversionError UcsToUtf16(char* buffer, ref int index, int capacity, Rune rune)
  {
      if (!IsValidCodePoint(rune.value))
          return ConversionError.CodePoint;

      if (index + 1 > capacity)
          return ConversionError.Overflow;

      // Values below 0x10000 fit in a single char.
      if (rune.value < 0x10000)
      {
          buffer[index] = (char)rune.value;
          index += 1;
          return ConversionError.None;
      }

      // Everything else needs a surrogate pair.
      if (index + 2 > capacity)
          return ConversionError.Overflow;

      int offset = rune.value - 0x10000;
      if (offset >= (1 << 20))
          return ConversionError.Encoding;

      char leading = (char)(0xD800 | (offset >> 10));
      char trailing = (char)(0xDC00 | (offset & 0x3FF));
      buffer[index + 0] = leading;
      buffer[index + 1] = trailing;
      index += 2;
      return ConversionError.None;
  }
  /// <summary>
  /// Copies UTF-16 characters from one buffer to another buffer as UTF-8.
  /// </summary>
  /// <remarks>Assumes the source data is valid UTF-16.</remarks>
  /// <param name="utf16Buffer">The source buffer.</param>
  /// <param name="utf16Length">The number of chars to read from the source.</param>
  /// <param name="utf8Buffer">The destination buffer.</param>
  /// <param name="utf8Length">Outputs the number of bytes written to the destination.</param>
  /// <param name="utf8Capacity">The size in bytes of the destination buffer.</param>
  /// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf16ToUtf8(char* utf16Buffer, int utf16Length, byte* utf8Buffer, out int utf8Length, int utf8Capacity)
  {
      utf8Length = 0;

      int readPos = 0;
      while (readPos < utf16Length)
      {
          // Decode one character, then re-encode it; only a destination overflow stops the copy.
          Utf16ToUcs(out Rune decoded, utf16Buffer, ref readPos, utf16Length);

          ConversionError written = UcsToUtf8(utf8Buffer, ref utf8Length, utf8Capacity, decoded);
          if (written == ConversionError.Overflow)
              return ConversionError.Overflow;
      }

      return ConversionError.None;
  }
  /// <summary>
  /// Copies UTF-8 characters from one buffer to another.
  /// </summary>
  /// <remarks>
  /// Assumes the source data is valid UTF-8. When the destination can hold the whole source the bytes
  /// are copied in one block; otherwise characters are copied one at a time until the next one no longer fits.
  /// </remarks>
  /// <param name="srcBuffer">The source buffer.</param>
  /// <param name="srcLength">The number of bytes to read from the source.</param>
  /// <param name="destBuffer">The destination buffer.</param>
  /// <param name="destLength">Outputs the number of bytes written to the destination.</param>
  /// <param name="destCapacity">The size in bytes of the destination buffer.</param>
  /// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf8ToUtf8(byte* srcBuffer, int srcLength, byte* destBuffer, out int destLength, int destCapacity)
  {
      bool everythingFits = destCapacity >= srcLength;
      if (everythingFits)
      {
          UnsafeUtility.MemCpy(destBuffer, srcBuffer, srcLength);
          destLength = srcLength;
          return ConversionError.None;
      }

      // TODO even in this case, it's possible to MemCpy all but the last 3 bytes that fit, and then by looking at only
      // TODO the high bits of the last 3 bytes that fit, decide how many of the 3 to append. but that requires a
      // TODO little UNICODE presence of mind that nobody has today.
      destLength = 0;

      int readPos = 0;
      while (readPos < srcLength)
      {
          Utf8ToUcs(out Rune decoded, srcBuffer, ref readPos, srcLength);

          ConversionError written = UcsToUtf8(destBuffer, ref destLength, destCapacity, decoded);
          if (written == ConversionError.Overflow)
              return ConversionError.Overflow;
      }

      return ConversionError.None;
  }
  /// <summary>
  /// Copies UTF-8 characters from one buffer to another as UTF-16.
  /// </summary>
  /// <remarks>Assumes the source data is valid UTF-8.</remarks>
  /// <param name="utf8Buffer">The source buffer.</param>
  /// <param name="utf8Length">The number of bytes to read from the source.</param>
  /// <param name="utf16Buffer">The destination buffer.</param>
  /// <param name="utf16Length">Outputs the number of chars written to the destination.</param>
  /// <param name="utf16Capacity">The size in chars of the destination buffer.</param>
  /// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf8ToUtf16(byte* utf8Buffer, int utf8Length, char* utf16Buffer, out int utf16Length, int utf16Capacity)
  {
      utf16Length = 0;

      int readPos = 0;
      while (readPos < utf8Length)
      {
          // Decode one character, then re-encode it; only a destination overflow stops the copy.
          Utf8ToUcs(out Rune decoded, utf8Buffer, ref readPos, utf8Length);

          ConversionError written = UcsToUtf16(utf16Buffer, ref utf16Length, utf16Capacity, decoded);
          if (written == ConversionError.Overflow)
              return ConversionError.Overflow;
      }

      return ConversionError.None;
  }
  // Counts the bytes in the first utf8Length bytes of utf8Buffer that are not UTF-8 trailer
  // bytes (10xxxxxx), i.e. one per character start. Counting stops as soon as maxRunes is reached.
  static int CountRunes(byte* utf8Buffer, int utf8Length, int maxRunes = int.MaxValue)
  {
      int found = 0;
      int offset = 0;

      while (found < maxRunes && offset < utf8Length)
      {
          bool isTrailer = (utf8Buffer[offset] & 0xC0) == 0x80;
          if (!isTrailer)
              found += 1;

          offset += 1;
      }

      return found;
  }
  630. }
  631. }