暫無描述
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Unicode.cs 21KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. using System;
  2. using System.Diagnostics;
  3. using UnityEngine.Assertions;
  4. using Unity.Collections.LowLevel.Unsafe;
  5. namespace Unity.Collections
  6. {
  /// <summary>
  /// The possible outcomes of a format operation.
  /// </summary>
  public enum FormatError
  {
      /// <summary>
      /// Formatting succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The target storage is too small to hold the formatted result.
      /// </summary>
      Overflow = 1,
  }
  /// <summary>
  /// The possible outcomes of a parse operation.
  /// </summary>
  public enum ParseError
  {
      /// <summary>
      /// Parsing succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The parsed text does not form a number.
      /// </summary>
      Syntax = 1,

      /// <summary>
      /// The number lies outside the range of the target type.
      /// </summary>
      Overflow = 2,

      /// <summary>
      /// The number needs more precision than the target type offers.
      /// </summary>
      Underflow = 3,
  }
  /// <summary>
  /// The possible outcomes of a copy operation.
  /// </summary>
  public enum CopyError
  {
      /// <summary>
      /// The copy succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The target storage is too small to hold everything that was to be copied.
      /// </summary>
      Truncation = 1,
  }
  /// <summary>
  /// The possible outcomes of a conversion between text encodings.
  /// </summary>
  public enum ConversionError
  {
      /// <summary>
      /// The conversion succeeded.
      /// </summary>
      None = 0,

      /// <summary>
      /// The target storage is too small to hold the result.
      /// </summary>
      Overflow = 1,

      /// <summary>
      /// The bytes read do not form a valid character.
      /// </summary>
      Encoding = 2,

      /// <summary>
      /// The rune does not hold a valid code point.
      /// </summary>
      CodePoint = 3,
  }
  79. /// <summary>
  80. /// Provides utility methods for UTF-8, UTF-16, UCS-4 (a.k.a. UTF-32), and WTF-8.
  81. /// </summary>
  82. [BurstCompatible]
  83. public unsafe struct Unicode
  84. {
  /// <summary>
  /// A single Unicode character, stored as its integer code point.
  /// </summary>
  [BurstCompatible]
  public struct Rune
  {
      /// <summary>
      /// The code point.
      /// </summary>
      /// <value>The code point.</value>
      public int value;

      /// <summary>
      /// Creates a rune that holds the given code point.
      /// </summary>
      /// <remarks>The value is stored as-is without validation; the caller must supply a valid code point.</remarks>
      /// <param name="codepoint">The code point.</param>
      public Rune(int codepoint)
      {
          value = codepoint;
      }

      /// <summary>
      /// Converts a char to a rune holding the char's numeric value.
      /// </summary>
      /// <remarks>A char is 16 bits wide, so only the first 2^16 code points can be produced this way.</remarks>
      /// <param name="codepoint">The char to convert.</param>
      /// <returns>A rune whose value equals the char's numeric value.</returns>
      public static explicit operator Rune(char codepoint) => new Rune(codepoint);

      /// <summary>
      /// Tests whether a rune is one of the ASCII digits '0' through '9'.
      /// </summary>
      /// <param name="r">The rune to test.</param>
      /// <returns>True if the rune's value lies between '0' and '9' inclusive.</returns>
      public static bool IsDigit(Rune r) => '0' <= r.value && r.value <= '9';

      /// <summary>
      /// Returns the number of bytes needed to encode this rune as UTF-8.
      /// </summary>
      /// <returns>1, 2 or 3 for values up to 0x7F, 0x7FF and 0xFFFF respectively; 4 for every other value,
      /// including negative or otherwise invalid ones (4 being the maximum encoding length).</returns>
      public int LengthInUtf8Bytes()
      {
          if (value >= 0)
          {
              if (value <= 0x7F)
                  return 1;
              if (value <= 0x7FF)
                  return 2;
              if (value <= 0xFFFF)
                  return 3;
          }

          // Supplementary-plane values and invalid values (negative or too large) all report the maximum size.
          return 4;
      }
  }
  /// <summary>The maximum value of a valid UNICODE code point</summary>
  public const int kMaximumValidCodePoint = 0x10FFFF;

  /// <summary>
  /// Tests whether an integer lies within the range of Unicode code points.
  /// </summary>
  /// <remarks>Only the range 0 through <see cref="kMaximumValidCodePoint"/> is checked.
  /// Surrogate values (0xD800 through 0xDFFF) are not rejected.</remarks>
  /// <param name="codepoint">The value to test.</param>
  /// <returns>True if the value is non-negative and no greater than <see cref="kMaximumValidCodePoint"/>.</returns>
  public static bool IsValidCodePoint(int codepoint)
  {
      return codepoint >= 0 && codepoint <= kMaximumValidCodePoint;
  }
  /// <summary>
  /// Returns true if the byte is not a UTF-8 continuation (trailing) byte.
  /// </summary>
  /// <remarks>A continuation byte has the bit pattern 10xxxxxx. Every other byte makes this method return true.</remarks>
  /// <param name="b">The byte to test.</param>
  /// <returns>True if the top two bits of the byte are anything other than 10.</returns>
  public static bool NotTrailer(byte b) => (b >> 6) != 0b10;
  /// <summary>
  /// The Unicode replacement character � (U+FFFD).
  /// </summary>
  /// <remarks>This character stands in for characters that can't be rendered.</remarks>
  /// <value>A rune whose value is 0xFFFD.</value>
  public static Rune ReplacementCharacter
  {
      get { return new Rune(0xFFFD); }
  }
  /// <summary>
  /// The null rune value.
  /// </summary>
  /// <remarks>In this package, the "bad rune" is used as a null character. Its value is 0.</remarks>
  /// <value>A rune whose value is 0.</value>
  public static Rune BadRune
  {
      get { return new Rune(0); }
  }
  /// <summary>
  /// Decodes one UTF-8 encoded character from a byte buffer.
  /// </summary>
  /// <param name="rune">Receives the decoded character. On any failure it is set to <see cref="ReplacementCharacter"/>.</param>
  /// <param name="buffer">The bytes to decode from.</param>
  /// <param name="index">Byte offset of the character's first byte. On success it advances by the number of bytes consumed.
  /// If no byte is available at this offset (index + 1 exceeds capacity) it is left unchanged; on every other failure
  /// it advances by exactly 1.</param>
  /// <param name="capacity">The size in bytes of the buffer. Used to check that the read is in bounds.</param>
  /// <returns><see cref="ConversionError.None"/> on success. <see cref="ConversionError.Overflow"/> if no byte is available
  /// at index or the sequence announced by the lead byte would extend past capacity. <see cref="ConversionError.Encoding"/> if
  /// the lead byte is not a valid lead byte, a following byte is not a continuation byte, the sequence is an overlong
  /// encoding, or the decoded value is rejected by <see cref="IsValidCodePoint"/>.</returns>
  public static ConversionError Utf8ToUcs(out Rune rune, byte* buffer, ref int index, int capacity)
  {
      rune = ReplacementCharacter;

      if (index + 1 > capacity)
      {
          return ConversionError.Overflow;
      }

      byte lead = buffer[index];

      // 0xxxxxxx: a single-byte character, nothing further to validate.
      if ((lead & 0b10000000) == 0b00000000)
      {
          rune.value = lead;
          index += 1;
          return ConversionError.None;
      }

      int length;   // total bytes in the sequence announced by the lead byte
      int code;     // accumulates the payload bits, starting with those in the lead byte
      int minimum;  // smallest value that needs this many bytes; anything lower is overlong

      if ((lead & 0b11100000) == 0b11000000)      // 110xxxxx
      {
          length = 2;
          code = lead & 0b00011111;
          minimum = 1 << 7;
      }
      else if ((lead & 0b11110000) == 0b11100000) // 1110xxxx
      {
          length = 3;
          code = lead & 0b00001111;
          minimum = 1 << 11;
      }
      else if ((lead & 0b11111000) == 0b11110000) // 11110xxx
      {
          length = 4;
          code = lead & 0b00000111;
          minimum = 1 << 16;
      }
      else
      {
          // A continuation byte in lead position, or a lead pattern longer than 4 bytes.
          index += 1;
          return ConversionError.Encoding;
      }

      if (index + length > capacity)
      {
          index += 1;
          return ConversionError.Overflow;
      }

      bool allTrailers = true;
      for (int i = 1; i < length; ++i)
      {
          byte next = buffer[index + i];
          allTrailers &= !NotTrailer(next);
          code = (code << 6) | (next & 0b00111111);
      }

      if (!allTrailers || code < minimum || !IsValidCodePoint(code))
      {
          // Skip only the lead byte so the caller can resynchronize on the next one.
          index += 1;
          return ConversionError.Encoding;
      }

      rune.value = code;
      index += length;
      return ConversionError.None;
  }
  /// <summary>
  /// Tests whether a char is a UTF-16 leading (high) surrogate.
  /// </summary>
  /// <param name="c">The char to test.</param>
  /// <returns>True if the char lies in the range 0xD800 through 0xDBFF.</returns>
  static bool IsLeadingSurrogate(char c)
  {
      // Every value in 0xD800..0xDBFF, and no other, has the top six bits 110110.
      return (c & 0xFC00) == 0xD800;
  }
  /// <summary>
  /// Tests whether a char is a UTF-16 trailing (low) surrogate.
  /// </summary>
  /// <param name="c">The char to test.</param>
  /// <returns>True if the char lies in the range 0xDC00 through 0xDFFF.</returns>
  static bool IsTrailingSurrogate(char c)
  {
      // Every value in 0xDC00..0xDFFF, and no other, has the top six bits 110111.
      return (c & 0xFC00) == 0xDC00;
  }
  /// <summary>
  /// Decodes one UTF-16 encoded character from a char buffer.
  /// </summary>
  /// <remarks>A leading surrogate that is not followed by a trailing surrogate (or that is the last char within
  /// capacity) is returned by itself as a rune holding the surrogate's value, as is a lone trailing surrogate.</remarks>
  /// <param name="rune">Receives the decoded character. If the read fails, it is set to <see cref="ReplacementCharacter"/>.</param>
  /// <param name="buffer">The chars to decode from.</param>
  /// <param name="index">Char offset of the character's first char. On success it advances by the number of chars
  /// consumed (1 or 2). If the read fails, it is left unchanged.</param>
  /// <param name="capacity">The size in chars of the buffer. Used to check that the read is in bounds.</param>
  /// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns <see cref="ConversionError.Overflow"/>
  /// (no char is available at index).</returns>
  public static ConversionError Utf16ToUcs(out Rune rune, char* buffer, ref int index, int capacity)
  {
      rune = ReplacementCharacter;

      if (index + 1 > capacity)
          return ConversionError.Overflow;

      char first = buffer[index];

      // A surrogate pair is only attempted when a second char is available within capacity.
      if (IsLeadingSurrogate(first) && index + 2 <= capacity)
      {
          char second = buffer[index + 1];
          if (IsTrailingSurrogate(second))
          {
              // Each surrogate carries 10 payload bits; the pair encodes the code point minus 0x10000.
              int pairBits = ((first & 0x03FF) << 10) | (second & 0x03FF);
              rune.value = pairBits + 0x10000;
              index += 2;
              return ConversionError.None;
          }
      }

      // Ordinary BMP char, or an unpaired surrogate passed through unchanged.
      rune.value = first;
      index += 1;
      return ConversionError.None;
  }
  /// <summary>
  /// Encodes a rune as UTF-8 and writes it to a byte buffer.
  /// </summary>
  /// <param name="buffer">The buffer to write to.</param>
  /// <param name="index">Byte offset at which to write. On success it advances by the number of bytes written
  /// (1 to 4). On failure it is left unchanged and nothing is written.</param>
  /// <param name="capacity">The size in bytes of the buffer. Used to check that the write is in bounds.</param>
  /// <param name="rune">The rune to encode.</param>
  /// <returns><see cref="ConversionError.None"/> if the write succeeds. <see cref="ConversionError.CodePoint"/> if the rune's value
  /// is rejected by <see cref="IsValidCodePoint"/>. <see cref="ConversionError.Overflow"/> if the encoded bytes do not fit before
  /// capacity. <see cref="ConversionError.Encoding"/> is a fallback for values above 0x1FFFFF, which the code point check
  /// already rejects.</returns>
  public static ConversionError UcsToUtf8(byte* buffer, ref int index, int capacity, Rune rune)
  {
      int v = rune.value;

      if (!IsValidCodePoint(v))
      {
          return ConversionError.CodePoint;
      }

      if (index + 1 > capacity)
      {
          return ConversionError.Overflow;
      }

      // Work out how many bytes the encoding needs before touching the buffer.
      int byteCount;
      if (v <= 0x7F)
          byteCount = 1;
      else if (v <= 0x7FF)
          byteCount = 2;
      else if (v <= 0xFFFF)
          byteCount = 3;
      else if (v <= 0x1FFFFF)
          byteCount = 4;
      else
          return ConversionError.Encoding;

      if (index + byteCount > capacity)
      {
          return ConversionError.Overflow;
      }

      byte* dst = buffer + index;
      switch (byteCount)
      {
          case 1:
              dst[0] = (byte)v;
              break;
          case 2:
              dst[0] = (byte)(0xC0 | (v >> 6));
              dst[1] = (byte)(0x80 | (v & 0x3F));
              break;
          case 3:
              dst[0] = (byte)(0xE0 | (v >> 12));
              dst[1] = (byte)(0x80 | ((v >> 6) & 0x3F));
              dst[2] = (byte)(0x80 | (v & 0x3F));
              break;
          default:
              dst[0] = (byte)(0xF0 | (v >> 18));
              dst[1] = (byte)(0x80 | ((v >> 12) & 0x3F));
              dst[2] = (byte)(0x80 | ((v >> 6) & 0x3F));
              dst[3] = (byte)(0x80 | (v & 0x3F));
              break;
      }

      index += byteCount;
      return ConversionError.None;
  }
  /// <summary>
  /// Encodes a rune as UTF-16 and writes it to a char buffer.
  /// </summary>
  /// <param name="buffer">The buffer of chars to write to.</param>
  /// <param name="index">Char offset at which to write. On success it advances by the number of chars written
  /// (1, or 2 for a surrogate pair). On failure it is left unchanged and nothing is written.</param>
  /// <param name="capacity">The size in chars of the buffer. Used to check that the write is in bounds.</param>
  /// <param name="rune">The rune to encode.</param>
  /// <returns><see cref="ConversionError.None"/> if the write succeeds. <see cref="ConversionError.CodePoint"/> if the rune's value
  /// is rejected by <see cref="IsValidCodePoint"/>. <see cref="ConversionError.Overflow"/> if the encoded chars do not fit before
  /// capacity. <see cref="ConversionError.Encoding"/> is a fallback for values too large for a surrogate pair, which the
  /// code point check already rejects.</returns>
  public static ConversionError UcsToUtf16(char* buffer, ref int index, int capacity, Rune rune)
  {
      int v = rune.value;

      if (!IsValidCodePoint(v))
      {
          return ConversionError.CodePoint;
      }

      if (index + 1 > capacity)
      {
          return ConversionError.Overflow;
      }

      // Values below 0x10000 fit in a single char.
      if (v < 0x10000)
      {
          buffer[index] = (char)v;
          index += 1;
          return ConversionError.None;
      }

      if (index + 2 > capacity)
      {
          return ConversionError.Overflow;
      }

      // A surrogate pair carries 20 bits: the code point minus 0x10000.
      int pairBits = v - 0x10000;
      if (pairBits >= (1 << 20))
      {
          return ConversionError.Encoding;
      }

      buffer[index] = (char)(0xD800 | (pairBits >> 10));
      buffer[index + 1] = (char)(0xDC00 | (pairBits & 0x3FF));
      index += 2;
      return ConversionError.None;
  }
  /// <summary>
  /// Converts UTF-16 text in one buffer to UTF-8 text in another buffer.
  /// </summary>
  /// <remarks>Assumes the source data is valid UTF-16. Characters are converted one at a time; conversion stops
  /// at the first character whose UTF-8 encoding no longer fits in the destination.</remarks>
  /// <param name="utf16Buffer">The source buffer.</param>
  /// <param name="utf16Length">The number of chars to read from the source.</param>
  /// <param name="utf8Buffer">The destination buffer.</param>
  /// <param name="utf8Length">Outputs the number of bytes written to the destination.</param>
  /// <param name="utf8Capacity">The size in bytes of the destination buffer.</param>
  /// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf16ToUtf8(char* utf16Buffer, int utf16Length, byte* utf8Buffer, out int utf8Length, int utf8Capacity)
  {
      utf8Length = 0;

      int readPos = 0;
      while (readPos < utf16Length)
      {
          // The decoder advances readPos itself; its result is not inspected here.
          Utf16ToUcs(out Rune decoded, utf16Buffer, ref readPos, utf16Length);

          ConversionError written = UcsToUtf8(utf8Buffer, ref utf8Length, utf8Capacity, decoded);
          if (written == ConversionError.Overflow)
              return ConversionError.Overflow;
      }

      return ConversionError.None;
  }
  /// <summary>
  /// Copies UTF-8 text from one buffer to another.
  /// </summary>
  /// <remarks>Assumes the source data is valid UTF-8. When the destination can hold the whole source, the bytes are
  /// copied verbatim. Otherwise the source is decoded and re-encoded one character at a time, stopping at the first
  /// character that no longer fits, so the destination never ends with a partial character.</remarks>
  /// <param name="srcBuffer">The source buffer.</param>
  /// <param name="srcLength">The number of bytes to read from the source.</param>
  /// <param name="destBuffer">The destination buffer.</param>
  /// <param name="destLength">Outputs the number of bytes written to the destination.</param>
  /// <param name="destCapacity">The size in bytes of the destination buffer.</param>
  /// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf8ToUtf8(byte* srcBuffer, int srcLength, byte* destBuffer, out int destLength, int destCapacity)
  {
      if (destCapacity < srcLength)
      {
          // TODO even in this case, it's possible to MemCpy all but the last 3 bytes that fit, and then decide how
          // TODO many of those last 3 bytes to append by looking only at their high bits.
          destLength = 0;

          int readPos = 0;
          while (readPos < srcLength)
          {
              // The decoder advances readPos itself; its result is not inspected here.
              Utf8ToUcs(out Rune decoded, srcBuffer, ref readPos, srcLength);

              ConversionError written = UcsToUtf8(destBuffer, ref destLength, destCapacity, decoded);
              if (written == ConversionError.Overflow)
                  return ConversionError.Overflow;
          }

          return ConversionError.None;
      }

      // Everything fits: a straight byte copy needs no decoding.
      UnsafeUtility.MemCpy(destBuffer, srcBuffer, srcLength);
      destLength = srcLength;
      return ConversionError.None;
  }
  /// <summary>
  /// Converts UTF-8 text in one buffer to UTF-16 text in another buffer.
  /// </summary>
  /// <remarks>Assumes the source data is valid UTF-8. Characters are converted one at a time; conversion stops
  /// at the first character whose UTF-16 encoding no longer fits in the destination.</remarks>
  /// <param name="utf8Buffer">The source buffer.</param>
  /// <param name="utf8Length">The number of bytes to read from the source.</param>
  /// <param name="utf16Buffer">The destination buffer.</param>
  /// <param name="utf16Length">Outputs the number of chars written to the destination.</param>
  /// <param name="utf16Capacity">The size in chars of the destination buffer.</param>
  /// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, <see cref="ConversionError.Overflow"/>.</returns>
  public static ConversionError Utf8ToUtf16(byte* utf8Buffer, int utf8Length, char* utf16Buffer, out int utf16Length, int utf16Capacity)
  {
      utf16Length = 0;

      int readPos = 0;
      while (readPos < utf8Length)
      {
          // The decoder advances readPos itself; its result is not inspected here.
          Utf8ToUcs(out Rune decoded, utf8Buffer, ref readPos, utf8Length);

          ConversionError written = UcsToUtf16(utf16Buffer, ref utf16Length, utf16Capacity, decoded);
          if (written == ConversionError.Overflow)
              return ConversionError.Overflow;
      }

      return ConversionError.None;
  }
  485. }
  486. }