|
|
// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
#include "utf.h"
// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so we can license it under looser terms than the Go source
#define badrune 0xFFFD
// encoded must be at most 4 bytes
// TODO clean this code up somehow
size_t utf8EncodeRune(uint32_t rune, char *encoded) { uint8_t b; uint8_t c = 0; uint8_t d = 0; uint8_t e = 0; size_t n;
// not in the valid range for Unicode
if (rune > 0x10FFFF) rune = badrune; // surrogate runes cannot be encoded
if (rune >= 0xD800 && rune < 0xE000) rune = badrune;
if (rune < 0x80) { // ASCII bytes represent themselves
b = (uint8_t) (rune & 0xFF); n = 1; goto done; } if (rune < 0x800) { // two-byte encoding
c = (uint8_t) (rune & 0x3F); c |= 0x80; rune >>= 6; b = (uint8_t) (rune & 0x1F); b |= 0xC0; n = 2; goto done; } if (rune < 0x10000) { // three-byte encoding
d = (uint8_t) (rune & 0x3F); d |= 0x80; rune >>= 6; c = (uint8_t) (rune & 0x3F); c |= 0x80; rune >>= 6; b = (uint8_t) (rune & 0x0F); b |= 0xE0; n = 3; goto done; } // otherwise use a four-byte encoding
e = (uint8_t) (rune & 0x3F); e |= 0x80; rune >>= 6; d = (uint8_t) (rune & 0x3F); d |= 0x80; rune >>= 6; c = (uint8_t) (rune & 0x3F); c |= 0x80; rune >>= 6; b = (uint8_t) (rune & 0x07); b |= 0xF0; n = 4;
done: encoded[0] = b; if (n > 1) encoded[1] = c; if (n > 2) encoded[2] = d; if (n > 3) encoded[3] = e; return n; }
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune) { uint8_t b, c; uint8_t lowestAllowed, highestAllowed; size_t i, expected; int bad;
b = (uint8_t) (*s); if (b < 0x80) { // ASCII bytes represent themselves
*rune = b; s++; return s; } // 0xC0 and 0xC1 cover 2-byte overlong equivalents
// 0xF5 to 0xFD cover values > 0x10FFFF
// 0xFE and 0xFF were never defined (always illegal)
if (b < 0xC2 || b > 0xF4) { // invalid
*rune = badrune; s++; return s; }
// this determines the range of allowed first continuation bytes
lowestAllowed = 0x80; highestAllowed = 0xBF; switch (b) { case 0xE0: // disallow 3-byte overlong equivalents
lowestAllowed = 0xA0; break; case 0xED: // disallow surrogate characters
highestAllowed = 0x9F; break; case 0xF0: // disallow 4-byte overlong equivalents
lowestAllowed = 0x90; break; case 0xF4: // disallow values > 0x10FFFF
highestAllowed = 0x8F; break; }
// and this determines how many continuation bytes are expected
expected = 1; if (b >= 0xE0) expected++; if (b >= 0xF0) expected++; if (nElem != 0) { // are there enough bytes?
nElem--; if (nElem < expected) { // nope
*rune = badrune; s++; return s; } }
// ensure that everything is correct
// if not, **only** consume the initial byte
bad = 0; for (i = 0; i < expected; i++) { c = (uint8_t) (s[1 + i]); if (c < lowestAllowed || c > highestAllowed) { bad = 1; break; } // the old lowestAllowed and highestAllowed is only for the first continuation byte
lowestAllowed = 0x80; highestAllowed = 0xBF; } if (bad) { *rune = badrune; s++; return s; }
// now do the topmost bits
if (b < 0xE0) *rune = b & 0x1F; else if (b < 0xF0) *rune = b & 0x0F; else *rune = b & 0x07; s++; // we can finally move on
// now do the continuation bytes
for (; expected; expected--) { c = (uint8_t) (*s); s++; c &= 0x3F; // strip continuation bits
*rune <<= 6; *rune |= c; }
return s; }
// encoded must have at most 2 elements
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded) { uint16_t low, high;
// not in the valid range for Unicode
if (rune > 0x10FFFF) rune = badrune; // surrogate runes cannot be encoded
if (rune >= 0xD800 && rune < 0xE000) rune = badrune;
if (rune < 0x10000) { encoded[0] = (uint16_t) rune; return 1; }
rune -= 0x10000; low = (uint16_t) (rune & 0x3FF); rune >>= 10; high = (uint16_t) (rune & 0x3FF); encoded[0] = high | 0xD800; encoded[1] = low | 0xDC00; return 2; }
// TODO see if this can be cleaned up somehow
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune) { uint16_t high, low;
if (*s < 0xD800 || *s >= 0xE000) { // self-representing character
*rune = *s; s++; return s; } if (*s >= 0xDC00) { // out-of-order surrogates
*rune = badrune; s++; return s; } if (nElem == 1) { // not enough elements
*rune = badrune; s++; return s; } high = *s; high &= 0x3FF; if (s[1] < 0xDC00 || s[1] >= 0xE000) { // bad surrogate pair
*rune = badrune; s++; return s; } s++; low = *s; s++; low &= 0x3FF; *rune = high; *rune <<= 10; *rune |= low; *rune += 0x10000; return s; }
// TODO find a way to reduce the code in all of these somehow
// TODO find a way to remove u as well
size_t utf8RuneCount(const char *s, size_t nElem) { size_t len; uint32_t rune;
if (nElem != 0) { const char *t, *u;
len = 0; t = s; while (nElem != 0) { u = utf8DecodeRune(t, nElem, &rune); len++; nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf8DecodeRune(s, nElem, &rune); len++; } return len; }
size_t utf8UTF16Count(const char *s, size_t nElem) { size_t len; uint32_t rune; uint16_t encoded[2];
if (nElem != 0) { const char *t, *u;
len = 0; t = s; while (nElem != 0) { u = utf8DecodeRune(t, nElem, &rune); len += utf16EncodeRune(rune, encoded); nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf8DecodeRune(s, nElem, &rune); len += utf16EncodeRune(rune, encoded); } return len; }
size_t utf16RuneCount(const uint16_t *s, size_t nElem) { size_t len; uint32_t rune;
if (nElem != 0) { const uint16_t *t, *u;
len = 0; t = s; while (nElem != 0) { u = utf16DecodeRune(t, nElem, &rune); len++; nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf16DecodeRune(s, nElem, &rune); len++; } return len; }
size_t utf16UTF8Count(const uint16_t *s, size_t nElem) { size_t len; uint32_t rune; char encoded[4];
if (nElem != 0) { const uint16_t *t, *u;
len = 0; t = s; while (nElem != 0) { u = utf16DecodeRune(t, nElem, &rune); len += utf8EncodeRune(rune, encoded); nElem -= u - t; t = u; } return len; } len = 0; while (*s) { s = utf16DecodeRune(s, nElem, &rune); len += utf8EncodeRune(rune, encoded); } return len; }
|