/*
 * UTF-8 encoder/decoder for liba (lib/a/UTF8.c), together with the matching
 * declarations that the same change added to inc/liba.h.
 *
 * Origin: reconstructed from a whitespace-collapsed git patch.
 *
 *   From:      Michael D. Lowis
 *   Date:      Tue, 4 Jan 2022 21:31:14 +0000 (-0500)
 *   Subject:   pulled in UTF8 encoder/decoder and updated TODO list
 *   X-Git-Url: https://git.mdlowis.com/?a=commitdiff_plain;h=b4c8dde0fd929c86782b6862e522455b6bf7f711;p=proto%2Faos.git
 *
 * Files touched by that patch:
 *
 *   TODO.md       (index e9340da..5be09ac 100644, hunk @@ -1,20 +1,17 @@)
 *      # Doing
 *     -* Finish option parsing logic (remove mallocs, handle positional args)
 *     +* Add JSON parser/generator (binary and text)
 *     +    * [UBJSON](https://en.wikipedia.org/wiki/UBJSON)
 *     +    * [JSMN](https://github.com/zserge/jsmn)
 *      # On Deck
 *     -* Add support for complex binary targets
 *     +* Add ubus-like IPC daemon
 *      * Add tide-like text editor
 *      # Backlog
 *     -* Add JSON parser/generator (binary and text)
 *     -    * [UBJSON](https://en.wikipedia.org/wiki/UBJSON)
 *     -    * [JSMN](https://github.com/zserge/jsmn)
 *     -* Add ubus-like IPC daemon
 *     +* Add INI (or TOML) parser to liba
 *      * Add Icon-like language interpreter
 *     -* Combine libnet and liba
 *      * Add MDL markup language processor
 *     -* Add INI (or TOML) parser to liba
 *      * Add ANSI terminal support to liba
 *
 *   inc/liba.h    (index b8e4e73..6e5a038 100644, hunk @@ -76,6 +76,20 @@)
 *     The "UTF8 Encoding and Decoding" section below was inserted after the
 *     Net_Accept / Net_Dial / Net_Serve prototypes and before the
 *     "Basic Runtime Facilities" section.
 *
 *   lib/a/UTF8.c  (new file mode 100644, index 0000000..5a39bb8,
 *                  hunk @@ -0,0 +1,84 @@)
 *     The implementation below. The patch's own #include line lost its
 *     header name during extraction (presumably <liba.h> -- TODO confirm), so
 *     the required declarations are reproduced inline instead.
 *
 * Corrections made relative to the code in the patch:
 *   - UTF8_Encode no longer evaluates a shift by 6 * (0 - 1) when handed an
 *     invalid rune; it returns 0 without touching the buffer.
 *   - UTF8_Decode no longer wraps *length around to SIZE_MAX when it sees an
 *     invalid lead byte, and it resets *length to 0 whenever it reports
 *     RUNE_ERR so the next byte starts a fresh rune.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>  /* EOF */

/*
    UTF8 Encoding and Decoding
*/
#define UTF_MAX   6u                /* size of the buffer passed to UTF8_Encode */
#define RUNE_SELF ((Rune)0x80)
#define RUNE_ERR  ((Rune)0xFFFD)    /* value reported for undecodable input */
#define RUNE_MAX  ((Rune)0x10FFFF)  /* largest rune accepted by runevalid() */
#define RUNE_EOF  ((Rune)EOF)       /* value reported when EOF starts a rune */

typedef uint32_t Rune;

size_t UTF8_Encode(char str[UTF_MAX], Rune rune);
bool UTF8_Decode(Rune* rune, size_t* length, int byte);

/*
 * Lookup tables. UTF8_SeqBits and UTF8_SeqMask are indexed by sequence
 * length when encoding and decoding; utfseq() walks UTF8_SeqBits pairwise
 * and indexes UTF8_SeqLens by the matching position.
 *   UTF8_SeqBits[n] - marker bits of the lead byte of an n-byte sequence
 *   UTF8_SeqMask[n] - payload bits carried by that lead byte
 *   UTF8_SeqLens[i] - length reported when a byte matches UTF8_SeqBits[i]
 *                     under the mask UTF8_SeqBits[i+1] (0 = not a lead byte)
 */
static const uint8_t UTF8_SeqBits[] = { 0x00u, 0x80u, 0xC0u, 0xE0u, 0xF0u, 0xF8u, 0xFCu, 0xFEu };
static const uint8_t UTF8_SeqMask[] = { 0x00u, 0xFFu, 0x1Fu, 0x0Fu, 0x07u, 0x03u, 0x01u, 0x00u };
static const uint8_t UTF8_SeqLens[] = { 0x01u, 0x00u, 0x02u, 0x03u, 0x04u, 0x05u, 0x06u, 0x00u };

/*
 * Returns true when val may be encoded/decoded: it must not exceed RUNE_MAX,
 * must not have 0xFFFE or 0xFFFF in its low 16 bits, and must lie outside
 * the ranges 0xD800-0xDFFF and 0xFDD0-0xFDEF.
 */
static bool runevalid(Rune val) {
    return (val <= RUNE_MAX)
        && ((val & 0xFFFEu) != 0xFFFEu)
        && ((val < 0xD800u) || (val > 0xDFFFu))
        && ((val < 0xFDD0u) || (val > 0xFDEFu));
}

/*
 * Number of bytes needed to encode rune (1-4), or 0 if the rune is rejected
 * by runevalid().
 */
static size_t runelen(Rune rune) {
    if (!runevalid(rune))
        return 0;
    else if (rune <= 0x7F)
        return 1;
    else if (rune <= 0x07FF)
        return 2;
    else if (rune <= 0xFFFF)
        return 3;
    else
        return 4;
}

/*
 * Classifies a lead byte. Returns the total length (1-6) of the sequence it
 * introduces, or 0 when the byte cannot start a sequence: continuation bytes
 * (10xxxxxx) and the bytes 0xFE / 0xFF.
 */
static uint8_t utfseq(uint8_t byte) {
    for (int i = 1; i < 8; i++) {
        if ((byte & UTF8_SeqBits[i]) == UTF8_SeqBits[i-1])
            return UTF8_SeqLens[i-1];
    }
    return 0;
}

/*
 * Encodes rune into str as UTF-8.
 *
 * str  - output buffer of at least UTF_MAX bytes; no terminator is written.
 *        At most 4 bytes are ever stored, since runelen() never exceeds 4.
 * rune - value to encode.
 *
 * Returns the number of bytes written (1-4). Returns 0 and leaves str
 * untouched when the rune is rejected by runevalid().
 */
size_t UTF8_Encode(char str[UTF_MAX], Rune rune)
{
    size_t len = runelen(rune);
    /* Bail out before computing 6 * (len-1): with len == 0 that wraps around
     * and would make the shift below undefined behaviour. */
    if (len == 0)
        return 0;

    /* Single byte sequences carry no marker bits in the lead byte */
    uint8_t lead = (len == 1) ? 0x00u : UTF8_SeqBits[len];
    str[0] = (char)(lead | (UTF8_SeqMask[len] & (rune >> (6 * (len-1)))));
    for (size_t i = 1; i < len; i++)
    {
        /* 10xxxxxx: next six payload bits, most significant group first */
        str[i] = (char)(0x80u | (0x3Fu & (rune >> (6 * (len-i-1)))));
    }
    return len;
}

/*
 * Incremental UTF-8 decoder: feed it one byte per call.
 *
 * rune   - in/out accumulator; holds the decoded value once true is returned.
 * length - in/out count of continuation bytes still expected. Must be 0
 *          before the first byte of a stream; it is 0 again whenever this
 *          function returns true.
 * byte   - next input byte, or EOF.
 *
 * Returns true when *rune holds a result: a decoded rune, RUNE_EOF (EOF was
 * supplied where a new rune would start), or RUNE_ERR (invalid lead byte,
 * missing continuation byte, or a value rejected by runevalid()). Returns
 * false when more bytes are needed. The byte that triggers RUNE_ERR is
 * consumed; it is not re-examined as a possible lead byte.
 *
 * NOTE(review): overlong encodings are not rejected (for example the bytes
 * 0xC0 0x80 decode to rune 0). Detecting them needs the original sequence
 * length, which the (rune, length) state does not retain.
 */
bool UTF8_Decode(Rune* rune, size_t* length, int byte)
{
    /* Handle the start of a new rune */
    if (*length == 0)
    {
        /* If we were fed in an EOF as a start byte, handle it here */
        if (byte == EOF)
        {
            *rune = RUNE_EOF;
        }
        else
        {
            /* Otherwise, decode the first byte of the rune */
            size_t seqlen = utfseq((uint8_t)byte);
            if (seqlen == 0)
            {
                /* Not a lead byte. Leave *length at 0 rather than
                 * decrementing it, which would wrap to SIZE_MAX. */
                *rune = RUNE_ERR;
            }
            else
            {
                *rune   = (Rune)byte & UTF8_SeqMask[seqlen];
                *length = seqlen - 1;
            }
        }
    }
    /* Handle continuation bytes */
    else if ((byte & 0xC0) == 0x80)
    {
        /* add bits from continuation byte to rune value
         * cannot overflow: 6 byte sequences contain 31 bits */
        *rune = (*rune << 6) | ((Rune)byte & 0x3Fu); /* 10xxxxxx */
        (*length)--;
        /* Sanity check the final rune value before finishing */
        if ((*length == 0) && !runevalid(*rune))
        {
            *rune = RUNE_ERR;
        }
    }
    /* Didn't get the continuation byte we expected (this includes EOF) */
    else
    {
        *rune = RUNE_ERR;
    }

    /* Any error abandons the sequence in progress so that the next byte is
     * treated as the start of a new rune. */
    if (*rune == RUNE_ERR)
    {
        *length = 0;
    }
    /* Tell the caller whether we finished or not */
    return (*length == 0);
}