From ecee2adb64f270a3255988b3c4aca31de0552c4b Mon Sep 17 00:00:00 2001 From: "Michael D. Lowis" Date: Sat, 8 Oct 2016 00:04:17 -0400 Subject: [PATCH] Added *extremely* basic character set detection logic. Only binary and UTF-8 are supported at the moment --- Makefile | 2 +- buf.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------- charset.c | 36 +++++++++++++++++++++++++++++++++ edit.h | 15 ++++++++++++-- foo | 1 - keyboard.c | 23 +++++++++------------ tests/tests.c | 5 +++++ 7 files changed, 112 insertions(+), 26 deletions(-) create mode 100644 charset.c delete mode 100644 foo diff --git a/Makefile b/Makefile index 396b07b..14e70f1 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ LDFLAGS = -L/opt/X11/lib -lX11 -lXft CFLAGS = --std=c99 -Wall -Wextra -I. -I/opt/X11/include -I/opt/local/include/freetype2 -I/usr/include/freetype2 -OBJS = buf.o screen.o utf8.o keyboard.o mouse.o +OBJS = buf.o screen.o utf8.o keyboard.o mouse.o charset.o TESTOBJS = tests/tests.o tests/buf.o tests/utf8.o all: edit test diff --git a/buf.c b/buf.c index dbfa318..2b6484b 100644 --- a/buf.c +++ b/buf.c @@ -1,20 +1,58 @@ #define _GNU_SOURCE #include #include +#include +#include +#include +#include + #include "edit.h" +typedef struct { + char* buf; + size_t len; +} FMap; + +FMap fmap(char* path) { + int fd; + FMap file; + struct stat sb; + if ((fd = open(path, O_RDONLY, 0)) < 0) + die("could not open file"); + if (fstat(fd, &sb) < 0) + die("file size could not be determined"); + file.buf = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + file.len = sb.st_size; + if (file.buf == MAP_FAILED) + die("memory mapping of file failed"); + return file; +} + +void funmap(FMap file) { + munmap(file.buf, file.len); +} + void buf_load(Buf* buf, char* path) { buf->insert_mode = true; - unsigned i = 0; - Rune r; - FILE* in = (!strcmp(path,"-") ? stdin : fopen(path, "rb")); - buf->path = (in == stdin ? NULL : strdup(path)); - if (in != NULL) { - while (RUNE_EOF != (r = fgetrune(in))) - buf_ins(buf, i++, r); - fclose(in); + if (!strcmp(path,"-")) { + buf_ins(buf, 0, (Rune)'\n'); } else { - buf_ins(buf, i, (Rune)'\n'); + FMap file = fmap(path); + int chset = charset(file.buf, file.len); + if (chset > UTF_8) { + die("Unsupported character set"); + } else if (chset == BINARY) { + for (size_t i = 0; i < file.len; i++) + buf_ins(buf, buf_end(buf), file.buf[i]); + } else { // UTF-8 + for (size_t i = 0; i < file.len;) { + Rune r = 0; + size_t len = 0; + while (!utf8decode(&r, &len, file.buf[i++])); + buf_ins(buf, buf_end(buf), r); + } + } + funmap(file); } buf->insert_mode = false; } diff --git a/charset.c b/charset.c new file mode 100644 index 0000000..0aaa329 --- /dev/null +++ b/charset.c @@ -0,0 +1,36 @@ +#include "edit.h" + +static const struct { + int type; + int len; + char* seq; +} BOMS[] = { + { .type = UTF_8, .len = 3, .seq = (char[]){ 0xEF, 0xBB, 0xBF }}, + { .type = UTF_16BE, .len = 2, .seq = (char[]){ 0xFE, 0xFF }}, + { .type = UTF_16LE, .len = 2, .seq = (char[]){ 0xFF, 0xFE }}, + { .type = UTF_32BE, .len = 4, .seq = (char[]){ 0x00, 0x00, 0xFE, 0xFF }}, + { .type = UTF_32LE, .len = 4, .seq = (char[]){ 0xFF, 0xFE, 0x00, 0x00 }}, +}; + +static const char Utf8Valid[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, +}; + +int charset(const char* buf, size_t len) { + /* look for a BOM and parse it */ + for (size_t i = 0; i < (sizeof(BOMS)/sizeof(BOMS[0])); i++) + if (!strncmp(buf, BOMS[i].seq, BOMS[i].len)) + return BOMS[i].type; + /* look for bytes that are invalid in utf-8 */ + int type = UTF_8; + for (size_t i = 0; type && (i < len); i++) + type = Utf8Valid[(int)buf[i]]; + return type; +} diff --git a/edit.h b/edit.h index a2c8b75..6a72d50 100644 --- a/edit.h +++ b/edit.h @@ -4,24 +4,35 @@ #include #include -/* UTF-8 Handling +/* Charset Handling *****************************************************************************/ enum { UTF_MAX = 6u, /* maximum number of bytes that make up a rune */ RUNE_SELF = 0x80, /* byte values larger than this are *not* ascii */ RUNE_ERR = 0xFFFD, /* rune value representing an error */ RUNE_MAX = 0x10FFFF, /* Maximum decodable rune value */ - RUNE_EOF = UINT32_MAX /* ruen value representing end of file */ + RUNE_EOF = UINT32_MAX /* rune value representing end of file */ }; /* Represents a unicode code point */ typedef uint32_t Rune; +enum { + BINARY = 0, + UTF_8, + UTF_16BE, + UTF_16LE, + UTF_32BE, + UTF_32LE, +}; + +int charset(const char* buf, size_t len); size_t utf8encode(char str[UTF_MAX], Rune rune); bool utf8decode(Rune* rune, size_t* length, int byte); Rune fgetrune(FILE* f); void fputrune(Rune rune, FILE* f); + /* Input Handling *****************************************************************************/ /* key definitions */ diff --git a/foo b/foo deleted file mode 100644 index 1910281..0000000 --- a/foo +++ /dev/null @@ -1 +0,0 @@ -foo \ No newline at end of file diff --git a/keyboard.c b/keyboard.c index 41cf646..6895d39 100644 --- a/keyboard.c +++ b/keyboard.c @@ -24,19 +24,16 @@ void handle_key(Rune key) { static void special_keys(Rune key) { switch (key) { - case KEY_F1: Buffer.insert_mode = !Buffer.insert_mode; break; - case KEY_F6: ColorBase = !ColorBase; break; - case KEY_LEFT: CursorPos = buf_byrune(&Buffer, CursorPos, -1); break; - case KEY_RIGHT: CursorPos = buf_byrune(&Buffer, CursorPos, 1); break; - case KEY_DOWN: CursorPos = buf_byline(&Buffer, CursorPos, 1); break; - case KEY_UP: CursorPos = buf_byline(&Buffer, CursorPos, -1); break; - case KEY_HOME: CursorPos = buf_bol(&Buffer, CursorPos); break; - case KEY_END: CursorPos = buf_eol(&Buffer, CursorPos); break; - case KEY_DELETE: - if (Buffer.insert_mode) - buf_del(&Buffer, CursorPos); - break; - + case KEY_F6: ColorBase = !ColorBase; break; + case KEY_UP: CursorPos = buf_byline(&Buffer, CursorPos, -1); break; + case KEY_DOWN: CursorPos = buf_byline(&Buffer, CursorPos, 1); break; + case KEY_LEFT: CursorPos = buf_byrune(&Buffer, CursorPos, -1); break; + case KEY_RIGHT: CursorPos = buf_byrune(&Buffer, CursorPos, 1); break; + case KEY_INSERT: Buffer.insert_mode = !Buffer.insert_mode; break; + case KEY_F1: Buffer.insert_mode = !Buffer.insert_mode; break; + case KEY_DELETE: buf_del(&Buffer, CursorPos); break; + case KEY_HOME: CursorPos = buf_bol(&Buffer, CursorPos); break; + case KEY_END: CursorPos = buf_eol(&Buffer, CursorPos); break; } } diff --git a/tests/tests.c b/tests/tests.c index 0f066e9..228c730 100644 --- a/tests/tests.c +++ b/tests/tests.c @@ -5,6 +5,11 @@ Buf Buffer; unsigned CursorPos; +void die(char* m) { + (void)m; +} + + int main(int argc, char** argv) { atf_init(argc,argv); RUN_EXTERN_TEST_SUITE(BufferTests); -- 2.49.0