Added *extremely* basic character set detection logic. Only binary and UTF-8 are...

author Michael D. Lowis <mike@mdlowis.com>

Sat, 8 Oct 2016 04:04:17 +0000 (00:04 -0400)

committer Michael D. Lowis <mike@mdlowis.com>

Sat, 8 Oct 2016 04:04:17 +0000 (00:04 -0400)
author Michael D. Lowis <mike@mdlowis.com>
Sat, 8 Oct 2016 04:04:17 +0000 (00:04 -0400)
committer Michael D. Lowis <mike@mdlowis.com>
Sat, 8 Oct 2016 04:04:17 +0000 (00:04 -0400)
diff --git a/Makefile b/Makefile

index 396b07b45a8516b3c6335d52f7e81ba4ada320cb..14e70f10e16531c912657acdf12e3642ce936c4a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
  LDFLAGS  = -L/opt/X11/lib -lX11 -lXft
  CFLAGS   = --std=c99 -Wall -Wextra -I. -I/opt/X11/include -I/opt/local/include/freetype2 -I/usr/include/freetype2
-OBJS     = buf.o screen.o utf8.o keyboard.o mouse.o
+OBJS     = buf.o screen.o utf8.o keyboard.o mouse.o charset.o
  TESTOBJS = tests/tests.o tests/buf.o tests/utf8.o
  
  all: edit test
diff --git a/buf.c b/buf.c

index dbfa318324f0b3bba1918d0e9d82ac53e49b8d10..2b6484b07889498524f85d99c7f88a6cbffddd1c 100644 (file)
--- a/buf.c
+++ b/buf.c
@@ -1,20 +1,58 @@
  #define _GNU_SOURCE
  #include <string.h>
  #include <assert.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
  #include "edit.h"
  
+typedef struct { /* A file mapping as produced by fmap() and consumed by funmap(). */
+    char* buf; /* start of the mapped region (the mmap() result) */
+    size_t len; /* length of the region in bytes (st_size when it was mapped) */
+} FMap;
+
+/* Map the file at `path` read-only and return the mapping and its length.
+ *
+ * Any failure (open, fstat, mmap) is reported through die(). Note that an
+ * empty file takes the mmap failure path, since mmap() rejects a zero length.
+ * The caller releases the mapping with funmap(). */
+FMap fmap(char* path) {
+    int fd;
+    FMap file = { .buf = NULL, .len = 0 };
+    struct stat sb;
+    if ((fd = open(path, O_RDONLY, 0)) < 0)
+        die("could not open file");
+    if (fstat(fd, &sb) < 0)
+        die("file size could not be determined");
+    file.buf = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
+    file.len = sb.st_size;
+    if (file.buf == MAP_FAILED)
+        die("memory mapping of file failed");
+    /* the mapping stays valid after the descriptor is closed; without this
+     * every loaded file leaked one descriptor */
+    close(fd);
+    return file;
+}
+
+void funmap(FMap file) { /* Unmap the file.len bytes at file.buf; munmap()'s result is ignored. */
+    munmap(file.buf, file.len);
+}
+
  void buf_load(Buf* buf, char* path) {
      buf->insert_mode = true;
-    unsigned i = 0;
-    Rune r;
-    FILE* in = (!strcmp(path,"-") ? stdin : fopen(path, "rb"));
-    buf->path = (in == stdin ? NULL : strdup(path));
-    if (in != NULL) {
-        while (RUNE_EOF != (r = fgetrune(in)))
-            buf_ins(buf, i++, r);
-        fclose(in);
+    if (!strcmp(path,"-")) {
+        buf_ins(buf, 0, (Rune)'\n');
      } else {
-        buf_ins(buf, i, (Rune)'\n');
+        FMap file = fmap(path);
+        int chset = charset(file.buf, file.len);
+        if (chset > UTF_8) {
+            die("Unsupported character set");
+        } else if (chset == BINARY) {
+            for (size_t i = 0; i < file.len; i++)
+                buf_ins(buf, buf_end(buf), file.buf[i]);
+        } else { // UTF-8
+            for (size_t i = 0; i < file.len;) {
+                Rune r = 0;
+                size_t len = 0;
+                while (!utf8decode(&r, &len, file.buf[i++]));
+                buf_ins(buf, buf_end(buf), r);
+            }
+        }
+        funmap(file);
      }
      buf->insert_mode = false;
  }
diff --git a/charset.c b/charset.c

new file mode 100644 (file)

index 0000000..0aaa329
--- /dev/null
+++ b/charset.c
@@ -0,0 +1,36 @@
+#include "edit.h"
+
+static const struct { /* Byte-order-mark signatures and the charset constant each one maps to. */
+    int type; /* charset constant reported for this signature */
+    int len; /* number of bytes in seq */
+    char* seq; /* the signature bytes; NOT NUL-terminated, and two entries contain 0x00 bytes */
+} BOMS[] = {
+    { .type = UTF_8,    .len = 3, .seq = (char[]){ 0xEF, 0xBB, 0xBF       }},
+    { .type = UTF_16BE, .len = 2, .seq = (char[]){ 0xFE, 0xFF             }},
+    { .type = UTF_16LE, .len = 2, .seq = (char[]){ 0xFF, 0xFE             }},
+    { .type = UTF_32BE, .len = 4, .seq = (char[]){ 0x00, 0x00, 0xFE, 0xFF }},
+    { .type = UTF_32LE, .len = 4, .seq = (char[]){ 0xFF, 0xFE, 0x00, 0x00 }}, /* NOTE(review): shares its first two bytes with the UTF_16LE entry above */
+};
+
+static const char Utf8Valid[256] = { /* Indexed by byte value (0-255): 1 if the byte may occur in UTF-8, 0 if it never can. */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF: only 0xC0 and 0xC1 are rejected */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, /* 0xE0-0xFF: 0xF5 through 0xFF are rejected */
+};
+
+/* Guess the character set of the `len` bytes at `buf`.
+ *
+ * Returns the type of the longest entry in BOMS that prefixes the data, if
+ * any; otherwise UTF_8 when every byte is marked valid in Utf8Valid, and
+ * BINARY as soon as one is not. An empty buffer is reported as UTF_8. */
+int charset(const char* buf, size_t len) {
+    /* look for a BOM and parse it. memcmp rather than strncmp because the
+     * UTF-32 signatures contain NUL bytes (strncmp stops comparing there, so
+     * any data starting with 0x00 used to match UTF-32BE). The length check
+     * keeps inputs shorter than a signature from being read past their end,
+     * and the longest match wins so FF FE 00 00 is UTF-32LE, not UTF-16LE. */
+    int bomtype = BINARY;
+    size_t bomlen = 0;
+    for (size_t i = 0; i < (sizeof(BOMS)/sizeof(BOMS[0])); i++) {
+        size_t n = (size_t)BOMS[i].len;
+        if (n <= len && n > bomlen && !memcmp(buf, BOMS[i].seq, n)) {
+            bomtype = BOMS[i].type;
+            bomlen  = n;
+        }
+    }
+    if (bomlen > 0)
+        return bomtype;
+    /* look for bytes that are invalid in utf-8 */
+    int type = UTF_8;
+    for (size_t i = 0; type && (i < len); i++)
+        type = Utf8Valid[(unsigned char)buf[i]]; /* unsigned: a signed char >= 0x80 would index before the table */
+    return type;
+}
diff --git a/edit.h b/edit.h

index a2c8b7527760e5f550a03ffb0349a5bc12d92cfa..6a72d5018c50168450d32ba0d9f4e0cd580f872b 100644 (file)
--- a/edit.h
+++ b/edit.h
@@ -4,24 +4,35 @@
  #include <stdbool.h>
  #include <string.h>
  
-/* UTF-8 Handling
+/* Charset Handling
   *****************************************************************************/
  enum {
      UTF_MAX   = 6u,        /* maximum number of bytes that make up a rune */
      RUNE_SELF = 0x80,      /* byte values larger than this are *not* ascii */
      RUNE_ERR  = 0xFFFD,    /* rune value representing an error */
      RUNE_MAX  = 0x10FFFF,  /* Maximum decodable rune value */
-    RUNE_EOF  = UINT32_MAX /* ruen value representing end of file */
+    RUNE_EOF  = UINT32_MAX /* rune value representing end of file. NOTE(review): before C23 an enumeration constant must be representable as int, and UINT32_MAX is not where int is 32 bits; the Makefile builds with --std=c99, so this relies on a compiler extension -- confirm */
  };
  
  /* Represents a unicode code point */
  typedef uint32_t Rune;
  
+enum { /* Character sets reported by charset(). */
+    BINARY = 0, /* a byte that can never occur in UTF-8 was found (matches a 0 entry of Utf8Valid) */
+    UTF_8, /* UTF-8 BOM found, or no BOM and no invalid byte (matches a 1 entry of Utf8Valid) */
+    UTF_16BE, /* this and the values below are only reported when the matching BOM is present */
+    UTF_16LE,
+    UTF_32BE,
+    UTF_32LE,
+};
+
+int charset(const char* buf, size_t len);
  size_t utf8encode(char str[UTF_MAX], Rune rune);
  bool utf8decode(Rune* rune, size_t* length, int byte);
  Rune fgetrune(FILE* f);
  void fputrune(Rune rune, FILE* f);
  
+
  /* Input Handling
   *****************************************************************************/
  /* key definitions */
diff --git a/foo b/foo

deleted file mode 100644 (file)

index 1910281..0000000
--- a/foo
+++ /dev/null
@@ -1 +0,0 @@
-foo
-\ No newline at end of file
diff --git a/keyboard.c b/keyboard.c

index 41cf6464b79d7513a759257488090e198e2ede68..6895d39ca75471dc6822b554b1cad8ca52bd9d7c 100644 (file)
--- a/keyboard.c
+++ b/keyboard.c
@@ -24,19 +24,16 @@ void handle_key(Rune key) {
  
  static void special_keys(Rune key) { /* Handle non-text keys: toggle ColorBase or insert mode, move CursorPos, or delete at CursorPos. Unlisted keys are ignored. */
      switch (key) {
-        case KEY_F1:    Buffer.insert_mode = !Buffer.insert_mode;       break;
-        case KEY_F6:    ColorBase = !ColorBase;                         break;
-        case KEY_LEFT:  CursorPos = buf_byrune(&Buffer, CursorPos, -1); break;
-        case KEY_RIGHT: CursorPos = buf_byrune(&Buffer, CursorPos, 1);  break;
-        case KEY_DOWN:  CursorPos = buf_byline(&Buffer, CursorPos, 1);  break;
-        case KEY_UP:    CursorPos = buf_byline(&Buffer, CursorPos, -1); break;
-        case KEY_HOME:  CursorPos = buf_bol(&Buffer, CursorPos);        break;
-        case KEY_END:   CursorPos = buf_eol(&Buffer, CursorPos);        break;
-        case KEY_DELETE:
-            if (Buffer.insert_mode)
-                buf_del(&Buffer, CursorPos);
-            break;
-
+        case KEY_F6:     ColorBase = !ColorBase;                         break;
+        case KEY_UP:     CursorPos = buf_byline(&Buffer, CursorPos, -1); break;
+        case KEY_DOWN:   CursorPos = buf_byline(&Buffer, CursorPos, 1);  break;
+        case KEY_LEFT:   CursorPos = buf_byrune(&Buffer, CursorPos, -1); break;
+        case KEY_RIGHT:  CursorPos = buf_byrune(&Buffer, CursorPos, 1);  break;
+        case KEY_INSERT: Buffer.insert_mode = !Buffer.insert_mode;       break; /* same toggle as KEY_F1 below */
+        case KEY_F1:     Buffer.insert_mode = !Buffer.insert_mode;       break;
+        case KEY_DELETE: buf_del(&Buffer, CursorPos);                    break; /* no longer conditional on Buffer.insert_mode */
+        case KEY_HOME:   CursorPos = buf_bol(&Buffer, CursorPos);        break;
+        case KEY_END:    CursorPos = buf_eol(&Buffer, CursorPos);        break;
      }
  }
  
diff --git a/tests/tests.c b/tests/tests.c

index 0f066e959edd0dbeb84ade6a986c28012751ae80..228c73076f014600f88d2d9b9643f1276a7f9d6d 100644 (file)
--- a/tests/tests.c
+++ b/tests/tests.c
@@ -5,6 +5,11 @@
  Buf Buffer;
  unsigned CursorPos;
  
+void die(char* m) { /* Test-build stub: discards the message and RETURNS, so code after a die() call keeps running under test. */
+    (void)m;
+}
+
+
  int main(int argc, char** argv) {
      atf_init(argc,argv);
      RUN_EXTERN_TEST_SUITE(BufferTests);
author	Michael D. Lowis <mike@mdlowis.com>
	Sat, 8 Oct 2016 04:04:17 +0000 (00:04 -0400)
committer	Michael D. Lowis <mike@mdlowis.com>
	Sat, 8 Oct 2016 04:04:17 +0000 (00:04 -0400)
Makefile		patch \| blob \| history
buf.c		patch \| blob \| history
charset.c	[new file with mode: 0644]	patch \| blob
edit.h		patch \| blob \| history
foo	[deleted file]	patch \| blob \| history
keyboard.c		patch \| blob \| history
tests/tests.c		patch \| blob \| history