/*
 * Test whether the bytes starting at s form one well-formed multi-byte
 * UTF-8 sequence (two to four bytes long).
 *
 * s must point into a NUL-terminated string.  The scan stops at the first
 * byte that does not fit, and a NUL never fits, so nothing past the
 * terminator is read.
 *
 * Returns 1 for a well-formed sequence, 0 otherwise.  Plain ASCII bytes,
 * stray continuation bytes and truncated sequences all yield 0.
 */
static int
is_utf_sequence(const char *s)
{
	// Work on unsigned bytes so range comparisons do not depend on the
	// platform's char signedness.
	const unsigned char *p = (const unsigned char *)s;
	unsigned char lead = p[0];
	// Permitted range for the first continuation byte; narrowed below for
	// the lead bytes whose full range would admit overlong forms,
	// surrogates or code points above U+10FFFF.
	unsigned char lo = 0x80;
	unsigned char hi = 0xbf;
	int trail;
	int i;

	if (lead >= 0xc2 && lead <= 0xdf)	// U+0080+ 110xxxxx
	{
		// 0xc0 and 0xc1 are excluded: they could only encode
		// overlong forms of U+0000..U+007F.
		trail = 1;
	}
	else if (lead >= 0xe0 && lead <= 0xef)	// U+0800+ 1110xxxx
	{
		trail = 2;
		if (lead == 0xe0)
			lo = 0xa0;		// below this is overlong
		else if (lead == 0xed)
			hi = 0x9f;		// above this is a surrogate
	}
	else if (lead >= 0xf0 && lead <= 0xf4)	// U+10000+ 11110xxx
	{
		// 0xf5..0xf7 are excluded: they would exceed U+10FFFF.
		trail = 3;
		if (lead == 0xf0)
			lo = 0x90;		// below this is overlong
		else if (lead == 0xf4)
			hi = 0x8f;		// above this exceeds U+10FFFF
	}
	else
	{
		return 0;
	}

	if (p[1] < lo || p[1] > hi)
	{
		// Invalid UTF-8 sequence (also catches a NUL terminator)
		return 0;
	}

	for (i = 2; i <= trail; i++)
	{
		// UTF-8 continuation character 10xxxxxx
		if ((p[i] & 0xc0) != 0x80)
		{
			// Invalid UTF-8 sequence
			return 0;
		}
	}
	return 1;
}
+ // Check for combined character. if ((*s & 0xf0) == 0xc0 && s[1]) { @@ -66,6 +103,9 @@ xconv(char *src, char *dst, size_t dstlen) add_unicode(&d, &len, iso6937_map[*s & 0xff]); } + if (!changed) + return 0; + *d = '\0'; return dstlen - len; }