Do not convert existing UTF-8 sequences

This commit is contained in:
HummyPkg 2017-03-09 19:32:28 +00:00
parent 5927a5e9ca
commit 4002df3b86
4 changed files with 53 additions and 5 deletions

View File

@ -3,6 +3,7 @@ MAKE=make
CFLAGS=-g CFLAGS=-g
DEFS=-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D_GNU_SOURCE DEFS=-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D_GNU_SOURCE
#WARN=-Wall -Wunused
all: lib test all: lib test

View File

@ -1,2 +1,3 @@
# libxconv # libxconv
Lightweight ISO-6937 to UTF-8 conversion library. Lightweight ISO-6937 to UTF-8 conversion library.

14
test.c
View File

@ -1,12 +1,17 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include "charset.h" #include <ctype.h>
#include <stdint.h>
#include "xconv.h"
char *teststr[] = { char *teststr[] = {
"The House That \243100k Built", "The House That \243100k Built",
"home d\302ecor", "home d\302ecor",
"home made p\303at\302e", "home made p\303at\302e",
"scene at \251Odette\271 that", "scene at \251Odette\271 that",
"Sealgairean Sp\xc3\xacrsail/History Hunters",
"Ben & Hoilidh san R\xc3\xb2oghachd Bhig",
NULL NULL
}; };
@ -54,9 +59,10 @@ main()
{ {
hexdump(teststr[i], 0); hexdump(teststr[i], 0);
xconv(teststr[i], buf, sizeof(buf)); if (xconv(teststr[i], buf, sizeof(buf)))
hexdump(buf, 0);
hexdump(buf, 0); else
printf("Unchanged.\n");
printf("\n\n"); printf("\n\n");
} }

42
xconv.c
View File

@ -37,15 +37,52 @@ add_unicode(char **d, size_t *len, uint16_t u)
return 0; return 0;
} }
/*
 * Test whether the bytes starting at s look like a UTF-8 multi-byte
 * sequence.
 *
 * The first byte is classified as the lead byte of a 2-, 3- or 4-byte
 * sequence (110xxxxx, 1110xxxx or 11110xxx); every byte that must follow
 * it is then required to be a continuation byte (10xxxxxx).
 *
 * Only the bit patterns are examined - overlong forms and out-of-range
 * code points are not rejected.
 *
 * Returns 1 if the pattern matches, 0 otherwise (including when the first
 * byte is plain ASCII, a stray continuation byte or the terminating NUL).
 */
static int
is_utf_sequence(char *s)
{
	int lead = *s & 0xff;	/* first byte as an unsigned value */
	int trailing;		/* continuation bytes still expected */
	char *p;

	if (lead >= 0xc0 && lead <= 0xdf)	/* U+0080+   110xxxxx */
		trailing = 1;
	else if (lead >= 0xe0 && lead <= 0xef)	/* U+0800+   1110xxxx */
		trailing = 2;
	else if (lead >= 0xf0 && lead <= 0xf7)	/* U+10000+  11110xxx */
		trailing = 3;
	else
		return 0;

	/*
	 * A NUL byte fails the continuation test, so the walk stops at the
	 * end of the string instead of reading beyond it.
	 */
	for (p = s + 1; trailing > 0; p++, trailing--)
	{
		if ((*p & 0xc0) != 0x80)
			return 0;	/* not a continuation byte */
	}

	return 1;
}
int int
xconv(char *src, char *dst, size_t dstlen) xconv(char *src, char *dst, size_t dstlen)
{ {
size_t len = dstlen; size_t len = dstlen - 1;
char *s, *d; char *s, *d;
int changed = 0;
int i; int i;
for (s = src, d = dst; *s && len > 0; s++) for (s = src, d = dst; *s && len > 0; s++)
{ {
if (!(*s & 0x80))
{
*d++ = *s;
len--;
continue;
}
if (is_utf_sequence(s))
return 0;
changed = 1;
// Check for combined character. // Check for combined character.
if ((*s & 0xf0) == 0xc0 && s[1]) if ((*s & 0xf0) == 0xc0 && s[1])
{ {
@ -66,6 +103,9 @@ xconv(char *src, char *dst, size_t dstlen)
add_unicode(&d, &len, iso6937_map[*s & 0xff]); add_unicode(&d, &len, iso6937_map[*s & 0xff]);
} }
if (!changed)
return 0;
*d = '\0'; *d = '\0';
return dstlen - len; return dstlen - len;
} }