Do not convert existing UTF-8 sequences
This commit is contained in:
parent
5927a5e9ca
commit
4002df3b86
1
Makefile
1
Makefile
@ -3,6 +3,7 @@ MAKE=make
|
||||
|
||||
CFLAGS=-g
|
||||
DEFS=-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D_GNU_SOURCE
|
||||
#WARN=-Wall -Wunused
|
||||
|
||||
all: lib test
|
||||
|
||||
|
12
test.c
12
test.c
@ -1,12 +1,17 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "charset.h"
|
||||
#include <ctype.h>
|
||||
#include <stdint.h>
|
||||
#include "xconv.h"
|
||||
|
||||
/*
 * NULL-terminated list of sample strings; main() hexdumps each entry and
 * passes it to xconv().
 *
 * The first four entries embed single 8-bit bytes written as octal escapes
 * (\243, \302, \303, \251, \271), each followed directly by an ASCII
 * character.  The last two entries embed the byte pairs \xc3\xac and
 * \xc3\xb2, i.e. a 110xxxxx lead byte followed by a 10xxxxxx continuation
 * byte, which is the shape of a two-byte UTF-8 sequence.
 */
char *teststr[] = {
|
||||
"The House That \243100k Built",
|
||||
"home d\302ecor",
|
||||
"home made p\303at\302e",
|
||||
"scene at \251Odette\271 that",
|
||||
"Sealgairean Sp\xc3\xacrsail/History Hunters",
|
||||
"Ben & Hoilidh san R\xc3\xb2oghachd Bhig",
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -54,9 +59,10 @@ main()
|
||||
{
|
||||
hexdump(teststr[i], 0);
|
||||
|
||||
xconv(teststr[i], buf, sizeof(buf));
|
||||
|
||||
if (xconv(teststr[i], buf, sizeof(buf)))
|
||||
hexdump(buf, 0);
|
||||
else
|
||||
printf("Unchanged.\n");
|
||||
|
||||
printf("\n\n");
|
||||
}
|
||||
|
42
xconv.c
42
xconv.c
@ -37,15 +37,52 @@ add_unicode(char **d, size_t *len, uint16_t u)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Report whether the bytes starting at s form a UTF-8 multi-byte sequence.
 *
 * s must point into a NUL-terminated string.  The function inspects the
 * lead byte to determine how many continuation bytes (10xxxxxx) must
 * follow, then verifies that each of them is present.  A NUL byte is never
 * a continuation byte, so a sequence truncated by the end of the string is
 * rejected before any read past the terminator can happen.
 *
 * Lead bytes that can never begin well-formed UTF-8 are rejected:
 *   0xC0, 0xC1  - could only produce overlong encodings of U+0000..U+007F
 *   0xF5..0xFF  - would encode code points beyond U+10FFFF (or are not
 *                 lead bytes at all)
 * The finer second-byte range restrictions for lead bytes 0xE0, 0xED, 0xF0
 * and 0xF4 are not checked here.
 *
 * Returns 1 if a multi-byte sequence starts at s, 0 otherwise (including
 * when *s is plain ASCII or a stray continuation byte).
 */
static int
is_utf_sequence(const char *s)
{
	/* Work on unsigned bytes so comparisons do not depend on the
	 * implementation-defined signedness of plain char. */
	const unsigned char *p = (const unsigned char *)s;
	int trailing;	/* number of continuation bytes expected */
	int i;

	if (p[0] >= 0xc2 && p[0] <= 0xdf)	// U+0080+   110xxxxx
		trailing = 1;
	else if ((p[0] & 0xf0) == 0xe0)		// U+0800+   1110xxxx
		trailing = 2;
	else if (p[0] >= 0xf0 && p[0] <= 0xf4)	// U+10000+  11110xxx
		trailing = 3;
	else
		return 0;

	for (i = 1; i <= trailing; i++)
	{
		// UTF-8 continuation character 10xxxxxx
		if ((p[i] & 0xc0) != 0x80)
			// Invalid (or truncated) UTF-8 sequence
			return 0;
	}

	return 1;
}
|
||||
|
||||
int
|
||||
xconv(char *src, char *dst, size_t dstlen)
|
||||
{
|
||||
size_t len = dstlen;
|
||||
size_t len = dstlen - 1;
|
||||
char *s, *d;
|
||||
int changed = 0;
|
||||
int i;
|
||||
|
||||
for (s = src, d = dst; *s && len > 0; s++)
|
||||
{
|
||||
if (!(*s & 0x80))
|
||||
{
|
||||
*d++ = *s;
|
||||
len--;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (is_utf_sequence(s))
|
||||
return 0;
|
||||
|
||||
changed = 1;
|
||||
|
||||
// Check for combined character.
|
||||
if ((*s & 0xf0) == 0xc0 && s[1])
|
||||
{
|
||||
@ -66,6 +103,9 @@ xconv(char *src, char *dst, size_t dstlen)
|
||||
|
||||
add_unicode(&d, &len, iso6937_map[*s & 0xff]);
|
||||
}
|
||||
if (!changed)
|
||||
return 0;
|
||||
|
||||
*d = '\0';
|
||||
return dstlen - len;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user