Do not convert existing UTF-8 sequences
This commit is contained in:
parent
5927a5e9ca
commit
4002df3b86
1
Makefile
1
Makefile
@ -3,6 +3,7 @@ MAKE=make
|
|||||||
|
|
||||||
CFLAGS=-g
|
CFLAGS=-g
|
||||||
DEFS=-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D_GNU_SOURCE
|
DEFS=-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D_GNU_SOURCE
|
||||||
|
#WARN=-Wall -Wunused
|
||||||
|
|
||||||
all: lib test
|
all: lib test
|
||||||
|
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
# libxconv
|
# libxconv
|
||||||
Lightweight ISO-6937 to UTF-8 conversion library.
|
Lightweight ISO-6937 to UTF-8 conversion library.
|
||||||
|
|
||||||
|
14
test.c
14
test.c
@ -1,12 +1,17 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "charset.h"
|
#include <ctype.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "xconv.h"
|
||||||
|
|
||||||
char *teststr[] = {
|
char *teststr[] = {
|
||||||
"The House That \243100k Built",
|
"The House That \243100k Built",
|
||||||
"home d\302ecor",
|
"home d\302ecor",
|
||||||
"home made p\303at\302e",
|
"home made p\303at\302e",
|
||||||
"scene at \251Odette\271 that",
|
"scene at \251Odette\271 that",
|
||||||
|
"Sealgairean Sp\xc3\xacrsail/History Hunters",
|
||||||
|
"Ben & Hoilidh san R\xc3\xb2oghachd Bhig",
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -54,9 +59,10 @@ main()
|
|||||||
{
|
{
|
||||||
hexdump(teststr[i], 0);
|
hexdump(teststr[i], 0);
|
||||||
|
|
||||||
xconv(teststr[i], buf, sizeof(buf));
|
if (xconv(teststr[i], buf, sizeof(buf)))
|
||||||
|
hexdump(buf, 0);
|
||||||
hexdump(buf, 0);
|
else
|
||||||
|
printf("Unchanged.\n");
|
||||||
|
|
||||||
printf("\n\n");
|
printf("\n\n");
|
||||||
}
|
}
|
||||||
|
42
xconv.c
42
xconv.c
@ -37,15 +37,52 @@ add_unicode(char **d, size_t *len, uint16_t u)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Report whether s points at the start of a UTF-8 multi-byte sequence.
//
// s must point into a NUL-terminated string. The lead byte selects how
// many continuation bytes (10xxxxxx) have to follow; every one of them
// must be present. Returns 1 when the lead byte and all of its
// continuation bytes match, 0 otherwise (including plain ASCII).
//
// Lead bytes that can never occur in UTF-8 are rejected: 0xC0 and 0xC1
// would only encode overlong forms of ASCII, and 0xF5 and above would
// encode values beyond U+10FFFF. Such bytes are therefore never reported
// as UTF-8 here.
static int
is_utf_sequence(const char *s)
{
    // Work on unsigned bytes so range comparisons do not depend on
    // whether plain char is signed on this platform.
    const unsigned char *p = (const unsigned char *)s;
    int utf8_len;
    int i;

    if (p[0] >= 0xc2 && p[0] <= 0xdf)       // U+0080+   110xxxxx
        utf8_len = 1;
    else if ((p[0] & 0xf0) == 0xe0)         // U+0800+   1110xxxx
        utf8_len = 2;
    else if (p[0] >= 0xf0 && p[0] <= 0xf4)  // U+10000+  11110xxx
        utf8_len = 3;
    else
        return 0;

    for (i = 1; i <= utf8_len; i++)
    {
        // UTF-8 continuation character 10xxxxxx.
        // A terminating NUL fails this test, so the loop stops at the
        // end of the string and never reads past it.
        if ((p[i] & 0xc0) != 0x80)
            return 0;   // Invalid UTF-8 sequence
    }
    return 1;
}
|
||||||
|
|
||||||
int
|
int
|
||||||
xconv(char *src, char *dst, size_t dstlen)
|
xconv(char *src, char *dst, size_t dstlen)
|
||||||
{
|
{
|
||||||
size_t len = dstlen;
|
size_t len = dstlen - 1;
|
||||||
char *s, *d;
|
char *s, *d;
|
||||||
|
int changed = 0;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (s = src, d = dst; *s && len > 0; s++)
|
for (s = src, d = dst; *s && len > 0; s++)
|
||||||
{
|
{
|
||||||
|
if (!(*s & 0x80))
|
||||||
|
{
|
||||||
|
*d++ = *s;
|
||||||
|
len--;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_utf_sequence(s))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
changed = 1;
|
||||||
|
|
||||||
// Check for combined character.
|
// Check for combined character.
|
||||||
if ((*s & 0xf0) == 0xc0 && s[1])
|
if ((*s & 0xf0) == 0xc0 && s[1])
|
||||||
{
|
{
|
||||||
@ -66,6 +103,9 @@ xconv(char *src, char *dst, size_t dstlen)
|
|||||||
|
|
||||||
add_unicode(&d, &len, iso6937_map[*s & 0xff]);
|
add_unicode(&d, &len, iso6937_map[*s & 0xff]);
|
||||||
}
|
}
|
||||||
|
if (!changed)
|
||||||
|
return 0;
|
||||||
|
|
||||||
*d = '\0';
|
*d = '\0';
|
||||||
return dstlen - len;
|
return dstlen - len;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user