From 68c33866cd283542b8962bdfec27e5c4a26bffd5 Mon Sep 17 00:00:00 2001 From: df Date: Thu, 14 Jan 2021 23:55:08 +0000 Subject: [PATCH 1/3] Add more decoding of XML character entities --- tvdb.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tvdb.c b/tvdb.c index 7e68e56..5210bed 100644 --- a/tvdb.c +++ b/tvdb.c @@ -48,26 +48,32 @@ struct episode { void unescape(char *txt) { - char *p = txt; int l = strlen(txt); + char *p = txt; - while ((p = strchr(p, '&'))) + for (; p = strchr(p, '&'); p++) { - HANDLE("&quot;", '"', 5); - HANDLE("&amp;", '&', 4); - HANDLE("&#10;", '\n', 4); - HANDLE("&#13;", '\r', 4); - p++; + size_t ll; + unsigned char icode; + if (1 == sscanf( p, "&#%hhu;%n", &icode, &ll) || + 1 == sscanf( p, "&#%*[xX]%hhx;%n", &icode, &ll)) { + /* &#x;, &#; */ + HANDLE(p, (char)icode, ll-1); + } else { + HANDLE("&amp;", '&', 4); + HANDLE("&quot;", '"', 5); + HANDLE("&apos;", '\'', 5); + HANDLE("&lt;", '<', 3); + HANDLE("&gt;", '>', 3); + } } - p = txt; - while ((p = memchr(p, '\xe2', l - (p - txt)))) - { + for (p = txt; p = memchr(p, '\xe2', l - (p - txt)); p++) + { /* curly apostrophe, en dash, curly quotes */ HANDLE("\xe2\x80\x99", '\'', 2); HANDLE("\xe2\x80\x93", '-', 2); HANDLE("\xe2\x80\x9c", '"', 2); HANDLE("\xe2\x80\x9d", '"', 2); - p++; } if ((p = strpbrk(txt, "\n\r"))) From 5aeab9da9d1428ee484351f048573c28218c7b22 Mon Sep 17 00:00:00 2001 From: df Date: Fri, 15 Jan 2021 00:31:20 +0000 Subject: [PATCH 2/3] Build without warnings on-box --- Makefile | 2 +- tvdb.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index aebbbe4..b04f625 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ HDRS= OBJS= $(SRCS:.c=.o) CC=gcc #CC=mipsel-linux-gcc -CFLAGS=-g +CFLAGS=-g -std=c99 -D_XOPEN_SOURCE=700 INCS= LIBS=-lsqlite3 WARN=-pedantic -Wall -W -Wnested-externs -Wpointer-arith -Wno-long-long diff --git a/tvdb.c b/tvdb.c index 5210bed..53bb8a5 100644 --- a/tvdb.c +++ b/tvdb.c @@ -48,12 +48,12 @@ struct 
episode { void unescape(char *txt) { - int l = strlen(txt); char *p = txt; + int l = strlen(txt); - for (; p = strchr(p, '&'); p++) + for (; (p = strchr(p, '&')); p++) { - size_t ll; + int ll; unsigned char icode; if (1 == sscanf( p, "&#%hhu;%n", &icode, &ll) || 1 == sscanf( p, "&#%*[xX]%hhx;%n", &icode, &ll)) { @@ -68,7 +68,7 @@ unescape(char *txt) } } - for (p = txt; p = memchr(p, '\xe2', l - (p - txt)); p++) + for (p = txt; (p = memchr(p, '\xe2', l - (p - txt))); p++) { /* curly apostrophe, en dash, curly quotes */ HANDLE("\xe2\x80\x99", '\'', 2); HANDLE("\xe2\x80\x93", '-', 2); From cf86aaebca5d69b8d6f87cb0be6ea710bf5b94f7 Mon Sep 17 00:00:00 2001 From: df Date: Fri, 15 Jan 2021 00:58:18 +0000 Subject: [PATCH 3/3] Ensure ; immediately follows the code in &#; --- tvdb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tvdb.c b/tvdb.c index 53bb8a5..ab3922c 100644 --- a/tvdb.c +++ b/tvdb.c @@ -53,10 +53,12 @@ unescape(char *txt) for (; (p = strchr(p, '&')); p++) { - int ll; + int ll = 0; unsigned char icode; - if (1 == sscanf( p, "&#%hhu;%n", &icode, &ll) || - 1 == sscanf( p, "&#%*[xX]%hhx;%n", &icode, &ll)) { + /* sscanf -> 1: the code was read; ll>0: ';' came next */ + if ((1 == sscanf( p, "&#%hhu;%n", &icode, &ll) || + 1 == sscanf( p, "&#%*[xX]%hhx;%n", &icode, &ll)) && + ll > 0) { /* &#x;, &#; */ HANDLE(p, (char)icode, ll-1); } else {