diff --git a/webif/lib/xml.class b/webif/lib/xml.class
index 1a4518e..7cde513 100644
--- a/webif/lib/xml.class
+++ b/webif/lib/xml.class
@@ -1,4 +1,6 @@
# Simple XML parser
+# From https://wiki.tcl-lang.org/page/Parsing+XML
+# Keith Vetter 2004-03-01
if {![exists -proc class]} { package require oo }
@@ -12,7 +14,7 @@ proc {xml init} {xml} {
regsub -all {} $xml {} xml
return [xml new [list xml [string trim $xml] loc 0]]
}
-
+
xml method next {{peek 0}} {
set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \
$xml all txt stok tok etok]
@@ -24,15 +26,18 @@ xml method next {{peek 0}} {
lassign $stok stok0 stok1
lassign $tok tok0 tok1
lassign $etok etok0 etok1
-
+
if {$txt1 >= $txt0} {
set txt [string range $xml $txt0 $txt1]
if {!$peek} {
set loc [expr {$txt1 + 1}]
}
+ if {$txt ne ""} {
+ set txt [$self decodeCharEntities $txt]
+ }
return [list TXT $txt]
}
-
+
set token [string range $xml $tok0 $tok1]
if {!$peek} {
set loc [expr {$all1 + 1}]
@@ -40,7 +45,7 @@ xml method next {{peek 0}} {
if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} {
return [list TXT $txt]
}
-
+
# Check for Processing Instruction ...?>
set type XML
if {[regexp {^\?(.*)\?$} $token => token]} {
@@ -61,3 +66,204 @@ xml method next {{peek 0}} {
return [list $type $token $attr $etype]
}
+xml method reset {} {
+ set loc 0
+}
+
+# ::XML::IsWellFormed
+# checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210)
+#
+# Returns "" if well-formed, error message otherwise
+# missing:
+# characters: doesn't check valid extended characters
+# attributes: doesn't check anything: quotes, equals, unique, etc.
+# text stuff: references, entities, parameters, etc.
+# doctype internal stuff
+#
+xml method _NextToken {{peek 0}} {
+ set result [$self next $peek]
+ if { $result eq "EOF" } {
+ return [list $result]
+ }
+ return $result
+}
+
+xml method isWellFormed {} {
+ set result [$self _isWellFormed]
+ $self reset
+ return $result
+}
+
+xml method _isWellFormed {} {
+ array set emsg {
+ XMLDECLFIRST "The XML declaration must come first"
+ MULTIDOCTYPE "Only one DOCTYPE is allowed"
+ INVALID "Invalid document structure"
+ MISMATCH "Ending tag '$val' doesn't match starting tag"
+ BADELEMENT "Bad element name '$val'"
+ EOD "Only processing instructions allowed at end of document"
+ BADNAME "Bad name '$val'"
+ BADPI "No processing instruction starts with 'xml'"
+ }
+
+ # [1] document ::= prolog element Misc*
+ # [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+ # [27] Misc ::= Comment | PI | S
+ # [28] doctypedecl ::=
+ # [16] PI ::= Name ...?>
+ # 1 xml, 2 pi, 4 doctype
+ set seen 0
+ while {1} {
+ foreach {type val attr etype} [$self _NextToken] break
+ if {$type eq "PI"} {
+ if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
+ return [subst $emsg(BADNAME)]
+ }
+ # XMLDecl
+ if {$val eq "xml"} {
+ if {$seen != 0} { return $emsg(XMLDECLFIRST) }
+ # TODO: check version number exists and only encoding and
+ # standalone attributes are allowed
+ # Mark as seen XMLDecl
+ incr seen
+ continue
+ }
+ if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)}
+ # Mark as seen PI
+ set seen [expr {$seen | 2}]
+ continue
+ } elseif {$type eq "XML" && $val eq "!DOCTYPE"} {
+ # Doctype
+ if {$seen & 4} { return $emsg(MULTIDOCTYPE) }
+ set seen [expr {$seen | 4}]
+ continue
+ }
+ break
+ }
+
+ # [39] element ::= EmptyElemTag | STag content ETag
+ # [40] STag ::= < Name (S Attribute)* S? >
+ # [42] ETag ::= Name S? >
+ # [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
+ # [44] EmptyElemTag ::= < Name (S Attribute)* S? />
+ #
+
+ set stack {}
+ set first 1
+ while {1} {
+ # Skip first time in
+ if {! $first} {
+ foreach {type val attr etype} [$self _NextToken] break
+ } else {
+ if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) }
+ set first 0
+ }
+
+ if {$type eq "EOF"} break
+ # TODO: check attributes: quotes, equals and unique
+
+ if {$type eq "TXT"} continue
+ if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
+ return [subst $emsg(BADNAME)]
+ }
+
+ if {$type eq "PI"} {
+ if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
+ continue
+ }
+ if {$etype eq "START"} {
+ # Starting tag
+ lappend stack $val
+ } elseif {$etype eq "END"} {
+ #
+ if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] }
+ set stack [lrange $stack 0 end-1]
+ # Empty stack
+ if {[llength $stack] == 0} break
+ } elseif {$etype eq "EMPTY"} {
+ #
+ }
+ }
+
+ # End-of-Document can only contain processing instructions
+ while {1} {
+ foreach {type val attr etype} [$self _NextToken] break
+ if {$type eq "EOF"} break
+ if {$type eq "PI"} {
+ if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
+ continue
+ }
+ return $emsg(EOD)
+ }
+ return ""
+ }
+
+
+# Transform some characters that might be sent as SGML entities.
+# Initially &<>, but add some others (sent by TheTVDB.com?).
+
+proc {xml decodeCharEntities} {xmlText} {
+ # Entity names are case-sensitive but HTML5 adds AMP, etc;
+ # other syntaxes (eg hhhh;) aren't. Here treat all entity
+ # names as case-insensitive at the risk of transforming an
+ # illegal &APOS;, eg.
+
+ # XML "predefined entities"
+ set mapping {
+ & &
+ < <
+ > >
+ ' "'"
+ " "\""
+ }
+
+ # This parser
+ # could be used to import the HTML5 entity list from the JSON file at
+ # instead of the above.
+ # But this is XML without any additional DTD.
+
+ # add any numeric character entity in the input to the mapping
+ foreach {e _e xnum num} \
+ [regexp -all -inline \
+ {&((#[xX][0-9a-fA-F]+)|(#[0-9]+)|[[:alnum:]]+);} \
+ $xmlText] {
+ # if the hex/dec code is used ...
+ set code -1
+ if { $xnum ne "" } {
+ scan $xnum "#%*c%x" code
+ } elseif { $num ne "" } {
+ scan $num "#%u" code
+ }
+ # ... and not already mapped ...
+ if { $code >= 0 && [lsearch -nocase $mapping $e] < 0 } {
+ # ... add the entity and its translation
+ lappend mapping $e [format "%c" $code]
+ }
+ }
+ # substitute & last in case of eg "xxx"yyy"
+ return [string map -nocase \
+ [lrange $mapping 0 1] \
+ [string map -nocase [lrange $mapping 2 end] $xmlText]]
+}
+
+# test parsing XML; default from tvdb.class with added char entity goodness
+proc {xml test} \
+ {{testXml {
+
+ 261451
+ en
+ Teenage Mutant Ninja Turtles (2012)
+ graphical/261451-g2.jpg
+ Four ninja turtles, mutated by a mysterious alien substance, must rise up out of the sewers & defend their city against evil forces from both the past and present.
+ 2012-09-29
+ Nickelodeon
+ 261451
+
+ }}} {
+ set testXml [xml init $testXml]
+ for {set type ""} {$type ne "EOF"} {} {
+ foreach {type val attr etype} [$testXml _NextToken] break
+ puts "looking at: $type '$val' '$attr' '$etype'"
+ }
+}
+