diff --git a/webif/lib/xml.class b/webif/lib/xml.class index 1a4518e..7cde513 100644 --- a/webif/lib/xml.class +++ b/webif/lib/xml.class @@ -1,4 +1,6 @@ # Simple XML parser +# From https://wiki.tcl-lang.org/page/Parsing+XML +# Keith Vetter 2004-03-01 if {![exists -proc class]} { package require oo } @@ -12,7 +14,7 @@ proc {xml init} {xml} { regsub -all {} $xml {} xml return [xml new [list xml [string trim $xml] loc 0]] } - + xml method next {{peek 0}} { set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \ $xml all txt stok tok etok] @@ -24,15 +26,18 @@ xml method next {{peek 0}} { lassign $stok stok0 stok1 lassign $tok tok0 tok1 lassign $etok etok0 etok1 - + if {$txt1 >= $txt0} { set txt [string range $xml $txt0 $txt1] if {!$peek} { set loc [expr {$txt1 + 1}] } + if {$txt ne ""} { + set txt [$self decodeCharEntities $txt] + } return [list TXT $txt] } - + set token [string range $xml $tok0 $tok1] if {!$peek} { set loc [expr {$all1 + 1}] @@ -40,7 +45,7 @@ xml method next {{peek 0}} { if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} { return [list TXT $txt] } - + # Check for Processing Instruction set type XML if {[regexp {^\?(.*)\?$} $token => token]} { @@ -61,3 +66,204 @@ xml method next {{peek 0}} { return [list $type $token $attr $etype] } +xml method reset {} { + set loc 0 +} + +# ::XML::IsWellFormed +# checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210) +# +# Returns "" if well-formed, error message otherwise +# missing: +# characters: doesn't check valid extended characters +# attributes: doesn't check anything: quotes, equals, unique, etc. +# text stuff: references, entities, parameters, etc. +# doctype internal stuff +# +xml method _NextToken {{peek 0}} { + set result [$self next $peek] + if { $result eq "EOF" } { + return [list $result] + } + return $result +} + +xml method isWellFormed {} { + set result [$self _isWellFormed] + $self reset + return $result +} + +xml method _isWellFormed {} { + array set emsg { + XMLDECLFIRST "The XML declaration must come first" + MULTIDOCTYPE "Only one DOCTYPE is allowed" + INVALID "Invalid document structure" + MISMATCH "Ending tag '$val' doesn't match starting tag" + BADELEMENT "Bad element name '$val'" + EOD "Only processing instructions allowed at end of document" + BADNAME "Bad name '$val'" + BADPI "No processing instruction starts with 'xml'" + } + + # [1] document ::= prolog element Misc* + # [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? + # [27] Misc ::= Comment | PI | S + # [28] doctypedecl ::= + # [16] PI ::= + # 1 xml, 2 pi, 4 doctype + set seen 0 + while {1} { + foreach {type val attr etype} [$self _NextToken] break + if {$type eq "PI"} { + if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} { + return [subst $emsg(BADNAME)] + } + # XMLDecl + if {$val eq "xml"} { + if {$seen != 0} { return $emsg(XMLDECLFIRST) } + # TODO: check version number exists and only encoding and + # standalone attributes are allowed + # Mark as seen XMLDecl + incr seen + continue + } + if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)} + # Mark as seen PI + set seen [expr {$seen | 2}] + continue + } elseif {$type eq "XML" && $val eq "!DOCTYPE"} { + # Doctype + if {$seen & 4} { return $emsg(MULTIDOCTYPE) } + set seen [expr {$seen | 4}] + continue + } + break + } + + # [39] element ::= EmptyElemTag | STag content ETag + # [40] STag ::= < Name (S Attribute)* S? > + # [42] ETag ::= + # [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* + # [44] EmptyElemTag ::= < Name (S Attribute)* S? /> + # + + set stack {} + set first 1 + while {1} { + # Skip first time in + if {! $first} { + foreach {type val attr etype} [$self _NextToken] break + } else { + if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) } + set first 0 + } + + if {$type eq "EOF"} break + # TODO: check attributes: quotes, equals and unique + + if {$type eq "TXT"} continue + if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} { + return [subst $emsg(BADNAME)] + } + + if {$type eq "PI"} { + if {[string equal -nocase xml $val]} { return $emsg(BADPI) } + continue + } + if {$etype eq "START"} { + # Starting tag + lappend stack $val + } elseif {$etype eq "END"} { + # + if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] } + set stack [lrange $stack 0 end-1] + # Empty stack + if {[llength $stack] == 0} break + } elseif {$etype eq "EMPTY"} { + # + } + } + + # End-of-Document can only contain processing instructions + while {1} { + foreach {type val attr etype} [$self _NextToken] break + if {$type eq "EOF"} break + if {$type eq "PI"} { + if {[string equal -nocase xml $val]} { return $emsg(BADPI) } + continue + } + return $emsg(EOD) + } + return "" + } + + +# Transform some characters that might be sent as SGML entities. +# Initially &<>, but add some others (sent by TheTVDB.com?). + +proc {xml decodeCharEntities} {xmlText} { + # Entity names are case-sensitive but HTML5 adds AMP, etc; + # other syntaxes (eg &#xhhhh;) aren't. Here treat all entity + # names as case-insensitive at the risk of transforming an + # illegal &APOS;, eg. + + # XML "predefined entities" + set mapping { + & & + < < + > > + ' "'" + " "\"" + } + + # This parser + # could be used to import the HTML5 entity list from the JSON file at + # instead of the above. + # But this is XML without any additional DTD. + + # add any numeric character entity in the input to the mapping + foreach {e _e xnum num} \ + [regexp -all -inline \ + {&((#[xX][0-9a-fA-F]+)|(#[0-9]+)|[[:alnum:]]+);} \ + $xmlText] { + # if the hex/dec code is used ... + set code -1 + if { $xnum ne "" } { + scan $xnum "#%*c%x" code + } elseif { $num ne "" } { + scan $num "#%u" code + } + # ... and not already mapped ... + if { $code >= 0 && [lsearch -nocase $mapping $e] < 0 } { + # ... add the entity and its translation + lappend mapping $e [format "%c" $code] + } + } + # substitute & last in case of eg "xxx&quot;yyy" + return [string map -nocase \ + [lrange $mapping 0 1] \ + [string map -nocase [lrange $mapping 2 end] $xmlText]] +} + +# test parsing XML; default from tvdb.class with added char entity goodness +proc {xml test} \ + {{testXml { + + 261451 + en + Teenage Mutant Ninja Turtles (2012) + graphical/261451-g2.jpg + Four ninja turtles, mutated by a mysterious alien substance, must rise up out of the sewers & defend their city against evil forces from both the past and present. + 2012-09-29 + Nickelodeon + 261451 + + }}} { + set testXml [xml init $testXml] + for {set type ""} {$type ne "EOF"} {} { + foreach {type val attr etype} [$testXml _NextToken] break + puts "looking at: $type '$val' '$attr' '$etype'" + } +} +