# Simple XML parser # From https://wiki.tcl-lang.org/page/Parsing+XML # Keith Vetter 2004-03-01 if {![exists -proc class]} { package require oo } class xml { xml "" loc 0 } proc {xml init} {xml} { # Remove all XML comments regsub -all {} $xml {} xml return [xml new [list xml [string trim $xml] loc 0]] } xml method next {{peek 0}} { set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \ $xml all txt stok tok etok] if {!$n} { return "EOF" } lassign $all all0 all1 lassign $txt txt0 txt1 lassign $stok stok0 stok1 lassign $tok tok0 tok1 lassign $etok etok0 etok1 if {$txt1 >= $txt0} { set txt [string range $xml $txt0 $txt1] if {!$peek} { set loc [expr {$txt1 + 1}] } if {$txt ne ""} { set txt [$self decodeCharEntities $txt] } return [list TXT $txt] } set token [string range $xml $tok0 $tok1] if {!$peek} { set loc [expr {$all1 + 1}] } if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} { return [list TXT $txt] } # Check for Processing Instruction set type XML if {[regexp {^\?(.*)\?$} $token => token]} { set type PI } set attr "" regexp {^(.*?)\s+(.*?)$} $token => token attr set etype START if {$etok0 <= $etok1} { if {$stok0 <= $stok1} { # Bad XML set token "/$token" } set etype EMPTY } elseif {$stok0 <= $stok1} { set etype END } return [list $type $token $attr $etype] } xml method reset {} { set loc 0 } # ::XML::IsWellFormed # checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210) # # Returns "" if well-formed, error message otherwise # missing: # characters: doesn't check valid extended characters # attributes: doesn't check anything: quotes, equals, unique, etc. # text stuff: references, entities, parameters, etc. # doctype internal stuff # xml method _NextToken {{peek 0}} { set result [$self next $peek] if { $result eq "EOF" } { return [list $result] } return $result } xml method isWellFormed {} { set result [$self _isWellFormed] set loc 0 return $result } xml method _isWellFormed {} { array set emsg { XMLDECLFIRST "The XML declaration must come first" MULTIDOCTYPE "Only one DOCTYPE is allowed" INVALID "Invalid document structure" MISMATCH "Ending tag '$val' doesn't match starting tag" BADELEMENT "Bad element name '$val'" EOD "Only processing instructions allowed at end of document" BADNAME "Bad name '$val'" BADPI "No processing instruction starts with 'xml'" } # [1] document ::= prolog element Misc* # [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? # [27] Misc ::= Comment | PI | S # [28] doctypedecl ::= # [16] PI ::= # 1 xml, 2 pi, 4 doctype set seen 0 while {1} { foreach {type val attr etype} [$self _NextToken] break if {$type eq "PI"} { if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} { return [subst $emsg(BADNAME)] } # XMLDecl if {$val eq "xml"} { if {$seen != 0} { return $emsg(XMLDECLFIRST) } # TODO: check version number exist and only encoding and # standalone attributes are allowed # Mark as seen XMLDecl incr seen continue } if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)} # Mark as seen PI set seen [expr {$seen | 2}] continue } elseif {$type eq "XML" && $val eq "!DOCTYPE"} { # Doctype if {$seen & 4} { return $emsg(MULTIDOCTYPE) } set seen [expr {$seen | 4}] continue } break } # [39] element ::= EmptyElemTag | STag content ETag # [40] STag ::= < Name (S Attribute)* S? > # [42] ETag ::= # [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* # [44] EmptyElemTag ::= < Name (S Attribute)* S? /> # set stack {} set first 1 while {1} { # Skip first time in if {! $first} { foreach {type val attr etype} [$self _NextToken] break } else { if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) } set first 0 } if {$type eq "EOF"} break # TODO: check attributes: quotes, equals and unique if {$type eq "TXT"} continue if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} { return [subst $emsg(BADNAME)] } if {$type eq "PI"} { if {[string equal -nocase xml $val]} { return $emsg(BADPI) } continue } if {$etype eq "START"} { # Starting tag lappend stack $val } elseif {$etype eq "END"} { # if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] } set stack [lrange $stack 0 end-1] # Empty stack if {[llength $stack] == 0} break } elseif {$etype eq "EMPTY"} { # } } # End-of-Document can only contain processing instructions while {1} { foreach {type val attr etype} [$self _NextToken] break if {$type eq "EOF"} break if {$type eq "PI"} { if {[string equal -nocase xml $val]} { return $emsg(BADPI) } continue } return $emsg(EOD) } return "" } # Transform some characters that might be sent as SGML entities. # Initially &<>, but add some others (sent by TheTVDB.com?). proc {xml decodeCharEntities} {xmlText} { # Entity names are case-sensitive but HTML5 adds AMP, etc; # other syntaxes (eg &#xhhhh;) aren't. Here treat all entity # names as case-insensitive at the risk of transforming an # illegal &APOS;, eg. # XML "predefined entities" set mapping { & & < < > > ' "'" " "\"" } # This parser # could be used to import the HTML5 entity list from the JSON file at # instead of the above. But # this is XML without any additional DTD. # add any numeric character entity in the input to the mapping foreach {e _e xnum num} \ [regexp -all -inline \ {&((#[xX][0-9a-fA-F]+)|(#[0-9]+)|[[:alnum:]]+);} \ $xmlText] { # if the hex/dec code is used ... set code -1 if { $xnum ne "" } { scan $xnum "#%*c%x" code } elseif { $num ne "" } { scan $num "#%u" code } # ... and not already mapped ... if { $code >= 0 && [lsearch -nocase $mapping $e] < 0 } { # ... add the entity and its translation lappend mapping $e [format "%c" $code] } } # substitute & last in case of eg "xxx&quot;yyy" return [string map -nocase \ [lrange $mapping 0 1] \ [string map -nocase [lrange $mapping 2 end] $xmlText]] } # test parsing XML; default from tvdb.class with added char entity goodness proc {xml test} \ {{testXml { 261451 en Teenage Mutant Ninja Turtles (2012) graphical/261451-g2.jpg Four ninja turtles, mutated by a mysterious alien substance, must rise up out of the sewers & defend their city against evil forces from both the past and present. 2012-09-29 Nickelodeon 261451 }}} { set testXml [xml init $testXml] for {set type ""} {$type ne "EOF"} {} { foreach {type val attr etype} [$testXml _NextToken] break puts "looking at: $type '$val' '$attr' '$etype'" } }