webif/webif/lib/xml.class

270 lines
7.9 KiB
Tcl

# Simple XML parser
# From https://wiki.tcl-lang.org/page/Parsing+XML
# Keith Vetter 2004-03-01
if {![exists -command class]} { package require oo }
class xml {
xml ""
loc 0
}
proc {xml init} {xml} {
# Remove all XML comments
regsub -all {<!--.*?-->} $xml {} xml
return [xml new [list xml [string trim $xml] loc 0]]
}
xml method next {{peek 0}} {
set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \
$xml all txt stok tok etok]
if {!$n} {
return "EOF"
}
lassign $all all0 all1
lassign $txt txt0 txt1
lassign $stok stok0 stok1
lassign $tok tok0 tok1
lassign $etok etok0 etok1
if {$txt1 >= $txt0} {
set txt [string range $xml $txt0 $txt1]
if {!$peek} {
set loc [expr {$txt1 + 1}]
}
if {$txt ne ""} {
set txt [$self decodeCharEntities $txt]
}
return [list TXT $txt]
}
set token [string range $xml $tok0 $tok1]
if {!$peek} {
set loc [expr {$all1 + 1}]
}
if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} {
return [list TXT $txt]
}
# Check for Processing Instruction <?...?>
set type XML
if {[regexp {^\?(.*)\?$} $token => token]} {
set type PI
}
set attr ""
regexp {^(.*?)\s+(.*?)$} $token => token attr
set etype START
if {$etok0 <= $etok1} {
if {$stok0 <= $stok1} {
# Bad XML
set token "/$token"
}
set etype EMPTY
} elseif {$stok0 <= $stok1} {
set etype END
}
return [list $type $token $attr $etype]
}
xml method reset {} {
set loc 0
}
# ::XML::IsWellFormed
# checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210)
#
# Returns "" if well-formed, error message otherwise
# missing:
# characters: doesn't check valid extended characters
# attributes: doesn't check anything: quotes, equals, unique, etc.
# text stuff: references, entities, parameters, etc.
# doctype internal stuff
#
xml method _NextToken {{peek 0}} {
set result [$self next $peek]
if { $result eq "EOF" } {
return [list $result]
}
return $result
}
xml method isWellFormed {} {
set result [$self _isWellFormed]
$self reset
return $result
}
xml method _isWellFormed {} {
array set emsg {
XMLDECLFIRST "The XML declaration must come first"
MULTIDOCTYPE "Only one DOCTYPE is allowed"
INVALID "Invalid document structure"
MISMATCH "Ending tag '$val' doesn't match starting tag"
BADELEMENT "Bad element name '$val'"
EOD "Only processing instructions allowed at end of document"
BADNAME "Bad name '$val'"
BADPI "No processing instruction starts with 'xml'"
}
# [1] document ::= prolog element Misc*
# [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
# [27] Misc ::= Comment | PI | S
# [28] doctypedecl ::= <!DOCTYPE...>
# [16] PI ::= <? Name ...?>
# 1 xml, 2 pi, 4 doctype
set seen 0
while {1} {
foreach {type val attr etype} [$self _NextToken] break
if {$type eq "PI"} {
if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
return [subst $emsg(BADNAME)]
}
# XMLDecl
if {$val eq "xml"} {
if {$seen != 0} { return $emsg(XMLDECLFIRST) }
# TODO: check version number exists and only encoding and
# standalone attributes are allowed
# Mark as seen XMLDecl
incr seen
continue
}
if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)}
# Mark as seen PI
set seen [expr {$seen | 2}]
continue
} elseif {$type eq "XML" && $val eq "!DOCTYPE"} {
# Doctype
if {$seen & 4} { return $emsg(MULTIDOCTYPE) }
set seen [expr {$seen | 4}]
continue
}
break
}
# [39] element ::= EmptyElemTag | STag content ETag
# [40] STag ::= < Name (S Attribute)* S? >
# [42] ETag ::= </ Name S? >
# [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
# [44] EmptyElemTag ::= < Name (S Attribute)* S? />
#
set stack {}
set first 1
while {1} {
# Skip first time in
if {! $first} {
foreach {type val attr etype} [$self _NextToken] break
} else {
if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) }
set first 0
}
if {$type eq "EOF"} break
# TODO: check attributes: quotes, equals and unique
if {$type eq "TXT"} continue
if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
return [subst $emsg(BADNAME)]
}
if {$type eq "PI"} {
if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
continue
}
if {$etype eq "START"} {
# Starting tag
lappend stack $val
} elseif {$etype eq "END"} {
# </tag>
if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] }
set stack [lrange $stack 0 end-1]
# Empty stack
if {[llength $stack] == 0} break
} elseif {$etype eq "EMPTY"} {
# <tag/>
}
}
# End-of-Document can only contain processing instructions
while {1} {
foreach {type val attr etype} [$self _NextToken] break
if {$type eq "EOF"} break
if {$type eq "PI"} {
if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
continue
}
return $emsg(EOD)
}
return ""
}
# Transform some characters that might be sent as SGML entities.
# Initially &<>, but add some others (sent by TheTVDB.com?).
proc {xml decodeCharEntities} {xmlText} {
# Entity names are case-sensitive but HTML5 adds AMP, etc;
# other syntaxes (eg &#xhhhh;) aren't. Here treat all entity
# names as case-insensitive at the risk of transforming an
# illegal &APOS;, eg.
# XML "predefined entities"
set mapping {
&amp; &
&lt; <
&gt; >
&apos; "'"
&quot; "\""
}
# This parser <https://github.com/dbohdan/jimhttp/blob/master/json.tcl>
# could be used to import the HTML5 entity list from the JSON file at
# <https://html.spec.whatwg.org/entities.json> instead of the above.
# But this is XML without any additional DTD.
# add any numeric character entity in the input to the mapping
foreach {e _e xnum num} \
[regexp -all -inline \
{&((#[xX][0-9a-fA-F]+)|(#[0-9]+)|[[:alnum:]]+);} \
$xmlText] {
# if the hex/dec code is used ...
set code -1
if { $xnum ne "" } {
scan $xnum "#%*c%x" code
} elseif { $num ne "" } {
scan $num "#%u" code
}
# ... and not already mapped ...
if { $code >= 0 && [lsearch -nocase $mapping $e] < 0 } {
# ... add the entity and its translation
lappend mapping $e [format "%c" $code]
}
}
# substitute &amp; last in case of eg "xxx&amp;quot;yyy"
return [string map -nocase \
[lrange $mapping 0 1] \
[string map -nocase [lrange $mapping 2 end] $xmlText]]
}
# test parsing XML; default from tvdb.class with added char entity goodness
proc {xml test} \
{{testXml { <?xml version="1.0" encoding="ISO-8859-1"?>
<Series>
<seriesid>261451</seriesid>
<language>en</language>
<SeriesName>Teenage Mutant Ninja Turtles (2012)</SeriesName>
<banner>graphical/261451-g2.jpg</banner>
<Overview>Four ninja turtles, mutated by a mysterious alien substance, must rise up out of the sewers &amp; defend their city against evil forces from both the past and present.</Overview>
<FirstAired>2012-09-29</FirstAired>
<Network>Nickelodeon</Network>
<id>261451</id>
</Series>
}}} {
set testXml [xml init $testXml]
for {set type ""} {$type ne "EOF"} {} {
foreach {type val attr etype} [$testXml _NextToken] break
puts "looking at: $type '$val' '$attr' '$etype'"
}
}