1#!MUNGERPATH 2 3; Copyright (c) 2004, 2009 James Bailie <jimmy@mammothcheese.ca>. 4; All rights reserved. 5; 6; Redistribution and use in source form, with or without 7; modification, are permitted provided that the following conditions are met: 8; 9; * Redistributions of source code must retain the above copyright 10; notice, this list of conditions and the following disclaimer. 11; * The name of James Bailie may not be used to endorse or promote 12; products derived from this software without specific prior written permission. 13; 14; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS" 15; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 18; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24; POSSIBILITY OF SUCH DAMAGE. 25 26; This script is a minimal, non-validating parser for standalone XML 1.0 27; documents. The program only knows about ISO-8859-1, but it will attempt 28; to process a document encoded using UTF-8. This will only succeed if the 29; characters used in tag and attribute names are limited to those of 30; ISO-8859-1, which is a subset of UTF-8. The character data in the 31; document is not subject to this limitation. 32 33; It cannot cope with documents which are not well-formed, nor can it 34; convert entity references beyond those defined by the XML 1.0 standard as 35; predefined (" & > < '). It will convert both decimal 36; and hexadecimal character references, however. The parser only 37; recognizes element tags and CDATA tags. It ignores processing 38; instructions. 39 40; XML structure and content is converted into an alist, and then munger code 41; to create and bind this alist to a symbol is printed to stdout. One 42; optional argument may be passed to the script to specify the symbol to be 43; bound in the output expression (-s [symbol]). If the option is not present, 44; then the alist will be bound to the symbol 'document. 45 46; The example script xmlquery.munger provides a query interface to alists 47; produced by this program. Examples of usage are in the comments at the 48; beginning of xmlquery.munger. 49 50; Makes lisp errors fatal to the interpreter. 51 52(fatal) 53 54; Maximum size of the chunks of text we will process. 55 56(setq maxlen 512) 57 58; Lexical analysis. 59 60(let ((token "") 61 (type 'empty) 62 (term "") 63 (chars ()) 64 (len 0) 65 (term_rx (regcomp (char 13)))) 66 67 (defun parse (line) 68 69 (if (eq 0 line) 70 (when token 71 (make_item type token)) 72 73 (setq chars (explode (substitute term_rx "" line 0))) 74 75 (while chars 76 (cond ((eq type 'empty) 77 (setq token (car chars)) 78 (setq type (if (eq (car chars) "<") 'markup 'cdata))) 79 80 ((eq type 'markup) 81 82 (if (and (> (setq len (length token)) 8) 83 (eq "<![CDATA[" (substring token 0 9))) 84 85 (cond ((and (setq term (eq "]]" (substring token (- len 2) 0))) 86 (eq (car chars) ">")) 87 (make_item type (concat token (car chars))) 88 (setq token "") 89 (setq type 'empty)) 90 91 ; Split-up long explicit cdata sections. 92 93 ((> len maxlen) 94 (if (eq (car chars) "]") 95 (setq token (concat token (car chars))) 96 97 (make_item type (concat token (car chars) "]]>")) 98 (setq token "<![CDATA["))) 99 100 (1 (setq token (concat token (car chars))))) 101 102 (if (not (eq (car chars) ">")) 103 (setq token (concat token (car chars))) 104 105 (make_item type (concat token (car chars))) 106 (setq type 'empty) 107 (setq token "")))) 108 109 ((eq type 'cdata) 110 (if (eq (car chars) "<") 111 (progn 112 (when token 113 (make_item type token)) 114 (set 'token (car chars)) 115 (set 'type 'markup)) 116 117 ; Split-up long implicit cdata sections. 118 119 (if (<= (length token) maxlen) 120 (setq token (concat token (car chars))) 121 122 (make_item type (concat token (car chars))) 123 (setq token ""))))) 124 125 (setq chars (cdr chars)))))) 126 127; Functions to convert character and entity references. 128 129(let ((nums ()) 130 (subst "") 131 (converted (table)) 132 (rx "") 133 (m ()) 134 (entities_rx (regcomp "&(lt|gt|apos|quot|amp|(x)?([0-9A-Fa-f]{2}));"))) 135 136 (defun do_conversions (line) 137 (if (not (setq m (matches entities_rx line))) 138 line 139 140 (setq subst 141 (cond ((eq (car (cdr m)) "lt") "<") 142 ((eq (car (cdr m)) "gt") ">") 143 ((eq (car (cdr m)) "apos") "'") 144 ((eq (car (cdr m)) "quot") "\"") 145 ((eq (car (cdr m)) "amp") "&") 146 147 ((eq (car (cddr m)) "x") 148 (char (hex2dec (car (cdddr m))))) 149 150 (1 (char (digitize (car (cdddr m))))))) 151 152 ; Convert one match at a time to avoid performing multiple levels 153 ; of conversions on portions of the line we have not seen yet, and 154 ; on portions of lines we have already scanned once. 155 156 (setq nums (match entities_rx line)) 157 158 (setq rx (lookup converted (car m))) 159 (when (not rx) 160 (setq rx (hash converted (car m) (regcomp (car m))))) 161 162 (concat (substitute rx subst (substring line 0 (cadr nums))) 163 (if (eq (length line) (cadr nums)) 164 "" 165 (do_conversions (substring line (cadr nums) 0))))))) 166 167; Regular expressions to escape quotes and backslashes so that the final alist 168; created by this script will be readable by the munger reader. 169 170(let ((quote_rx (regcomp "\"")) 171 (backslash_rx (regcomp "\\\\"))) 172 173 (defun convert_refs (line convert) 174 175 (when convert 176 (setq line (do_conversions line))) 177 178 ; Items to escape may be found literally in the string or they may be 179 ; created by entity replacement, so we need to escape them after entity 180 ; conversion. The double escaping in the replacement strings below is 181 ; necessary. The lisp reader interprets the first level, while the 182 ; substitute command interprets the second level. Quotes must be escaped 183 ; after backslashes since quotes are escaped with backslashes. 184 185 (substitute quote_rx "\\\\\"" (substitute backslash_rx "\\\\\\\\" line 0) 0))) 186 187; Regular expression used to remove delimiting quotes from attribute values. 188 189(setq quotes_rx (regcomp "^[\"'](.*)['\"]$")) 190 191; String containing whitespace characters recognized by XML. 192 193(setq whitespace (concat (char 32) (char 9) (char 10) (char 13))) 194 195; Helper function. Returns a list of one-element strings, consisting of 196; characters having the codes in the specified range, inclusive. Skips 197; DEL (127) for that position is unused in iso8859-1, and skips the 198; division and multiplication signs. 199 200(defun make_list_chars (start end) 201 202 (let ((n start) 203 (m ())) 204 205 (if (<= n end) 206 (tailcall 0 207 (+ n 1) 208 (if (or (eq n 247) (eq n 215) (eq n 127)) 209 m 210 (cons (char n) m))) 211 (reverse m)))) 212 213; String containing the letter glyphs of iso8859-1 only. 214 215(setq letter 216 (concat (make_list_chars 65 90) 217 (make_list_chars 97 122) 218 (make_list_chars 192 255))) 219 220; String representing regular expression to recognize a name token. Note: 221; "-" must appear first in the second character class to avoid having it 222; inadvertently specify a range. 223 224(setq name (concat "[" letter "_:][-0-9._:" letter "]*")) 225 226; Strings representing regular expressions to match character references and 227; entity references. 228 229(setq char_ref "&#(x)?([0-9A-Fa-f]+);(.*)") 230(setq ent_ref "&[a-z]+;") 231 232; String representing regular expression matching valid attribute value 233; characters. 234 235(setq attvalue (concat "(\"([^\"<&]|" char_ref "|" ent_ref ")*\"|" 236 "'([^'<&]|" char_ref "|" ent_ref ")*')")) 237 238; String representing regular expression matching an attribute/value pair. 239 240(setq attribute (concat name "[" whitespace "]*=[" whitespace "]*" attvalue)) 241 242; Regular expression to remove whitespace surrouding "=" in attribute pairs. 243; This is so (break_up_attrs) will work correctly. 244 245(let ((attr_space_rx (regcomp (concat "(" name ")[" whitespace "]+=[" whitespace "]+(" attvalue ")"))) 246 (attr_rx (regcomp attribute)) 247 (m ()) 248 (len 0) 249 (broken ())) 250 251; Function to breakup at attribute list into an alist. 252 253 (defun break_up_attrs (attrs) 254 255 (setq broken ()) 256 (setq len (length attrs)) 257 258 (while (setq m (match attr_rx attrs)) 259 (setq broken 260 (cons 261 (substitute attr_space_rx "\1=\2" (substring attrs (car m) (- (cadr m) (car m)))) 262 broken)) 263 264 (if (eq (cadr m) len) 265 (setq attrs "") 266 267 (setq attrs (substring attrs (cadr m) 0)) 268 (setq len (length attrs)))) 269 270 (mapcar 271 (lambda (x) 272 (list (convert_refs (car x) 1) 273 (convert_refs (substitute quotes_rx "\1" (cadr x)) 1))) 274 275 ; If we do not specifically limit the split operation it will also 276 ; split the value if it contains one or more equal signs. 277 278 (mapcar (lambda (x) (split "=" x 2)) (reverse broken))))) 279 280; Function to check the XML version and document encoding. 281 282(let ((xml_version_rx (regcomp (concat "[" whitespace "]+version[" whitespace "]*=[" whitespace "]*" attvalue))) 283 (xml_encoding_rx (regcomp (concat "[" whitespace "]+encoding[" whitespace "]*=[" whitespace "]*" attvalue))) 284 (encoding_rx (regcomp "[Ii][Ss][Oo](-8859-1|-646-[Uu][Ss])|[Uu][Tt][Ff]-8|[Uu][Ss]-[Aa][Ss][Cc][Ii][Ii]"))) 285 286 (defun check_version (items) 287 288 (let ((m1 (matches xml_version_rx (car (cddr items)))) 289 (m2 (matches xml_encoding_rx (car (cddr items))))) 290 291 (when m1 292 (unless (eq (substitute quotes_rx "\1" (cadr m1)) "1.0") 293 (warn "This processor does not understand XML " m1) 294 (exit 1))) 295 296 (when m2 297 (setq m2 (substitute quotes_rx "\1" (cadr m2))) 298 (unless (match encoding_rx m2) 299 (warn "This processor does not understand encoding " m2) 300 (exit 1)))))) 301 302; Function to add a parsed item to the "document" alist. 303 304(let ((cdata_rx (regcomp "^<!\[CDATA\[(.*)\]\]>$")) 305 (empty_rx (regcomp (concat "^<(" name ")(([" whitespace "]+" attribute ")*)" "[" whitespace "]*/>$"))) 306 (start_rx (regcomp (concat "^<(" name ")(([" whitespace "]+" attribute ")*)" "[" whitespace "]*>$"))) 307 (end_rx (regcomp (concat "^</(" name ")[" whitespace "]*>$"))) 308 (xml_rx (regcomp (concat "^<\?([xX][Mm][Ll])(([" whitespace "]+" attribute ")*)" "[" whitespace "]*\?>$"))) 309 (proc_rx (regcomp "^<\\?.*\\?>$")) 310 (comment_rx (regcomp "^<!--.*-->$")) 311 (m ())) 312 313 (defun make_item (type item) 314 315 (if (eq type 'cdata) 316 (print "(cdata \"" (convert_refs item 1) "\")") 317 318 ; cdata_rx clause must come first. 319 320 (cond ((match cdata_rx item) 321 (print "(cdata \"" (convert_refs (substitute cdata_rx "\1" item) 0) "\")")) 322 323 ((setq m (matches start_rx item)) 324 (print "(\"" (cadr m) "\" " (break_up_attrs (car (cddr m))) " (")) 325 326 ((match end_rx item) 327 (print "))")) 328 329 ((setq m (matches empty_rx item)) 330 (print "(\"" (cadr m) "\" " (break_up_attrs (car (cddr m))) ")")) 331 332 ((match xml_rx item) 333 (check_version (matches xml_rx item))) 334 335 ((match proc_rx item) 1) 336 ((match comment_rx item) 1) 337 338 (1 (die "unrecognized tag: " item)))))) 339 340; Replacement for "getline" which does its own buffering. We use a 4k 341; buffer. The "getline" intrinsic uses a 100k buffer, but will accumulate 342; text beyond that amount until it finds a newline. If the XML document we 343; are processing is very large and all on one physical line of text, we can 344; commit a lot of memory both here, and when "parse" calls "explode" to 345; create individual strings for every character in the line. Our 346; replacement returns maxlen characters (or less) at a time to the caller, 347; ignoring line boundaries. 348 349(let ((buffer "") 350 (len 0) 351 (line "")) 352 353 (defun get_line () 354 (catch 355 (when (not buffer) 356 (if (setq buffer (getchars 4096)) 357 (setq len (length buffer)) 358 (throw 0))) 359 360 (setq line (substring buffer 0 maxlen)) 361 362 (if (> len maxlen) 363 (progn 364 (setq buffer (substring buffer maxlen 0)) 365 (dec len maxlen)) 366 367 (setq buffer "") 368 (setq len 0)) 369 370 line))) 371 372; See if the user has provided us with a different symbol from the default 373; 'document to bind to the final alist. 374 375(load (join "/" (libdir) "options.munger")) 376(getopt) 377 378(if (setq symbol (lookup options "s")) 379 (setq symbol (intern symbol)) 380 (setq symbol 'document)) 381 382; We read from the first filename specified on the command-line, or from 383; stdin. The call to (getopt) above has left the argument pointer pointing to 384; the last option, if any, or the script name, so that we need only call 385; (next) once to get the name of the first command-line argument. 386 387(when (next) 388 (redirect 0 (current))) 389 390; Toplevel function. 391 392(let ((line "")) 393 (print "(setq " symbol " '((\"document\" () (") 394 395 (while (setq line (get_line)) 396 (parse line)) 397 (parse line) 398 399 (print "))))") 400 (quit)) 401