1#!MUNGERPATH
2
3; Copyright (c) 2004, 2009 James Bailie <jimmy@mammothcheese.ca>.
4; All rights reserved.
5;
6; Redistribution and use in source form, with or without
7; modification, are permitted provided that the following conditions are met:
8;
9;     * Redistributions of source code must retain the above copyright
10; notice, this list of conditions and the following disclaimer.
11;     * The name of James Bailie may not be used to endorse or promote
12; products derived from this software without specific prior written permission.
13;
14; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS"
15; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
18; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24; POSSIBILITY OF SUCH DAMAGE.
25
26; This script is a minimal, non-validating parser for standalone XML 1.0
27; documents.  The program only knows about ISO-8859-1, but it will attempt
28; to process a document encoded using UTF-8.  This will only succeed if the
29; characters used in tag and attribute names are limited to those of
30; ISO-8859-1, which is a subset of UTF-8.  The character data in the
31; document is not subject to this limitation.
32
33; It cannot cope with documents which are not well-formed, nor can it
34; convert entity references beyond those defined by the XML 1.0 standard as
35; predefined (&quot; &amp; &gt; &lt; &apos;).  It will convert both decimal
36; and hexadecimal character references, however.  The parser only
37; recognizes element tags and CDATA tags.  It ignores processing
38; instructions.
39
40; XML structure and content is converted into an alist, and then munger code
41; to create and bind this alist to a symbol is printed to stdout.  One
42; optional argument may be passed to the script to specify the symbol to be
43; bound in the output expression (-s [symbol]).  If the option is not present,
44; then the alist will be bound to the symbol 'document.
45
46; The example script xmlquery.munger provides a query interface to alists
47; produced by this program.  Examples of usage are in the comments at the
48; beginning of xmlquery.munger.
49
50; Makes lisp errors fatal to the interpreter.
51
52(fatal)
53
54; Maximum size of the chunks of text we will process.
55
56(setq maxlen 512)
57
58; Lexical analysis.
59
60(let ((token "")
61      (type 'empty)
62      (term "")
63      (chars ())
64      (len 0)
65      (term_rx (regcomp (char 13))))
66
67   (defun parse (line)
68
69      (if (eq 0 line)
70         (when token
71            (make_item type token))
72
73         (setq chars (explode (substitute term_rx "" line 0)))
74
75         (while chars
76            (cond ((eq type 'empty)
77                   (setq token (car chars))
78                   (setq type (if (eq (car chars) "<") 'markup 'cdata)))
79
80                  ((eq type 'markup)
81
82                   (if (and (> (setq len (length token)) 8)
83                            (eq "<![CDATA[" (substring token 0 9)))
84
85                       (cond ((and (setq term (eq "]]" (substring token (- len 2) 0)))
86                                   (eq (car chars) ">"))
87                              (make_item type (concat token (car chars)))
88                              (setq token "")
89                              (setq type 'empty))
90
91                             ; Split-up long explicit cdata sections.
92
93                             ((> len maxlen)
94                              (if (eq (car chars) "]")
95                                 (setq token (concat token (car chars)))
96
97                                 (make_item type (concat token (car chars) "]]>"))
98                                 (setq token "<![CDATA[")))
99
100                             (1 (setq token (concat token (car chars)))))
101
102                       (if (not (eq (car chars) ">"))
103                           (setq token (concat token (car chars)))
104
105                           (make_item type (concat token (car chars)))
106                           (setq type 'empty)
107                           (setq token ""))))
108
109                  ((eq type 'cdata)
110                   (if (eq (car chars) "<")
111                     (progn
112                        (when token
113                           (make_item type token))
114                        (set 'token (car chars))
115                        (set 'type 'markup))
116
117                     ; Split-up long implicit cdata sections.
118
119                     (if (<= (length token) maxlen)
120                        (setq token (concat token (car chars)))
121
122                        (make_item type (concat token (car chars)))
123                        (setq token "")))))
124
125            (setq chars (cdr chars))))))
126
127; Functions to convert character and entity references.
128
129(let ((nums ())
130      (subst "")
131      (converted (table))
132      (rx "")
133      (m ())
134      (entities_rx (regcomp "&(lt|gt|apos|quot|amp|(x)?([0-9A-Fa-f]{2}));")))
135
136   (defun do_conversions (line)
137      (if (not (setq m (matches entities_rx line)))
138         line
139
140         (setq subst
141            (cond ((eq (car (cdr m)) "lt") "<")
142                  ((eq (car (cdr m)) "gt") ">")
143                  ((eq (car (cdr m)) "apos") "'")
144                  ((eq (car (cdr m)) "quot") "\"")
145                  ((eq (car (cdr m)) "amp") "&")
146
147                  ((eq (car (cddr m)) "x")
148                   (char (hex2dec (car (cdddr m)))))
149
150                  (1 (char (digitize (car (cdddr m)))))))
151
152         ; Convert one match at a time to avoid performing multiple levels
153         ; of conversions on portions of the line we have not seen yet, and
154         ; on portions of lines we have already scanned once.
155
156         (setq nums (match entities_rx line))
157
158         (setq rx (lookup converted (car m)))
159         (when (not rx)
160            (setq rx (hash converted (car m) (regcomp (car m)))))
161
162         (concat (substitute rx subst (substring line 0 (cadr nums)))
163                  (if (eq (length line) (cadr nums))
164                     ""
165                     (do_conversions (substring line (cadr nums) 0)))))))
166
167; Regular expressions to escape quotes and backslashes so that the final alist
168; created by this script will be readable by the munger reader.
169
170(let ((quote_rx (regcomp "\""))
171      (backslash_rx (regcomp "\\\\")))
172
173   (defun convert_refs (line convert)
174
175      (when convert
176         (setq line (do_conversions line)))
177
178      ; Items to escape may be found literally in the string or they may be
179      ; created by entity replacement, so we need to escape them after entity
180      ; conversion.  The double escaping in the replacement strings below is
181      ; necessary.  The lisp reader interprets the first level, while the
182      ; substitute command interprets the second level.  Quotes must be escaped
183      ; after backslashes since quotes are escaped with backslashes.
184
185      (substitute quote_rx "\\\\\"" (substitute backslash_rx "\\\\\\\\" line 0) 0)))
186
187; Regular expression used to remove delimiting quotes from attribute values.
188
189(setq quotes_rx (regcomp "^[\"'](.*)['\"]$"))
190
191; String containing whitespace characters recognized by XML.
192
193(setq whitespace (concat (char 32) (char 9) (char 10) (char 13)))
194
195; Helper function.  Returns a list of one-element strings, consisting of
196; characters having the codes in the specified range, inclusive.  Skips
197; DEL (127) for that position is unused in iso8859-1, and skips the
198; division and multiplication signs.
199
200(defun make_list_chars (start end)
201
202   (let ((n start)
203         (m ()))
204
205      (if (<= n end)
206         (tailcall 0
207                   (+ n 1)
208                   (if (or (eq n 247) (eq n 215) (eq n 127))
209                       m
210                       (cons (char n) m)))
211         (reverse m))))
212
213; String containing the letter glyphs of iso8859-1 only.
214
215(setq letter
216   (concat (make_list_chars 65 90)
217           (make_list_chars 97 122)
218           (make_list_chars 192 255)))
219
220; String representing regular expression to recognize a name token.  Note:
221; "-" must appear first in the second character class to avoid having it
222; inadvertently specify a range.
223
224(setq name (concat "[" letter "_:][-0-9._:" letter  "]*"))
225
226; Strings representing regular expressions to match character references and
227; entity references.
228
229(setq char_ref "&#(x)?([0-9A-Fa-f]+);(.*)")
230(setq ent_ref "&[a-z]+;")
231
232; String representing regular expression matching valid attribute value
233; characters.
234
235(setq attvalue (concat "(\"([^\"<&]|" char_ref "|" ent_ref ")*\"|"
236                       "'([^'<&]|" char_ref "|" ent_ref ")*')"))
237
238; String representing regular expression matching an attribute/value pair.
239
240(setq attribute (concat name "[" whitespace "]*=[" whitespace "]*" attvalue))
241
242; Regular expression to remove whitespace surrouding "=" in attribute pairs.
243; This is so (break_up_attrs) will work correctly.
244
245(let ((attr_space_rx (regcomp (concat "(" name ")[" whitespace "]+=[" whitespace "]+(" attvalue ")")))
246      (attr_rx (regcomp attribute))
247      (m ())
248      (len 0)
249      (broken ()))
250
251; Function to breakup at attribute list into an alist.
252
253   (defun break_up_attrs (attrs)
254
255      (setq broken ())
256      (setq len (length attrs))
257
258      (while (setq m (match attr_rx attrs))
259         (setq broken
260            (cons
261               (substitute attr_space_rx "\1=\2" (substring attrs (car m) (- (cadr m) (car m))))
262               broken))
263
264         (if (eq (cadr m) len)
265            (setq attrs "")
266
267            (setq attrs (substring attrs (cadr m) 0))
268            (setq len (length attrs))))
269
270      (mapcar
271         (lambda (x)
272            (list (convert_refs (car x) 1)
273                  (convert_refs (substitute quotes_rx "\1" (cadr x)) 1)))
274
275         ; If we do not specifically limit the split operation it will also
276         ; split the value if it contains one or more equal signs.
277
278         (mapcar (lambda (x) (split "=" x 2)) (reverse broken)))))
279
280; Function to check the XML version and document encoding.
281
282(let ((xml_version_rx (regcomp (concat "[" whitespace "]+version[" whitespace "]*=[" whitespace "]*" attvalue)))
283      (xml_encoding_rx (regcomp (concat "[" whitespace "]+encoding[" whitespace "]*=[" whitespace "]*" attvalue)))
284      (encoding_rx (regcomp "[Ii][Ss][Oo](-8859-1|-646-[Uu][Ss])|[Uu][Tt][Ff]-8|[Uu][Ss]-[Aa][Ss][Cc][Ii][Ii]")))
285
286   (defun check_version (items)
287
288      (let ((m1 (matches xml_version_rx (car (cddr items))))
289            (m2 (matches xml_encoding_rx (car (cddr items)))))
290
291         (when m1
292            (unless (eq (substitute quotes_rx "\1" (cadr m1)) "1.0")
293               (warn "This processor does not understand XML " m1)
294               (exit 1)))
295
296         (when m2
297            (setq m2 (substitute quotes_rx "\1" (cadr m2)))
298            (unless (match encoding_rx m2)
299               (warn "This processor does not understand encoding " m2)
300               (exit 1))))))
301
302; Function to add a parsed item to the "document" alist.
303
304(let ((cdata_rx (regcomp "^<!\[CDATA\[(.*)\]\]>$"))
305      (empty_rx (regcomp (concat "^<(" name ")(([" whitespace "]+" attribute ")*)" "[" whitespace "]*/>$")))
306      (start_rx (regcomp (concat "^<(" name ")(([" whitespace "]+" attribute ")*)" "[" whitespace "]*>$")))
307      (end_rx (regcomp (concat "^</(" name ")[" whitespace "]*>$")))
308      (xml_rx (regcomp (concat "^<\?([xX][Mm][Ll])(([" whitespace "]+" attribute ")*)" "[" whitespace "]*\?>$")))
309      (proc_rx (regcomp "^<\\?.*\\?>$"))
310      (comment_rx (regcomp "^<!--.*-->$"))
311      (m ()))
312
313   (defun make_item (type item)
314
315      (if (eq type 'cdata)
316         (print "(cdata \"" (convert_refs item 1) "\")")
317
318         ; cdata_rx clause must come first.
319
320         (cond ((match cdata_rx item)
321                (print "(cdata \"" (convert_refs (substitute cdata_rx "\1" item) 0) "\")"))
322
323               ((setq m (matches start_rx item))
324                (print "(\"" (cadr m) "\" " (break_up_attrs (car (cddr m))) " ("))
325
326               ((match end_rx item)
327                (print "))"))
328
329               ((setq m (matches empty_rx item))
330                (print "(\"" (cadr m) "\" " (break_up_attrs (car (cddr m))) ")"))
331
332               ((match xml_rx item)
333                (check_version (matches xml_rx item)))
334
335               ((match proc_rx item) 1)
336               ((match comment_rx item) 1)
337
338               (1 (die "unrecognized tag: " item))))))
339
340; Replacement for "getline" which does its own buffering.  We use a 4k
341; buffer.  The "getline" intrinsic uses a 100k buffer, but will accumulate
342; text beyond that amount until it finds a newline.  If the XML document we
343; are processing is very large and all on one physical line of text, we can
344; commit a lot of memory both here, and when "parse" calls "explode" to
345; create individual strings for every character in the line.  Our
346; replacement returns maxlen characters (or less) at a time to the caller,
347; ignoring line boundaries.
348
349(let ((buffer "")
350      (len 0)
351      (line ""))
352
353   (defun get_line ()
354      (catch
355         (when (not buffer)
356            (if (setq buffer (getchars 4096))
357               (setq len (length buffer))
358               (throw 0)))
359
360         (setq line (substring buffer 0 maxlen))
361
362         (if (> len maxlen)
363            (progn
364               (setq buffer (substring buffer maxlen 0))
365               (dec len maxlen))
366
367            (setq buffer "")
368            (setq len 0))
369
370         line)))
371
372; See if the user has provided us with a different symbol from the default
373; 'document to bind to the final alist.
374
375(load (join "/" (libdir) "options.munger"))
376(getopt)
377
378(if (setq symbol (lookup options "s"))
379   (setq symbol (intern symbol))
380   (setq symbol 'document))
381
382; We read from the first filename specified on the command-line, or from
383; stdin.  The call to (getopt) above has left the argument pointer pointing to
384; the last option, if any, or the script name, so that we need only call
385; (next) once to get the name of the first command-line argument.
386
387(when (next)
388   (redirect 0 (current)))
389
390; Toplevel function.
391
392(let ((line ""))
393   (print "(setq " symbol " '((\"document\" () (")
394
395   (while (setq line (get_line))
396      (parse line))
397   (parse line)
398
399   (print "))))")
400   (quit))
401