#!MUNGERPATH

; Copyright (c) 2004, 2009 James Bailie <jimmy@mammothcheese.ca>.
; All rights reserved.
;
; Redistribution and use in source form, with or without
; modification, are permitted provided that the following conditions are met:
;
;     * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;     * The name of James Bailie may not be used to endorse or promote
; products derived from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
; POSSIBILITY OF SUCH DAMAGE.

; This script is a minimal, non-validating parser for standalone XML 1.0
; documents.  The program only knows about ISO-8859-1, but it will attempt
; to process a document encoded using UTF-8.  This will only succeed if the
; characters used in tag and attribute names are limited to those of
; US-ASCII, the range in which ISO-8859-1 and UTF-8 share the same encoding.
; The character data in the document is not subject to this limitation.

; It cannot cope with documents which are not well-formed, nor can it
; convert entity references beyond those defined by the XML 1.0 standard as
; predefined (&quot; &amp; &gt; &lt; &apos;).  It will convert both decimal
; and hexadecimal character references, however.  The parser only
; recognizes element tags and CDATA tags.  It ignores processing
; instructions.

; XML structure and content is converted into an alist, and then munger code
; to create and bind this alist to a symbol is printed to stdout.  One
; optional argument may be passed to the script to specify the symbol to be
; bound in the output expression (-s [symbol]).  If the option is not present,
; then the alist will be bound to the symbol 'document.

; The example script xmlquery.munger provides a query interface to alists
; produced by this program.  Examples of usage are in the comments at the
; beginning of xmlquery.munger.

; Makes lisp errors fatal to the interpreter.

(fatal)

; Maximum size of the chunks of text we will process.  (get_line) hands the
; input to the parser in pieces of at most this length, and (parse) uses the
; same value as the threshold beyond which a long cdata token is split into
; several items.

(setq maxlen 512)

; Lexical analysis.

; State kept between successive calls to (parse):
;
;   token   - text of the item accumulated so far
;   type    - 'empty (no item in progress), 'markup or 'cdata
;   term    - result of the "]]" test made while inside a CDATA section
;   chars   - characters of the current chunk which remain to be examined
;   len     - length of token, computed while scanning markup
;   term_rx - matches the carriage return character (13)

(let ((token "")
      (type 'empty)
      (term "")
      (chars ())
      (len 0)
      (term_rx (regcomp (char 13))))

   ; Feeds one chunk of text to the lexer.  Every completed item is passed to
   ; (make_item) together with its type.  An item may span several chunks
   ; because the token and type persist between calls.  Passing 0 instead of
   ; a string flushes the token which is still pending.

   (defun parse (line)

      (if (eq 0 line)
         (when token
            (make_item type token))

         ; Remove every carriage return, then split the chunk into a list of
         ; one-character strings.

         (setq chars (explode (substitute term_rx "" line 0)))

         (while chars

            ; No item in progress: the current character starts a new one.
            ; "<" opens markup, anything else opens character data.

            (cond ((eq type 'empty)
                   (setq token (car chars))
                   (setq type (if (eq (car chars) "<") 'markup 'cdata)))

                  ((eq type 'markup)

                   ; "<![CDATA[" is nine characters long, so the token must
                   ; hold more than eight before it can be an explicit cdata
                   ; section.

                   (if (and (> (setq len (length token)) 8)
                            (eq "<![CDATA[" (substring token 0 9)))

                       ; Inside an explicit cdata section.  It ends only when
                       ; the token ends with "]]" and the current character is
                       ; ">".

                       (cond ((and (setq term (eq "]]" (substring token (- len 2) 0)))
                                   (eq (car chars) ">"))
                              (make_item type (concat token (car chars)))
                              (setq token "")
                              (setq type 'empty))

                             ; Split-up long explicit cdata sections.  A "]"
                             ; is always appended rather than split after, so
                             ; that a terminating "]]>" is never cut in two.
                             ; Otherwise the piece is closed with "]]>" and a
                             ; fresh "<![CDATA[" is opened for the remainder.

                             ((> len maxlen)
                              (if (eq (car chars) "]")
                                 (setq token (concat token (car chars)))

                                 (make_item type (concat token (car chars) "]]>"))
                                 (setq token "<![CDATA[")))

                             (1 (setq token (concat token (car chars)))))

                       ; Any other markup ends at the first ">".

                       (if (not (eq (car chars) ">"))
                           (setq token (concat token (car chars)))

                           (make_item type (concat token (car chars)))
                           (setq type 'empty)
                           (setq token ""))))

                  ; Character data runs until the next "<", which both ends
                  ; the data and becomes the first character of a new markup
                  ; token.

                  ((eq type 'cdata)
                   (if (eq (car chars) "<")
                     (progn
                        (when token
                           (make_item type token))
                        (set 'token (car chars))
                        (set 'type 'markup))

                     ; Split-up long implicit cdata sections.  The type stays
                     ; 'cdata, so the following characters start a new piece.
                     ; NOTE(review): the split position ignores entity
                     ; boundaries, so a reference such as "&amp;" can be cut
                     ; in two and then reach (make_item) unconverted.

                     (if (<= (length token) maxlen)
                        (setq token (concat token (car chars)))

                        (make_item type (concat token (car chars)))
                        (setq token "")))))

            (setq chars (cdr chars))))))

; Functions to convert character and entity references.

; State shared by (do_conversions):
;
;   nums        - (start end) offsets of the current match, from (match)
;   subst       - replacement text for the current match
;   converted   - table caching one compiled regexp per reference text seen
;   rx          - compiled regexp matching the current reference literally
;   m           - whole match followed by the subexpression matches
;   code        - numeric value of the current character reference
;   entities_rx - matches one of the five predefined entity references, or a
;                 decimal (&#NNN;) or hexadecimal (&#xHH;) character
;                 reference.  Subexpression 2 is "x" for a hexadecimal
;                 reference and subexpression 3 holds the digits.

(let ((nums ())
      (subst "")
      (converted (table))
      (rx "")
      (m ())
      (code 0)
      (entities_rx (regcomp "&(lt|gt|apos|quot|amp|#(x)?([0-9A-Fa-f]+));")))

   ; Returns LINE with every predefined entity reference and every character
   ; reference replaced by the character it stands for.  A character
   ; reference whose code lies outside 1-255 is left in the text unchanged,
   ; because this program only knows about ISO-8859-1.

   (defun do_conversions (line)
      (if (not (setq m (matches entities_rx line)))
         line

         (setq subst
            (cond ((eq (car (cdr m)) "lt") "<")
                  ((eq (car (cdr m)) "gt") ">")
                  ((eq (car (cdr m)) "apos") "'")
                  ((eq (car (cdr m)) "quot") "\"")
                  ((eq (car (cdr m)) "amp") "&")

                  ; Character reference.  Out-of-range codes are replaced by
                  ; their own text, which leaves them as they were.

                  (1 (setq code (if (eq (car (cddr m)) "x")
                                    (hex2dec (car (cdddr m)))
                                    (digitize (car (cdddr m)))))
                     (if (and (> code 0) (<= code 255))
                        (char code)
                        (car m)))))

         ; Convert one match at a time to avoid performing multiple levels
         ; of conversions on portions of the line we have not seen yet, and
         ; on portions of lines we have already scanned once.

         (setq nums (match entities_rx line))

         ; Compile a regexp for the literal text of this reference only once.

         (setq rx (lookup converted (car m)))
         (when (not rx)
            (setq rx (hash converted (car m) (regcomp (car m)))))

         ; Substitute within the text up to the end of the match, then
         ; process the remainder of the line, if any, recursively.

         (concat (substitute rx subst (substring line 0 (cadr nums)))
                  (if (eq (length line) (cadr nums))
                     ""
                     (do_conversions (substring line (cadr nums) 0)))))))

; Regular expressions to escape quotes and backslashes so that the final alist
; created by this script will be readable by the munger reader.

; Compiled patterns private to (convert_refs): one matches a double quote,
; the other a single backslash.

(let ((dquote_rx (regcomp "\""))
      (bslash_rx (regcomp "\\\\")))

   ; Returns LINE made safe for printing between double quotes in the output
   ; expression.  When CONVERT is true, entity and character references are
   ; replaced first by (do_conversions).

   (defun convert_refs (line convert)

      (let ((text (if convert (do_conversions line) line)))

         ; Escaping happens after reference conversion because conversion can
         ; itself produce quotes and backslashes.  Each replacement string is
         ; escaped twice: once for the lisp reader and once for the
         ; substitute command.  Backslashes are handled first, since the
         ; quote escape introduces backslashes of its own.

         (setq text (substitute bslash_rx "\\\\\\\\" text 0))
         (substitute dquote_rx "\\\\\"" text 0))))

; Regular expression used to remove delimiting quotes from attribute values.

; Matches a whole string which begins and ends with a single or double
; quote; subexpression 1 is the text between the two delimiters.

(setq quotes_rx (regcomp "^[\"'](.*)['\"]$"))

; String containing whitespace characters recognized by XML: space (32),
; tab (9), line feed (10) and carriage return (13).  It is spliced between
; "[" and "]" wherever a regular expression below must match whitespace.

(setq whitespace (concat (char 32) (char 9) (char 10) (char 13)))

; Helper function.  Returns a list of one-element strings, consisting of
; characters having the codes in the specified range, inclusive.  Skips
; DEL (127) for that position is unused in iso8859-1, and skips the
; division and multiplication signs.

; START and END are character codes; every code from START to END inclusive
; is visited except 127, 215 and 247.

(defun make_list_chars (start end)

   ; n is the code being examined and m accumulates the characters in
   ; reverse order.

   (let ((n start)
         (m ()))

      ; NOTE(review): (tailcall 0 ...) presumably re-enters this let with the
      ; two values given as the new n and m, forming the loop -- confirm
      ; against the munger documentation for tailcall.

      (if (<= n end)
         (tailcall 0
                   (+ n 1)
                   (if (or (eq n 247) (eq n 215) (eq n 127))
                       m
                       (cons (char n) m)))
         (reverse m))))

; String containing the letter glyphs of iso8859-1 only.

; Built from the code ranges 65-90 (A-Z), 97-122 (a-z) and 192-255.
; NOTE(review): (concat) is handed the lists returned by (make_list_chars);
; presumably it joins their elements into a single string -- confirm.

(setq letter
   (concat (make_list_chars 65 90)
           (make_list_chars 97 122)
           (make_list_chars 192 255)))

; String representing regular expression to recognize a name token.  Note:
; "-" must appear first in the second character class to avoid having it
; inadvertently specify a range.  The first character of a name must be a
; letter, "_" or ":"; the following characters may also be digits, "." or
; "-".

(setq name (concat "[" letter "_:][-0-9._:" letter  "]*"))

; Strings representing regular expressions to match character references and
; entity references.

; char_ref matches exactly one decimal or hexadecimal character reference.
; It must not match anything after the closing ";".  It is embedded in the
; repeated group of the attribute value pattern, where a trailing wildcard
; would let a value run past its closing quote and swallow the attributes
; which follow it.

(setq char_ref "&#(x)?([0-9A-Fa-f]+);")
(setq ent_ref "&[a-z]+;")

; String representing regular expression matching valid attribute value
; characters.

; The value is delimited either by double quotes or by single quotes.
; Between the delimiters it may contain character references, entity
; references, and any character other than the delimiter itself, "<" and
; "&".  The outermost parentheses make the whole quoted value the first
; subexpression of this pattern.

(setq attvalue (concat "(\"([^\"<&]|" char_ref "|" ent_ref ")*\"|"
                       "'([^'<&]|" char_ref "|" ent_ref ")*')"))

; String representing regular expression matching an attribute/value pair.
; Optional whitespace is allowed on either side of the "=".

(setq attribute (concat name "[" whitespace "]*=[" whitespace "]*" attvalue))

; Regular expression to remove whitespace surrounding "=" in attribute pairs.
; This is so (break_up_attrs) will work correctly.

; State shared by (break_up_attrs):
;
;   attr_space_rx - matches one attribute pair with optional whitespace on
;                   either side of "=".  Subexpression 1 is the name and
;                   subexpression 2 the quoted value.  The whitespace must be
;                   optional on BOTH sides, exactly as in the "attribute"
;                   pattern, or a pair with whitespace on only one side of
;                   "=" would not be normalised.
;   attr_rx       - matches one attribute pair
;   m             - (start end) offsets of the current match
;   len           - length of the text still to be scanned
;   broken        - normalised "name=value" strings, most recent first

(let ((attr_space_rx (regcomp (concat "(" name ")[" whitespace "]*=[" whitespace "]*(" attvalue ")")))
      (attr_rx (regcomp attribute))
      (m ())
      (len 0)
      (broken ()))

; Function to break up an attribute list into an alist.  ATTRS is the text
; of the attributes of one tag.  The result is a list of (name value) pairs
; in document order.  The delimiting quotes are removed from each value, and
; references are converted and the text escaped by (convert_refs).

   (defun break_up_attrs (attrs)

      (setq broken ())
      (setq len (length attrs))

      ; Peel one attribute pair at a time off the front of the text.

      (while (setq m (match attr_rx attrs))
         (setq broken
            (cons
               (substitute attr_space_rx "\1=\2" (substring attrs (car m) (- (cadr m) (car m))))
               broken))

         ; Stop when the match reaches the end of the text, otherwise carry
         ; on with whatever follows the match.

         (if (eq (cadr m) len)
            (setq attrs "")

            (setq attrs (substring attrs (cadr m) 0))
            (setq len (length attrs))))

      (mapcar
         (lambda (x)
            (list (convert_refs (car x) 1)
                  (convert_refs (substitute quotes_rx "\1" (cadr x)) 1)))

         ; If we do not specifically limit the split operation it will also
         ; split the value if it contains one or more equal signs.

         (mapcar (lambda (x) (split "=" x 2)) (reverse broken)))))

; Function to check the XML version and document encoding.

; Patterns used by (check_version):
;
;   xml_version_rx  - a "version" pseudo-attribute; subexpression 1 is the
;                     quoted value
;   xml_encoding_rx - an "encoding" pseudo-attribute; subexpression 1 is the
;                     quoted value
;   encoding_rx     - the encoding names this program accepts
;
; NOTE(review): encoding_rx is not anchored, so a name which merely contains
; an accepted name (for example "ISO-8859-15") is accepted as well.

(let ((xml_version_rx (regcomp (concat "[" whitespace "]+version[" whitespace "]*=[" whitespace "]*" attvalue)))
      (xml_encoding_rx (regcomp (concat "[" whitespace "]+encoding[" whitespace "]*=[" whitespace "]*" attvalue)))
      (encoding_rx (regcomp "[Ii][Ss][Oo](-8859-1|-646-[Uu][Ss])|[Uu][Tt][Ff]-8|[Uu][Ss]-[Aa][Ss][Cc][Ii][Ii]")))

   ; ITEMS is the list returned by (matches) for the XML declaration; its
   ; third element is the text of the pseudo-attributes.  Exits with status 1
   ; after a warning if the declared version is not "1.0" or the declared
   ; encoding is not recognized.  A missing version or encoding is accepted.

   (defun check_version (items)

      (let ((m1 (matches xml_version_rx (car (cddr items))))
            (m2 (matches xml_encoding_rx (car (cddr items)))))

         ; Reduce each match to the unquoted value before testing it, so that
         ; the warning shows the value rather than the whole list of matches.

         (when m1
            (setq m1 (substitute quotes_rx "\1" (cadr m1)))
            (unless (eq m1 "1.0")
               (warn "This processor does not understand XML " m1)
               (exit 1)))

         (when m2
            (setq m2 (substitute quotes_rx "\1" (cadr m2)))
            (unless (match encoding_rx m2)
               (warn "This processor does not understand encoding " m2)
               (exit 1))))))

; Function to add a parsed item to the "document" alist.

; Patterns used by (make_item) to classify a markup token:
;
;   cdata_rx   - explicit CDATA section; subexpression 1 is its content
;   empty_rx   - empty-element tag; subexpression 1 is the name and
;                subexpression 2 the attribute text
;   start_rx   - start tag; same subexpressions as empty_rx
;   end_rx     - end tag
;   xml_rx     - XML declaration; subexpression 2 is the pseudo-attribute text
;   proc_rx    - any other processing instruction
;   comment_rx - comment
;   m          - result of the most recent (matches) call

(let ((cdata_rx (regcomp "^<!\[CDATA\[(.*)\]\]>$"))
      (empty_rx (regcomp (concat "^<(" name ")(([" whitespace "]+" attribute ")*)" "[" whitespace "]*/>$")))
      (start_rx (regcomp (concat "^<(" name ")(([" whitespace "]+" attribute ")*)" "[" whitespace "]*>$")))
      (end_rx (regcomp (concat "^</(" name ")[" whitespace "]*>$")))
      (xml_rx (regcomp (concat "^<\?([xX][Mm][Ll])(([" whitespace "]+" attribute ")*)" "[" whitespace "]*\?>$")))
      (proc_rx (regcomp "^<\\?.*\\?>$"))
      (comment_rx (regcomp "^<!--.*-->$"))
      (m ()))

   ; Prints the munger text for one lexical item.  TYPE is 'cdata for
   ; character data found between tags; any other type is treated as markup
   ; and ITEM is classified by the patterns above.  Processing instructions
   ; and comments produce no output.  Unrecognized markup is fatal.

   (defun make_item (type item)

      ; Implicit character data has its references converted.

      (if (eq type 'cdata)
         (print "(cdata \"" (convert_refs item 1) "\")")

         ; cdata_rx clause must come first.  The content of an explicit CDATA
         ; section is escaped but its references are not converted.

         (cond ((match cdata_rx item)
                (print "(cdata \"" (convert_refs (substitute cdata_rx "\1" item) 0) "\")"))

               ; A start tag opens an element and its list of children; the
               ; end tag below closes both.

               ((setq m (matches start_rx item))
                (print "(\"" (cadr m) "\" " (break_up_attrs (car (cddr m))) " ("))

               ((match end_rx item)
                (print "))"))

               ((setq m (matches empty_rx item))
                (print "(\"" (cadr m) "\" " (break_up_attrs (car (cddr m))) ")"))

               ; The XML declaration is matched once and the result handed to
               ; (check_version).

               ((setq m (matches xml_rx item))
                (check_version m))

               ((match proc_rx item) 1)
               ((match comment_rx item) 1)

               (1 (die "unrecognized tag: " item))))))

; Replacement for "getline" which does its own buffering.  We use a 4k
; buffer.  The "getline" intrinsic uses a 100k buffer, but will accumulate
; text beyond that amount until it finds a newline.  If the XML document we
; are processing is very large and all on one physical line of text, we can
; commit a lot of memory both here, and when "parse" calls "explode" to
; create individual strings for every character in the line.  Our
; replacement returns maxlen characters (or fewer) at a time to the caller,
; ignoring line boundaries.

; State kept between calls to (get_line):
;
;   buffer - text read from the input and not yet returned
;   len    - length of buffer
;   line   - the chunk about to be returned

(let ((buffer "")
      (len 0)
      (line ""))

   ; Returns the next chunk of input, at most maxlen long, or 0 once
   ; (getchars) yields nothing more.

   (defun get_line ()
      (catch

         ; Refill the buffer when it is empty.  If nothing can be read, throw
         ; 0 so that it becomes the value of the surrounding (catch).

         (when (not buffer)
            (if (setq buffer (getchars 4096))
               (setq len (length buffer))
               (throw 0)))

         (setq line (substring buffer 0 maxlen))

         ; Keep whatever lies beyond the chunk for the next call, or mark the
         ; buffer as empty when the chunk consumed all of it.

         (if (> len maxlen)
            (progn
               (setq buffer (substring buffer maxlen 0))
               (dec len maxlen))

            (setq buffer "")
            (setq len 0))

         line)))

; See if the user has provided us with a different symbol from the default
; 'document to bind to the final alist.

; Load the option-handling library from the library directory and process
; the command-line options.

(load (join "/" (libdir) "options.munger"))
(getopt)

; Use the value of the "s" option, interned as a symbol, when it is present;
; otherwise fall back to 'document.

(if (setq symbol (lookup options "s"))
   (setq symbol (intern symbol))
   (setq symbol 'document))

; We read from the first filename specified on the command-line, or from
; stdin.  The call to (getopt) above has left the argument pointer pointing to
; the last option, if any, or the script name, so that we need only call
; (next) once to get the name of the first command-line argument.

; When another command-line argument exists, redirect descriptor 0 to the
; file it names.

(when (next)
   (redirect 0 (current)))

; Toplevel function.

(let ((line ""))

   ; Open the output expression: the alist is bound to the chosen symbol and
   ; wrapped in a "document" element which has no attributes.

   (print "(setq " symbol " '((\"document\" () (")

   ; Hand each chunk to the lexer until (get_line) returns a false value.

   (while (setq line (get_line))
      (parse line))

   ; The loop ends with line holding the final value returned by (get_line);
   ; passing it to (parse) lets the lexer flush its pending token.

   (parse line)

   ; Close the child list, the "document" element, the quoted list and the
   ; setq.

   (print "))))")
   (quit))