1;;;; (sxml simple) -- a simple interface to the SSAX parser 2;;;; 3;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc. 4;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. 5;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm. 6;;;; 7;;;; This library is free software; you can redistribute it and/or 8;;;; modify it under the terms of the GNU Lesser General Public 9;;;; License as published by the Free Software Foundation; either 10;;;; version 3 of the License, or (at your option) any later version. 11;;;; 12;;;; This library is distributed in the hope that it will be useful, 13;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 14;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15;;;; Lesser General Public License for more details. 16;;;; 17;;;; You should have received a copy of the GNU Lesser General Public 18;;;; License along with this library; if not, write to the Free Software 19;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20;;;; 21 22;;; Commentary: 23;; 24;;A simple interface to XML parsing and serialization. 25;; 26;;; Code: 27 28(define-module (sxml simple) 29 #:use-module (sxml ssax input-parse) 30 #:use-module (sxml ssax) 31 #:use-module (sxml transform) 32 #:use-module (ice-9 match) 33 #:use-module (srfi srfi-13) 34 #:export (xml->sxml sxml->xml sxml->string)) 35 36;; Helpers from upstream/SSAX.scm. 37;; 38 39; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS 40; given the list of fragments (some of which are text strings) 41; reverse the list and concatenate adjacent text strings. 42; We can prove from the general case below that if LIST-OF-FRAGS 43; has zero or one element, the result of the procedure is equal? 44; to its argument. This fact justifies the shortcut evaluation below. 45(define (ssax:reverse-collect-str fragments) 46 (cond 47 ((null? fragments) '()) ; a shortcut 48 ((null? (cdr fragments)) fragments) ; see the comment above 49 (else 50 (let loop ((fragments fragments) (result '()) (strs '())) 51 (cond 52 ((null? fragments) 53 (if (null? strs) result 54 (cons (string-concatenate/shared strs) result))) 55 ((string? (car fragments)) 56 (loop (cdr fragments) result (cons (car fragments) strs))) 57 (else 58 (loop (cdr fragments) 59 (cons 60 (car fragments) 61 (if (null? strs) result 62 (cons (string-concatenate/shared strs) result))) 63 '()))))))) 64 65(define (read-internal-doctype-as-string port) 66 (string-concatenate/shared 67 (let loop () 68 (let ((fragment 69 (next-token '() '(#\]) "reading internal DOCTYPE" port))) 70 (if (eqv? #\> (peek-next-char port)) 71 (begin 72 (read-char port) 73 (cons fragment '())) 74 (cons* fragment "]" (loop))))))) 75 76;; Ideas for the future for this interface: 77;; 78;; * Allow doctypes to provide parsed entities 79;; 80;; * Allow validation (the ELEMENTS value from the DOCTYPE handler 81;; below) 82;; 83;; * Parse internal DTDs 84;; 85;; * Parse external DTDs 86;; 87(define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key 88 (namespaces '()) 89 (declare-namespaces? #t) 90 (trim-whitespace? #f) 91 (entities '()) 92 (default-entity-handler #f) 93 (doctype-handler #f)) 94 "Use SSAX to parse an XML document into SXML. Takes one optional 95argument, @var{string-or-port}, which defaults to the current input 96port." 97 ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix 98 ;; that the user wants on elements of a given namespace in the 99 ;; resulting SXML, regardless of the abbreviated namespaces defined in 100 ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true, 101 ;; these namespaces are treated as if they were declared in the DTD. 102 103 ;; ENTITIES: alist of SYMBOL -> STRING. 104 105 ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)). 106 ;; A DOC-PREFIX of #f indicates that it comes from the user. 107 ;; Otherwise, prefixes are symbols. 108 (define (munge-namespaces namespaces) 109 (map (lambda (el) 110 (match el 111 ((prefix . uri-string) 112 (cons* (and declare-namespaces? prefix) 113 prefix 114 (ssax:uri-string->symbol uri-string))))) 115 namespaces)) 116 117 (define (user-namespaces) 118 (munge-namespaces namespaces)) 119 120 (define (user-entities) 121 (if (and default-entity-handler 122 (not (assq '*DEFAULT* entities))) 123 (acons '*DEFAULT* default-entity-handler entities) 124 entities)) 125 126 (define (name->sxml name) 127 (match name 128 ((prefix . local-part) 129 (symbol-append prefix (string->symbol ":") local-part)) 130 (_ name))) 131 132 (define (doctype-continuation seed) 133 (lambda* (#:key (entities '()) (namespaces '())) 134 (values #f 135 (append entities (user-entities)) 136 (append (munge-namespaces namespaces) (user-namespaces)) 137 seed))) 138 139 ;; The SEED in this parser is the SXML: initialized to '() at each new 140 ;; level by the fdown handlers; built in reverse by the fhere parsers; 141 ;; and reverse-collected by the fup handlers. 142 (define parser 143 (ssax:make-parser 144 NEW-LEVEL-SEED ; fdown 145 (lambda (elem-gi attributes namespaces expected-content seed) 146 '()) 147 148 FINISH-ELEMENT ; fup 149 (lambda (elem-gi attributes namespaces parent-seed seed) 150 (let ((seed (if trim-whitespace? 151 (ssax:reverse-collect-str-drop-ws seed) 152 (ssax:reverse-collect-str seed))) 153 (attrs (attlist-fold 154 (lambda (attr accum) 155 (cons (list (name->sxml (car attr)) (cdr attr)) 156 accum)) 157 '() attributes))) 158 (acons (name->sxml elem-gi) 159 (if (null? attrs) 160 seed 161 (cons (cons '@ attrs) seed)) 162 parent-seed))) 163 164 CHAR-DATA-HANDLER ; fhere 165 (lambda (string1 string2 seed) 166 (if (string-null? string2) 167 (cons string1 seed) 168 (cons* string2 string1 seed))) 169 170 DOCTYPE 171 ;; -> ELEMS ENTITIES NAMESPACES SEED 172 ;; 173 ;; ELEMS is for validation and currently unused. 174 ;; 175 ;; ENTITIES is an alist of parsed entities (symbol -> string). 176 ;; 177 ;; NAMESPACES is as above. 178 ;; 179 ;; SEED builds up the content. 180 (lambda (port docname systemid internal-subset? seed) 181 (call-with-values 182 (lambda () 183 (cond 184 (doctype-handler 185 (doctype-handler docname systemid 186 (and internal-subset? 187 (read-internal-doctype-as-string port)))) 188 (else 189 (when internal-subset? 190 (ssax:skip-internal-dtd port)) 191 (values)))) 192 (doctype-continuation seed))) 193 194 UNDECL-ROOT 195 ;; This is like the DOCTYPE handler, but for documents that do not 196 ;; have a <!DOCTYPE!> entry. 197 (lambda (elem-gi seed) 198 (call-with-values 199 (lambda () 200 (if doctype-handler 201 (doctype-handler #f #f #f) 202 (values))) 203 (doctype-continuation seed))) 204 205 PI 206 ((*DEFAULT* 207 . (lambda (port pi-tag seed) 208 (cons 209 (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) 210 seed)))))) 211 212 (let* ((port (if (string? string-or-port) 213 (open-input-string string-or-port) 214 string-or-port)) 215 (elements (reverse (parser port '())))) 216 `(*TOP* ,@elements))) 217 218(define check-name 219 (let ((*good-cache* (make-hash-table))) 220 (lambda (name) 221 (if (not (hashq-ref *good-cache* name)) 222 (let* ((str (symbol->string name)) 223 (i (string-index str #\:)) 224 (head (or (and i (substring str 0 i)) str)) 225 (tail (and i (substring str (1+ i))))) 226 (and i (string-index (substring str (1+ i)) #\:) 227 (error "Invalid QName: more than one colon" name)) 228 (for-each 229 (lambda (s) 230 (and s 231 (or (char-alphabetic? (string-ref s 0)) 232 (eq? (string-ref s 0) #\_) 233 (error "Invalid name starting character" s name)) 234 (string-for-each 235 (lambda (c) 236 (or (char-alphabetic? c) (string-index "0123456789.-_" c) 237 (error "Invalid name character" c s name))) 238 s))) 239 (list head tail)) 240 (hashq-set! *good-cache* name #t)))))) 241 242;; The following two functions serialize tags and attributes. They are 243;; being used in the node handlers for the post-order function, see 244;; below. 245 246(define (attribute-value->xml value port) 247 (cond 248 ((pair? value) 249 (attribute-value->xml (car value) port) 250 (attribute-value->xml (cdr value) port)) 251 ((null? value) 252 *unspecified*) 253 ((string? value) 254 (string->escaped-xml value port)) 255 ((procedure? value) 256 (with-output-to-port port value)) 257 (else 258 (string->escaped-xml 259 (call-with-output-string (lambda (port) (display value port))) 260 port)))) 261 262(define (attribute->xml attr value port) 263 (check-name attr) 264 (display attr port) 265 (display "=\"" port) 266 (attribute-value->xml value port) 267 (display #\" port)) 268 269(define (element->xml tag attrs body port) 270 (check-name tag) 271 (display #\< port) 272 (display tag port) 273 (if attrs 274 (let lp ((attrs attrs)) 275 (if (pair? attrs) 276 (let ((attr (car attrs))) 277 (display #\space port) 278 (if (pair? attr) 279 (attribute->xml (car attr) (cdr attr) port) 280 (error "bad attribute" tag attr)) 281 (lp (cdr attrs))) 282 (if (not (null? attrs)) 283 (error "bad attributes" tag attrs))))) 284 (if (pair? body) 285 (begin 286 (display #\> port) 287 (let lp ((body body)) 288 (cond 289 ((pair? body) 290 (sxml->xml (car body) port) 291 (lp (cdr body))) 292 ((null? body) 293 (display "</" port) 294 (display tag port) 295 (display ">" port)) 296 (else 297 (error "bad element body" tag body))))) 298 (display " />" port))) 299 300;; FIXME: ensure name is valid 301(define (entity->xml name port) 302 (display #\& port) 303 (display name port) 304 (display #\; port)) 305 306;; FIXME: ensure tag and str are valid 307(define (pi->xml tag str port) 308 (display "<?" port) 309 (display tag port) 310 (display #\space port) 311 (display str port) 312 (display "?>" port)) 313 314(define* (sxml->xml tree #:optional (port (current-output-port))) 315 "Serialize the sxml tree @var{tree} as XML. The output will be written 316to the current output port, unless the optional argument @var{port} is 317present." 318 (cond 319 ((pair? tree) 320 (if (symbol? (car tree)) 321 ;; An element. 322 (let ((tag (car tree))) 323 (case tag 324 ((*TOP*) 325 (sxml->xml (cdr tree) port)) 326 ((*ENTITY*) 327 (if (and (list? (cdr tree)) (= (length (cdr tree)) 1)) 328 (entity->xml (cadr tree) port) 329 (error "bad *ENTITY* args" (cdr tree)))) 330 ((*PI*) 331 (if (and (list? (cdr tree)) (= (length (cdr tree)) 2)) 332 (pi->xml (cadr tree) (caddr tree) port) 333 (error "bad *PI* args" (cdr tree)))) 334 (else 335 (let* ((elems (cdr tree)) 336 (attrs (and (pair? elems) (pair? (car elems)) 337 (eq? '@ (caar elems)) 338 (cdar elems)))) 339 (element->xml tag attrs (if attrs (cdr elems) elems) port))))) 340 ;; A nodelist. 341 (for-each (lambda (x) (sxml->xml x port)) tree))) 342 ((string? tree) 343 (string->escaped-xml tree port)) 344 ((null? tree) *unspecified*) 345 ((not tree) *unspecified*) 346 ((eqv? tree #t) *unspecified*) 347 ((procedure? tree) 348 (with-output-to-port port tree)) 349 (else 350 (string->escaped-xml 351 (call-with-output-string (lambda (port) (display tree port))) 352 port)))) 353 354(define (sxml->string sxml) 355 "Detag an sxml tree @var{sxml} into a string. Does not perform any 356formatting." 357 (string-concatenate-reverse 358 (foldts 359 (lambda (seed tree) ; fdown 360 '()) 361 (lambda (seed kid-seed tree) ; fup 362 (append! kid-seed seed)) 363 (lambda (seed tree) ; fhere 364 (if (string? tree) (cons tree seed) seed)) 365 '() 366 sxml))) 367 368(define (make-char-quotator char-encoding) 369 (let ((bad-chars (list->char-set (map car char-encoding)))) 370 371 ;; Check to see if str contains one of the characters in charset, 372 ;; from the position i onward. If so, return that character's index. 373 ;; otherwise, return #f 374 (define (index-cset str i charset) 375 (string-index str charset i)) 376 377 ;; The body of the function 378 (lambda (str port) 379 (let ((bad-pos (index-cset str 0 bad-chars))) 380 (if (not bad-pos) 381 (display str port) ; str had all good chars 382 (let loop ((from 0) (to bad-pos)) 383 (cond 384 ((>= from (string-length str)) *unspecified*) 385 ((not to) 386 (display (substring str from (string-length str)) port)) 387 (else 388 (let ((quoted-char 389 (cdr (assv (string-ref str to) char-encoding))) 390 (new-to 391 (index-cset str (+ 1 to) bad-chars))) 392 (if (< from to) 393 (display (substring str from to) port)) 394 (display quoted-char port) 395 (loop (1+ to) new-to)))))))))) 396 397;; Given a string, check to make sure it does not contain characters 398;; such as '<' or '&' that require encoding. Return either the original 399;; string, or a list of string fragments with special characters 400;; replaced by appropriate character entities. 401 402(define string->escaped-xml 403 (make-char-quotator 404 '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """)))) 405 406;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac 407;;; simple.scm ends here 408 409