1;;;; (sxml xpath) -- SXPath 2;;;; 3;;;; Copyright (C) 2009 Free Software Foundation, Inc. 4;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. 5;;;; Written 2001 by Oleg Kiselyov <oleg at pobox dot com> SXPath.scm. 6;;;; 7;;;; This library is free software; you can redistribute it and/or 8;;;; modify it under the terms of the GNU Lesser General Public 9;;;; License as published by the Free Software Foundation; either 10;;;; version 3 of the License, or (at your option) any later version. 11;;;; 12;;;; This library is distributed in the hope that it will be useful, 13;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 14;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15;;;; Lesser General Public License for more details. 16;;;; 17;;;; You should have received a copy of the GNU Lesser General Public 18;;;; License along with this library; if not, write to the Free Software 19;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20;;;; 21 22;;; Commentary: 23;; 24;;@heading SXPath: SXML Query Language 25;; 26;; SXPath is a query language for SXML, an instance of XML Information 27;; set (Infoset) in the form of s-expressions. See @code{(sxml ssax)} 28;; for the definition of SXML and more details. SXPath is also a 29;; translation into Scheme of an XML Path Language, 30;; @uref{http://www.w3.org/TR/xpath,XPath}. XPath and SXPath describe 31;; means of selecting a set of Infoset's items or their properties. 32;; 33;; To facilitate queries, XPath maps the XML Infoset into an explicit 34;; tree, and introduces important notions of a location path and a 35;; current, context node. A location path denotes a selection of a set of 36;; nodes relative to a context node. Any XPath tree has a distinguished, 37;; root node -- which serves as the context node for absolute location 38;; paths. Location path is recursively defined as a location step joined 39;; with a location path. A location step is a simple query of the 40;; database relative to a context node. A step may include expressions 41;; that further filter the selected set. Each node in the resulting set 42;; is used as a context node for the adjoining location path. The result 43;; of the step is a union of the sets returned by the latter location 44;; paths. 45;; 46;; The SXML representation of the XML Infoset (see SSAX.scm) is rather 47;; suitable for querying as it is. Bowing to the XPath specification, 48;; we will refer to SXML information items as 'Nodes': 49;;@example 50;; <Node> ::= <Element> | <attributes-coll> | <attrib> 51;; | "text string" | <PI> 52;;@end example 53;; This production can also be described as 54;;@example 55;; <Node> ::= (name . <Nodeset>) | "text string" 56;;@end example 57;; An (ordered) set of nodes is just a list of the constituent nodes: 58;;@example 59;; <Nodeset> ::= (<Node> ...) 60;;@end example 61;; Nodesets, and Nodes other than text strings are both lists. A 62;; <Nodeset> however is either an empty list, or a list whose head is not 63;; a symbol. A symbol at the head of a node is either an XML name (in 64;; which case it's a tag of an XML element), or an administrative name 65;; such as '@@'. This uniform list representation makes processing rather 66;; simple and elegant, while avoiding confusion. The multi-branch tree 67;; structure formed by the mutually-recursive datatypes <Node> and 68;; <Nodeset> lends itself well to processing by functional languages. 69;; 70;; A location path is in fact a composite query over an XPath tree or 71;; its branch. A singe step is a combination of a projection, selection 72;; or a transitive closure. Multiple steps are combined via join and 73;; union operations. This insight allows us to @emph{elegantly} 74;; implement XPath as a sequence of projection and filtering primitives 75;; -- converters -- joined by @dfn{combinators}. Each converter takes a 76;; node and returns a nodeset which is the result of the corresponding 77;; query relative to that node. A converter can also be called on a set 78;; of nodes. In that case it returns a union of the corresponding 79;; queries over each node in the set. The union is easily implemented as 80;; a list append operation as all nodes in a SXML tree are considered 81;; distinct, by XPath conventions. We also preserve the order of the 82;; members in the union. Query combinators are high-order functions: 83;; they take converter(s) (which is a Node|Nodeset -> Nodeset function) 84;; and compose or otherwise combine them. We will be concerned with only 85;; relative location paths [XPath]: an absolute location path is a 86;; relative path applied to the root node. 87;; 88;; Similarly to XPath, SXPath defines full and abbreviated notations 89;; for location paths. In both cases, the abbreviated notation can be 90;; mechanically expanded into the full form by simple rewriting 91;; rules. In case of SXPath the corresponding rules are given as 92;; comments to a sxpath function, below. The regression test suite at 93;; the end of this file shows a representative sample of SXPaths in 94;; both notations, juxtaposed with the corresponding XPath 95;; expressions. Most of the samples are borrowed literally from the 96;; XPath specification, while the others are adjusted for our running 97;; example, tree1. 98;; 99;;; Code: 100 101(define-module (sxml xpath) 102 #:use-module (ice-9 pretty-print) 103 #:export (nodeset? node-typeof? node-eq? node-equal? node-pos 104 filter take-until take-after map-union node-reverse 105 node-trace select-kids node-self node-join node-reduce 106 node-or node-closure node-parent 107 sxpath)) 108 109;; Upstream version: 110; $Id: SXPath.scm,v 3.5 2001/01/12 23:20:35 oleg Exp oleg $ 111 112(define (nodeset? x) 113 (or (and (pair? x) (not (symbol? (car x)))) (null? x))) 114 115;------------------------- 116; Basic converters and applicators 117; A converter is a function 118; type Converter = Node|Nodeset -> Nodeset 119; A converter can also play a role of a predicate: in that case, if a 120; converter, applied to a node or a nodeset, yields a non-empty 121; nodeset, the converter-predicate is deemed satisfied. Throughout 122; this file a nil nodeset is equivalent to #f in denoting a failure. 123 124; The following function implements a 'Node test' as defined in 125; Sec. 2.3 of XPath document. A node test is one of the components of a 126; location step. It is also a converter-predicate in SXPath. 127; 128; The function node-typeof? takes a type criterion and returns a function, 129; which, when applied to a node, will tell if the node satisfies 130; the test. 131; node-typeof? :: Crit -> Node -> Boolean 132; 133; The criterion 'crit' is a symbol, one of the following: 134; id - tests if the Node has the right name (id) 135; @ - tests if the Node is an <attributes-coll> 136; * - tests if the Node is an <Element> 137; *text* - tests if the Node is a text node 138; *PI* - tests if the Node is a PI node 139; *any* - #t for any type of Node 140 141(define (node-typeof? crit) 142 (lambda (node) 143 (case crit 144 ((*) (and (pair? node) (not (memq (car node) '(@ *PI*))))) 145 ((*any*) #t) 146 ((*text*) (string? node)) 147 (else 148 (and (pair? node) (eq? crit (car node)))) 149))) 150 151 152; Curried equivalence converter-predicates 153(define (node-eq? other) 154 (lambda (node) 155 (eq? other node))) 156 157(define (node-equal? other) 158 (lambda (node) 159 (equal? other node))) 160 161; node-pos:: N -> Nodeset -> Nodeset, or 162; node-pos:: N -> Converter 163; Select the N'th element of a Nodeset and return as a singular Nodeset; 164; Return an empty nodeset if the Nth element does not exist. 165; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset, 166; if exists; ((node-pos 2) Nodeset) selects the Node after that, if 167; exists. 168; N can also be a negative number: in that case the node is picked from 169; the tail of the list. 170; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset; 171; ((node-pos -2) Nodeset) selects the last but one node, if exists. 172 173(define (node-pos n) 174 (lambda (nodeset) 175 (cond 176 ((not (nodeset? nodeset)) '()) 177 ((null? nodeset) nodeset) 178 ((eqv? n 1) (list (car nodeset))) 179 ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset)) 180 (else 181 (or (positive? n) (error "yikes!")) 182 ((node-pos (1- n)) (cdr nodeset)))))) 183 184; filter:: Converter -> Converter 185; A filter applicator, which introduces a filtering context. The argument 186; converter is considered a predicate, with either #f or nil result meaning 187; failure. 188(define (filter pred?) 189 (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) 190 (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '())) 191 (if (null? lst) 192 (reverse res) 193 (let ((pred-result (pred? (car lst)))) 194 (loop (cdr lst) 195 (if (and pred-result (not (null? pred-result))) 196 (cons (car lst) res) 197 res))))))) 198 199; take-until:: Converter -> Converter, or 200; take-until:: Pred -> Node|Nodeset -> Nodeset 201; Given a converter-predicate and a nodeset, apply the predicate to 202; each element of the nodeset, until the predicate yields anything but #f or 203; nil. Return the elements of the input nodeset that have been processed 204; till that moment (that is, which fail the predicate). 205; take-until is a variation of the filter above: take-until passes 206; elements of an ordered input set till (but not including) the first 207; element that satisfies the predicate. 208; The nodeset returned by ((take-until (not pred)) nset) is a subset -- 209; to be more precise, a prefix -- of the nodeset returned by 210; ((filter pred) nset) 211 212(define (take-until pred?) 213 (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) 214 (let loop ((lst (if (nodeset? lst) lst (list lst)))) 215 (if (null? lst) lst 216 (let ((pred-result (pred? (car lst)))) 217 (if (and pred-result (not (null? pred-result))) 218 '() 219 (cons (car lst) (loop (cdr lst))))) 220 )))) 221 222 223; take-after:: Converter -> Converter, or 224; take-after:: Pred -> Node|Nodeset -> Nodeset 225; Given a converter-predicate and a nodeset, apply the predicate to 226; each element of the nodeset, until the predicate yields anything but #f or 227; nil. Return the elements of the input nodeset that have not been processed: 228; that is, return the elements of the input nodeset that follow the first 229; element that satisfied the predicate. 230; take-after along with take-until partition an input nodeset into three 231; parts: the first element that satisfies a predicate, all preceding 232; elements and all following elements. 233 234(define (take-after pred?) 235 (lambda (lst) ; a nodeset or a node (will be converted to a singleton nset) 236 (let loop ((lst (if (nodeset? lst) lst (list lst)))) 237 (if (null? lst) lst 238 (let ((pred-result (pred? (car lst)))) 239 (if (and pred-result (not (null? pred-result))) 240 (cdr lst) 241 (loop (cdr lst)))) 242 )))) 243 244; Apply proc to each element of lst and return the list of results. 245; if proc returns a nodeset, splice it into the result 246; 247; From another point of view, map-union is a function Converter->Converter, 248; which places an argument-converter in a joining context. 249 250(define (map-union proc lst) 251 (if (null? lst) lst 252 (let ((proc-res (proc (car lst)))) 253 ((if (nodeset? proc-res) append cons) 254 proc-res (map-union proc (cdr lst)))))) 255 256; node-reverse :: Converter, or 257; node-reverse:: Node|Nodeset -> Nodeset 258; Reverses the order of nodes in the nodeset 259; This basic converter is needed to implement a reverse document order 260; (see the XPath Recommendation). 261(define node-reverse 262 (lambda (node-or-nodeset) 263 (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset) 264 (reverse node-or-nodeset)))) 265 266; node-trace:: String -> Converter 267; (node-trace title) is an identity converter. In addition it prints out 268; a node or nodeset it is applied to, prefixed with the 'title'. 269; This converter is very useful for debugging. 270 271(define (node-trace title) 272 (lambda (node-or-nodeset) 273 (display "\n-->") 274 (display title) 275 (display " :") 276 (pretty-print node-or-nodeset) 277 node-or-nodeset)) 278 279 280;------------------------- 281; Converter combinators 282; 283; Combinators are higher-order functions that transmogrify a converter 284; or glue a sequence of converters into a single, non-trivial 285; converter. The goal is to arrive at converters that correspond to 286; XPath location paths. 287; 288; From a different point of view, a combinator is a fixed, named 289; _pattern_ of applying converters. Given below is a complete set of 290; such patterns that together implement XPath location path 291; specification. As it turns out, all these combinators can be built 292; from a small number of basic blocks: regular functional composition, 293; map-union and filter applicators, and the nodeset union. 294 295 296 297; select-kids:: Pred -> Node -> Nodeset 298; Given a Node, return an (ordered) subset its children that satisfy 299; the Pred (a converter, actually) 300; select-kids:: Pred -> Nodeset -> Nodeset 301; The same as above, but select among children of all the nodes in 302; the Nodeset 303; 304; More succinctly, the signature of this function is 305; select-kids:: Converter -> Converter 306 307(define (select-kids test-pred?) 308 (lambda (node) ; node or node-set 309 (cond 310 ((null? node) node) 311 ((not (pair? node)) '()) ; No children 312 ((symbol? (car node)) 313 ((filter test-pred?) (cdr node))) ; it's a single node 314 (else (map-union (select-kids test-pred?) node))))) 315 316 317; node-self:: Pred -> Node -> Nodeset, or 318; node-self:: Converter -> Converter 319; Similar to select-kids but apply to the Node itself rather 320; than to its children. The resulting Nodeset will contain either one 321; component, or will be empty (if the Node failed the Pred). 322(define node-self filter) 323 324 325; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or 326; node-join:: [Converter] -> Converter 327; join the sequence of location steps or paths as described 328; in the title comments above. 329(define (node-join . selectors) 330 (lambda (nodeset) ; Nodeset or node 331 (let loop ((nodeset nodeset) (selectors selectors)) 332 (if (null? selectors) nodeset 333 (loop 334 (if (nodeset? nodeset) 335 (map-union (car selectors) nodeset) 336 ((car selectors) nodeset)) 337 (cdr selectors)))))) 338 339 340; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or 341; node-reduce:: [Converter] -> Converter 342; A regular functional composition of converters. 343; From a different point of view, 344; ((apply node-reduce converters) nodeset) 345; is equivalent to 346; (foldl apply nodeset converters) 347; i.e., folding, or reducing, a list of converters with the nodeset 348; as a seed. 349(define (node-reduce . converters) 350 (lambda (nodeset) ; Nodeset or node 351 (let loop ((nodeset nodeset) (converters converters)) 352 (if (null? converters) nodeset 353 (loop ((car converters) nodeset) (cdr converters)))))) 354 355 356; node-or:: [Converter] -> Converter 357; This combinator applies all converters to a given node and 358; produces the union of their results. 359; This combinator corresponds to a union, '|' operation for XPath 360; location paths. 361; (define (node-or . converters) 362; (lambda (node-or-nodeset) 363; (if (null? converters) node-or-nodeset 364; (append 365; ((car converters) node-or-nodeset) 366; ((apply node-or (cdr converters)) node-or-nodeset))))) 367; More optimal implementation follows 368(define (node-or . converters) 369 (lambda (node-or-nodeset) 370 (let loop ((result '()) (converters converters)) 371 (if (null? converters) result 372 (loop (append result (or ((car converters) node-or-nodeset) '())) 373 (cdr converters)))))) 374 375 376; node-closure:: Converter -> Converter 377; Select all _descendants_ of a node that satisfy a converter-predicate. 378; This combinator is similar to select-kids but applies to 379; grand... children as well. 380; This combinator implements the "descendant::" XPath axis 381; Conceptually, this combinator can be expressed as 382; (define (node-closure f) 383; (node-or 384; (select-kids f) 385; (node-reduce (select-kids (node-typeof? '*)) (node-closure f)))) 386; This definition, as written, looks somewhat like a fixpoint, and it 387; will run forever. It is obvious however that sooner or later 388; (select-kids (node-typeof? '*)) will return an empty nodeset. At 389; this point further iterations will no longer affect the result and 390; can be stopped. 391 392(define (node-closure test-pred?) 393 (lambda (node) ; Nodeset or node 394 (let loop ((parent node) (result '())) 395 (if (null? parent) result 396 (loop ((select-kids (node-typeof? '*)) parent) 397 (append result 398 ((select-kids test-pred?) parent))) 399 )))) 400 401; node-parent:: RootNode -> Converter 402; (node-parent rootnode) yields a converter that returns a parent of a 403; node it is applied to. If applied to a nodeset, it returns the list 404; of parents of nodes in the nodeset. The rootnode does not have 405; to be the root node of the whole SXML tree -- it may be a root node 406; of a branch of interest. 407; Given the notation of Philip Wadler's paper on semantics of XSLT, 408; parent(x) = { y | y=subnode*(root), x=subnode(y) } 409; Therefore, node-parent is not the fundamental converter: it can be 410; expressed through the existing ones. Yet node-parent is a rather 411; convenient converter. It corresponds to a parent:: axis of SXPath. 412; Note that the parent:: axis can be used with an attribute node as well! 413 414(define (node-parent rootnode) 415 (lambda (node) ; Nodeset or node 416 (if (nodeset? node) (map-union (node-parent rootnode) node) 417 (let ((pred 418 (node-or 419 (node-reduce 420 (node-self (node-typeof? '*)) 421 (select-kids (node-eq? node))) 422 (node-join 423 (select-kids (node-typeof? '@)) 424 (select-kids (node-eq? node)))))) 425 ((node-or 426 (node-self pred) 427 (node-closure pred)) 428 rootnode))))) 429 430;------------------------- 431; Evaluate an abbreviated SXPath 432; sxpath:: AbbrPath -> Converter, or 433; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset 434; AbbrPath is a list. It is translated to the full SXPath according 435; to the following rewriting rules 436; (sxpath '()) -> (node-join) 437; (sxpath '(path-component ...)) -> 438; (node-join (sxpath1 path-component) (sxpath '(...))) 439; (sxpath1 '//) -> (node-or 440; (node-self (node-typeof? '*any*)) 441; (node-closure (node-typeof? '*any*))) 442; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x)) 443; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x)) 444; (sxpath1 ?symbol) -> (select-kids (node-typeof? ?symbol) 445; (sxpath1 procedure) -> procedure 446; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...)) 447; (sxpath1 '(path reducer ...)) -> 448; (node-reduce (sxpath path) (sxpathr reducer) ...) 449; (sxpathr number) -> (node-pos number) 450; (sxpathr path-filter) -> (filter (sxpath path-filter)) 451 452(define (sxpath path) 453 (lambda (nodeset) 454 (let loop ((nodeset nodeset) (path path)) 455 (cond 456 ((null? path) nodeset) 457 ((nodeset? nodeset) 458 (map-union (sxpath path) nodeset)) 459 ((procedure? (car path)) 460 (loop ((car path) nodeset) (cdr path))) 461 ((eq? '// (car path)) 462 (loop 463 ((if (nodeset? nodeset) append cons) nodeset 464 ((node-closure (node-typeof? '*any*)) nodeset)) 465 (cdr path))) 466 ((symbol? (car path)) 467 (loop ((select-kids (node-typeof? (car path))) nodeset) 468 (cdr path))) 469 ((and (pair? (car path)) (eq? 'equal? (caar path))) 470 (loop ((select-kids (apply node-equal? (cdar path))) nodeset) 471 (cdr path))) 472 ((and (pair? (car path)) (eq? 'eq? (caar path))) 473 (loop ((select-kids (apply node-eq? (cdar path))) nodeset) 474 (cdr path))) 475 ((pair? (car path)) 476 (let reducer ((nodeset 477 (if (symbol? (caar path)) 478 ((select-kids (node-typeof? (caar path))) nodeset) 479 (loop nodeset (caar path)))) 480 (reducing-path (cdar path))) 481 (cond 482 ((null? reducing-path) (loop nodeset (cdr path))) 483 ((number? (car reducing-path)) 484 (reducer ((node-pos (car reducing-path)) nodeset) 485 (cdr reducing-path))) 486 (else 487 (reducer ((filter (sxpath (car reducing-path))) nodeset) 488 (cdr reducing-path)))))) 489 (else 490 (error "Invalid path step: " (car path))))))) 491 492;;; arch-tag: c4e57abf-6b61-4612-a6aa-d1536d440774 493;;; xpath.scm ends here 494