1;;;; (sxml xpath) -- SXPath
2;;;;
3;;;; 	Copyright (C) 2009  Free Software Foundation, Inc.
4;;;;    Modified 2004 by Andy Wingo <wingo at pobox dot com>.
5;;;;    Written 2001 by Oleg Kiselyov <oleg at pobox dot com> SXPath.scm.
6;;;;
7;;;; This library is free software; you can redistribute it and/or
8;;;; modify it under the terms of the GNU Lesser General Public
9;;;; License as published by the Free Software Foundation; either
10;;;; version 3 of the License, or (at your option) any later version.
11;;;;
12;;;; This library is distributed in the hope that it will be useful,
13;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;;;; Lesser General Public License for more details.
16;;;;
17;;;; You should have received a copy of the GNU Lesser General Public
18;;;; License along with this library; if not, write to the Free Software
19;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;;;;
21
22;;; Commentary:
23;;
24;;@heading SXPath: SXML Query Language
25;;
26;; SXPath is a query language for SXML, an instance of XML Information
27;; set (Infoset) in the form of s-expressions. See @code{(sxml ssax)}
28;; for the definition of SXML and more details. SXPath is also a
29;; translation into Scheme of an XML Path Language,
30;; @uref{http://www.w3.org/TR/xpath,XPath}. XPath and SXPath describe
31;; means of selecting a set of Infoset's items or their properties.
32;;
33;; To facilitate queries, XPath maps the XML Infoset into an explicit
34;; tree, and introduces important notions of a location path and a
35;; current, context node. A location path denotes a selection of a set of
36;; nodes relative to a context node. Any XPath tree has a distinguished,
37;; root node -- which serves as the context node for absolute location
38;; paths. Location path is recursively defined as a location step joined
39;; with a location path. A location step is a simple query of the
40;; database relative to a context node. A step may include expressions
41;; that further filter the selected set. Each node in the resulting set
42;; is used as a context node for the adjoining location path. The result
43;; of the step is a union of the sets returned by the latter location
44;; paths.
45;;
46;; The SXML representation of the XML Infoset (see SSAX.scm) is rather
47;; suitable for querying as it is. Bowing to the XPath specification,
48;; we will refer to SXML information items as 'Nodes':
49;;@example
50;; 	<Node> ::= <Element> | <attributes-coll> | <attrib>
51;; 		   | "text string" | <PI>
52;;@end example
53;; This production can also be described as
54;;@example
55;;	<Node> ::= (name . <Nodeset>) | "text string"
56;;@end example
57;; An (ordered) set of nodes is just a list of the constituent nodes:
58;;@example
59;; 	<Nodeset> ::= (<Node> ...)
60;;@end example
61;; Nodesets, and Nodes other than text strings are both lists. A
62;; <Nodeset> however is either an empty list, or a list whose head is not
63;; a symbol.  A symbol at the head of a node is either an XML name (in
64;; which case it's a tag of an XML element), or an administrative name
65;; such as '@@'.  This uniform list representation makes processing rather
66;; simple and elegant, while avoiding confusion. The multi-branch tree
67;; structure formed by the mutually-recursive datatypes <Node> and
68;; <Nodeset> lends itself well to processing by functional languages.
69;;
70;; A location path is in fact a composite query over an XPath tree or
71;; its branch. A singe step is a combination of a projection, selection
72;; or a transitive closure. Multiple steps are combined via join and
73;; union operations. This insight allows us to @emph{elegantly}
74;; implement XPath as a sequence of projection and filtering primitives
75;; -- converters -- joined by @dfn{combinators}. Each converter takes a
76;; node and returns a nodeset which is the result of the corresponding
77;; query relative to that node. A converter can also be called on a set
78;; of nodes. In that case it returns a union of the corresponding
79;; queries over each node in the set. The union is easily implemented as
80;; a list append operation as all nodes in a SXML tree are considered
81;; distinct, by XPath conventions. We also preserve the order of the
82;; members in the union. Query combinators are high-order functions:
83;; they take converter(s) (which is a Node|Nodeset -> Nodeset function)
84;; and compose or otherwise combine them. We will be concerned with only
85;; relative location paths [XPath]: an absolute location path is a
86;; relative path applied to the root node.
87;;
88;; Similarly to XPath, SXPath defines full and abbreviated notations
89;; for location paths. In both cases, the abbreviated notation can be
90;; mechanically expanded into the full form by simple rewriting
91;; rules. In case of SXPath the corresponding rules are given as
92;; comments to a sxpath function, below. The regression test suite at
93;; the end of this file shows a representative sample of SXPaths in
94;; both notations, juxtaposed with the corresponding XPath
95;; expressions. Most of the samples are borrowed literally from the
96;; XPath specification, while the others are adjusted for our running
97;; example, tree1.
98;;
99;;; Code:
100
101(define-module (sxml xpath)
102  #:use-module (ice-9 pretty-print)
103  #:export (nodeset? node-typeof? node-eq? node-equal? node-pos
104            filter take-until take-after map-union node-reverse
105            node-trace select-kids node-self node-join node-reduce
106            node-or node-closure node-parent
107            sxpath))
108
109;; Upstream version:
110; $Id: SXPath.scm,v 3.5 2001/01/12 23:20:35 oleg Exp oleg $
111
112(define (nodeset? x)
113  (or (and (pair? x) (not (symbol? (car x)))) (null? x)))
114
115;-------------------------
116; Basic converters and applicators
117; A converter is a function
118;	type Converter = Node|Nodeset -> Nodeset
119; A converter can also play a role of a predicate: in that case, if a
120; converter, applied to a node or a nodeset, yields a non-empty
121; nodeset, the converter-predicate is deemed satisfied. Throughout
122; this file a nil nodeset is equivalent to #f in denoting a failure.
123
124; The following function implements a 'Node test' as defined in
125; Sec. 2.3 of XPath document. A node test is one of the components of a
126; location step. It is also a converter-predicate in SXPath.
127;
128; The function node-typeof? takes a type criterion and returns a function,
129; which, when applied to a node, will tell if the node satisfies
130; the test.
131;	node-typeof? :: Crit -> Node -> Boolean
132;
133; The criterion 'crit' is a symbol, one of the following:
134;	id		- tests if the Node has the right name (id)
135;	@		- tests if the Node is an <attributes-coll>
136;	*		- tests if the Node is an <Element>
137;	*text*		- tests if the Node is a text node
138;	*PI*		- tests if the Node is a PI node
139;	*any*		- #t for any type of Node
140
141(define (node-typeof? crit)
142  (lambda (node)
143    (case crit
144      ((*) (and (pair? node) (not (memq (car node) '(@ *PI*)))))
145      ((*any*) #t)
146      ((*text*) (string? node))
147      (else
148       (and (pair? node) (eq? crit (car node))))
149)))
150
151
152; Curried equivalence converter-predicates
153(define (node-eq? other)
154  (lambda (node)
155    (eq? other node)))
156
157(define (node-equal? other)
158  (lambda (node)
159    (equal? other node)))
160
161; node-pos:: N -> Nodeset -> Nodeset, or
162; node-pos:: N -> Converter
163; Select the N'th element of a Nodeset and return as a singular Nodeset;
164; Return an empty nodeset if the Nth element does not exist.
165; ((node-pos 1) Nodeset) selects the node at the head of the Nodeset,
166; if exists; ((node-pos 2) Nodeset) selects the Node after that, if
167; exists.
168; N can also be a negative number: in that case the node is picked from
169; the tail of the list.
170; ((node-pos -1) Nodeset) selects the last node of a non-empty nodeset;
171; ((node-pos -2) Nodeset) selects the last but one node, if exists.
172
173(define (node-pos n)
174  (lambda (nodeset)
175    (cond
176     ((not (nodeset? nodeset)) '())
177     ((null? nodeset) nodeset)
178     ((eqv? n 1) (list (car nodeset)))
179     ((negative? n) ((node-pos (+ n 1 (length nodeset))) nodeset))
180     (else
181      (or (positive? n) (error "yikes!"))
182      ((node-pos (1- n)) (cdr nodeset))))))
183
184; filter:: Converter -> Converter
185; A filter applicator, which introduces a filtering context. The argument
186; converter is considered a predicate, with either #f or nil result meaning
187; failure.
188(define (filter pred?)
189  (lambda (lst)	; a nodeset or a node (will be converted to a singleton nset)
190    (let loop ((lst (if (nodeset? lst) lst (list lst))) (res '()))
191      (if (null? lst)
192	  (reverse res)
193	  (let ((pred-result (pred? (car lst))))
194	    (loop (cdr lst)
195		  (if (and pred-result (not (null? pred-result)))
196		      (cons (car lst) res)
197		      res)))))))
198
199; take-until:: Converter -> Converter, or
200; take-until:: Pred -> Node|Nodeset -> Nodeset
201; Given a converter-predicate and a nodeset, apply the predicate to
202; each element of the nodeset, until the predicate yields anything but #f or
203; nil. Return the elements of the input nodeset that have been processed
204; till that moment (that is, which fail the predicate).
205; take-until is a variation of the filter above: take-until passes
206; elements of an ordered input set till (but not including) the first
207; element that satisfies the predicate.
208; The nodeset returned by ((take-until (not pred)) nset) is a subset --
209; to be more precise, a prefix -- of the nodeset returned by
210; ((filter pred) nset)
211
212(define (take-until pred?)
213  (lambda (lst)	; a nodeset or a node (will be converted to a singleton nset)
214    (let loop ((lst (if (nodeset? lst) lst (list lst))))
215      (if (null? lst) lst
216	  (let ((pred-result (pred? (car lst))))
217	    (if (and pred-result (not (null? pred-result)))
218		'()
219		(cons (car lst) (loop (cdr lst)))))
220	  ))))
221
222
223; take-after:: Converter -> Converter, or
224; take-after:: Pred -> Node|Nodeset -> Nodeset
225; Given a converter-predicate and a nodeset, apply the predicate to
226; each element of the nodeset, until the predicate yields anything but #f or
227; nil. Return the elements of the input nodeset that have not been processed:
228; that is, return the elements of the input nodeset that follow the first
229; element that satisfied the predicate.
230; take-after along with take-until partition an input nodeset into three
231; parts: the first element that satisfies a predicate, all preceding
232; elements and all following elements.
233
234(define (take-after pred?)
235  (lambda (lst)	; a nodeset or a node (will be converted to a singleton nset)
236    (let loop ((lst (if (nodeset? lst) lst (list lst))))
237      (if (null? lst) lst
238	  (let ((pred-result (pred? (car lst))))
239	    (if (and pred-result (not (null? pred-result)))
240		(cdr lst)
241		(loop (cdr lst))))
242	  ))))
243
244; Apply proc to each element of lst and return the list of results.
245; if proc returns a nodeset, splice it into the result
246;
247; From another point of view, map-union is a function Converter->Converter,
248; which places an argument-converter in a joining context.
249
250(define (map-union proc lst)
251  (if (null? lst) lst
252      (let ((proc-res (proc (car lst))))
253	((if (nodeset? proc-res) append cons)
254	 proc-res (map-union proc (cdr lst))))))
255
256; node-reverse :: Converter, or
257; node-reverse:: Node|Nodeset -> Nodeset
258; Reverses the order of nodes in the nodeset
259; This basic converter is needed to implement a reverse document order
260; (see the XPath Recommendation).
261(define node-reverse
262  (lambda (node-or-nodeset)
263    (if (not (nodeset? node-or-nodeset)) (list node-or-nodeset)
264	(reverse node-or-nodeset))))
265
266; node-trace:: String -> Converter
267; (node-trace title) is an identity converter. In addition it prints out
268; a node or nodeset it is applied to, prefixed with the 'title'.
269; This converter is very useful for debugging.
270
271(define (node-trace title)
272  (lambda (node-or-nodeset)
273    (display "\n-->")
274    (display title)
275    (display " :")
276    (pretty-print node-or-nodeset)
277    node-or-nodeset))
278
279
280;-------------------------
281; Converter combinators
282;
283; Combinators are higher-order functions that transmogrify a converter
284; or glue a sequence of converters into a single, non-trivial
285; converter. The goal is to arrive at converters that correspond to
286; XPath location paths.
287;
288; From a different point of view, a combinator is a fixed, named
289; _pattern_ of applying converters. Given below is a complete set of
290; such patterns that together implement XPath location path
291; specification. As it turns out, all these combinators can be built
292; from a small number of basic blocks: regular functional composition,
293; map-union and filter applicators, and the nodeset union.
294
295
296
297; select-kids:: Pred -> Node -> Nodeset
298; Given a Node, return an (ordered) subset its children that satisfy
299; the Pred (a converter, actually)
300; select-kids:: Pred -> Nodeset -> Nodeset
301; The same as above, but select among children of all the nodes in
302; the Nodeset
303;
304; More succinctly, the signature of this function is
305; select-kids:: Converter -> Converter
306
307(define (select-kids test-pred?)
308  (lambda (node)		; node or node-set
309    (cond
310     ((null? node) node)
311     ((not (pair? node)) '())   ; No children
312     ((symbol? (car node))
313      ((filter test-pred?) (cdr node)))	; it's a single node
314     (else (map-union (select-kids test-pred?) node)))))
315
316
317; node-self:: Pred -> Node -> Nodeset, or
318; node-self:: Converter -> Converter
319; Similar to select-kids but apply to the Node itself rather
320; than to its children. The resulting Nodeset will contain either one
321; component, or will be empty (if the Node failed the Pred).
322(define node-self filter)
323
324
325; node-join:: [LocPath] -> Node|Nodeset -> Nodeset, or
326; node-join:: [Converter] -> Converter
327; join the sequence of location steps or paths as described
328; in the title comments above.
329(define (node-join . selectors)
330  (lambda (nodeset)		; Nodeset or node
331    (let loop ((nodeset nodeset) (selectors selectors))
332      (if (null? selectors) nodeset
333	  (loop
334	   (if (nodeset? nodeset)
335	       (map-union (car selectors) nodeset)
336	       ((car selectors) nodeset))
337	   (cdr selectors))))))
338
339
340; node-reduce:: [LocPath] -> Node|Nodeset -> Nodeset, or
341; node-reduce:: [Converter] -> Converter
342; A regular functional composition of converters.
343; From a different point of view,
344;    ((apply node-reduce converters) nodeset)
345; is equivalent to
346;    (foldl apply nodeset converters)
347; i.e., folding, or reducing, a list of converters with the nodeset
348; as a seed.
349(define (node-reduce . converters)
350  (lambda (nodeset)		; Nodeset or node
351    (let loop ((nodeset nodeset) (converters converters))
352      (if (null? converters) nodeset
353	  (loop ((car converters) nodeset) (cdr converters))))))
354
355
356; node-or:: [Converter] -> Converter
357; This combinator applies all converters to a given node and
358; produces the union of their results.
359; This combinator corresponds to a union, '|' operation for XPath
360; location paths.
361; (define (node-or . converters)
362;   (lambda (node-or-nodeset)
363;     (if (null? converters) node-or-nodeset
364; 	(append
365; 	 ((car converters) node-or-nodeset)
366; 	 ((apply node-or (cdr converters)) node-or-nodeset)))))
367; More optimal implementation follows
368(define (node-or . converters)
369  (lambda (node-or-nodeset)
370    (let loop ((result '()) (converters converters))
371      (if (null? converters) result
372	  (loop (append result (or ((car converters) node-or-nodeset) '()))
373		(cdr converters))))))
374
375
376; node-closure:: Converter -> Converter
377; Select all _descendants_ of a node that satisfy a converter-predicate.
378; This combinator is similar to select-kids but applies to
379; grand... children as well.
380; This combinator implements the "descendant::" XPath axis
381; Conceptually, this combinator can be expressed as
382; (define (node-closure f)
383;      (node-or
384;        (select-kids f)
385;	 (node-reduce (select-kids (node-typeof? '*)) (node-closure f))))
386; This definition, as written, looks somewhat like a fixpoint, and it
387; will run forever. It is obvious however that sooner or later
388; (select-kids (node-typeof? '*)) will return an empty nodeset. At
389; this point further iterations will no longer affect the result and
390; can be stopped.
391
392(define (node-closure test-pred?)
393  (lambda (node)		; Nodeset or node
394    (let loop ((parent node) (result '()))
395      (if (null? parent) result
396	  (loop ((select-kids (node-typeof? '*)) parent)
397		(append result
398			((select-kids test-pred?) parent)))
399	  ))))
400
401; node-parent:: RootNode -> Converter
402; (node-parent rootnode) yields a converter that returns a parent of a
403; node it is applied to. If applied to a nodeset, it returns the list
404; of parents of nodes in the nodeset. The rootnode does not have
405; to be the root node of the whole SXML tree -- it may be a root node
406; of a branch of interest.
407; Given the notation of Philip Wadler's paper on semantics of XSLT,
408;  parent(x) = { y | y=subnode*(root), x=subnode(y) }
409; Therefore, node-parent is not the fundamental converter: it can be
410; expressed through the existing ones.  Yet node-parent is a rather
411; convenient converter. It corresponds to a parent:: axis of SXPath.
412; Note that the parent:: axis can be used with an attribute node as well!
413
414(define (node-parent rootnode)
415  (lambda (node)		; Nodeset or node
416    (if (nodeset? node) (map-union (node-parent rootnode) node)
417	(let ((pred
418	       (node-or
419		(node-reduce
420		 (node-self (node-typeof? '*))
421		 (select-kids (node-eq? node)))
422		(node-join
423		 (select-kids (node-typeof? '@))
424		 (select-kids (node-eq? node))))))
425	  ((node-or
426	    (node-self pred)
427	    (node-closure pred))
428	   rootnode)))))
429
430;-------------------------
431; Evaluate an abbreviated SXPath
432;	sxpath:: AbbrPath -> Converter, or
433;	sxpath:: AbbrPath -> Node|Nodeset -> Nodeset
434; AbbrPath is a list. It is translated to the full SXPath according
435; to the following rewriting rules
436; (sxpath '()) -> (node-join)
437; (sxpath '(path-component ...)) ->
438;		(node-join (sxpath1 path-component) (sxpath '(...)))
439; (sxpath1 '//) -> (node-or
440;		     (node-self (node-typeof? '*any*))
441;		      (node-closure (node-typeof? '*any*)))
442; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x))
443; (sxpath1 '(eq? x))    -> (select-kids (node-eq? x))
444; (sxpath1 ?symbol)     -> (select-kids (node-typeof? ?symbol)
445; (sxpath1 procedure)   -> procedure
446; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...))
447; (sxpath1 '(path reducer ...)) ->
448;		(node-reduce (sxpath path) (sxpathr reducer) ...)
449; (sxpathr number)      -> (node-pos number)
450; (sxpathr path-filter) -> (filter (sxpath path-filter))
451
452(define (sxpath path)
453  (lambda (nodeset)
454    (let loop ((nodeset nodeset) (path path))
455    (cond
456     ((null? path) nodeset)
457     ((nodeset? nodeset)
458      (map-union (sxpath path) nodeset))
459     ((procedure? (car path))
460      (loop ((car path) nodeset) (cdr path)))
461     ((eq? '// (car path))
462      (loop
463       ((if (nodeset? nodeset) append cons) nodeset
464	((node-closure (node-typeof? '*any*)) nodeset))
465       (cdr path)))
466     ((symbol? (car path))
467      (loop ((select-kids (node-typeof? (car path))) nodeset)
468	    (cdr path)))
469     ((and (pair? (car path)) (eq? 'equal? (caar path)))
470      (loop ((select-kids (apply node-equal? (cdar path))) nodeset)
471	    (cdr path)))
472     ((and (pair? (car path)) (eq? 'eq? (caar path)))
473      (loop ((select-kids (apply node-eq? (cdar path))) nodeset)
474	    (cdr path)))
475     ((pair? (car path))
476      (let reducer ((nodeset
477		     (if (symbol? (caar path))
478			 ((select-kids (node-typeof? (caar path))) nodeset)
479			 (loop nodeset (caar path))))
480		    (reducing-path (cdar path)))
481	(cond
482	 ((null? reducing-path) (loop nodeset (cdr path)))
483	 ((number? (car reducing-path))
484	  (reducer ((node-pos (car reducing-path)) nodeset)
485		   (cdr reducing-path)))
486	 (else
487	  (reducer ((filter (sxpath (car reducing-path))) nodeset)
488		   (cdr reducing-path))))))
489     (else
490      (error "Invalid path step: " (car path)))))))
491
492;;; arch-tag: c4e57abf-6b61-4612-a6aa-d1536d440774
493;;; xpath.scm ends here
494