1/*  This file is part of ClioPatria.
2
3    Author:	Jan Wielemaker <J.Wielemaker@cs.vu.nl>
4    HTTP:	http://e-culture.multimedian.nl/
5    Copyright:  2007, E-Culture/MultimediaN
6
7    ClioPatria is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation, either version 2 of the License, or
10    (at your option) any later version.
11
12    ClioPatria is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with ClioPatria.  If not, see <http://www.gnu.org/licenses/>.
19*/
20
21:- module(xpath,
22	  [ xpath/3,			% +DOM, +Spec, -Value
23	    xpath_chk/3,		% +DOM, +Spec, -Value
24
25	    op(400, fx, //),
26	    op(400, fx, /),
27	    op(200, fy, @)
28	  ]).
29:- use_module(library(record)).
30:- use_module(library(lists)).
31:- use_module(library(occurs)).
32:- use_module(library(debug)).
33
34/** <module> Select nodes in an XML DOM
35
36The library xpath.pl provides predicates to select nodes from an XML DOM
37tree as produced by library(sgml) based  on descriptions inspired by the
38XPATH language.
39
40The   predicate   xpath/3   selects   a   sub-structure   of   the   DOM
41non-deterministically based on an  xpath-like   specification.  Not  all
42selectors of XPATH are implemented, but the ability to mix xpath/3 calls
43with arbitrary Prolog code  provides  a   powerful  tool  for extracting
44information from XML parse-trees.
45
46@see http://www.w3.org/TR/xpath
47*/
48
49:- record
50	element(name, attributes, content).
51
52%%	xpath_chk(+DOM, +Spec, ?Content) is semidet.
53%
54%	Semi-deterministic version of xpath/3.
55
56xpath_chk(DOM, Spec, Content) :-
57	xpath(DOM, Spec, Content), !.
58
59%%	xpath(+DOM, +Spec, ?Content) is nondet.
60%
61%	Match an element in a DOM structure.   The syntax is inspired by
62%	XPath, using () rather than  []   to  select  inside an element.
63%	First we can construct paths using / and //:
64%
65%	    $ =|//|=Term :
66%	    Select any node in the DOM matching term.
67%	    $ =|/|=Term :
68%	    Match the root against Term.
69%	    $ Term :
70%	    Select the immediate children of the root matching Term.
71%
72%	The Terms above are of type   _callable_.  The functor specifies
73%	the element name. The element name   '*'  refers to any element.
74%	The name =self= refers to the   top-element  itself and is often
75%	used for processing matches of an  earlier xpath/3 query. A term
76%	NS:Term refers to an XML  name   in  the  namespace NS. Optional
77%	arguments specify additional  constraints   and  functions.  The
78%	arguments are processed from left  to right. Defined conditional
79%	argument values are:
80%
81%	    $ Integer :
82%	    The N-th element with the given name
83%	    $ =last= :
84%	    The last element with the given name.
85%	    $ =last= - IntExpr :
86%	    The IntExpr-th element counting from the last (0-based)
87%
88%	Defined function argument values are:
89%
90%	    $ =self= :
91%	    Evaluate to the entire element
92%	    $ =text= :
93%	    Evaluates to all text from the sub-tree as an atom
94%	    $ =normalize_space= :
95%	    As =text=, but uses normalize_space/2 to normalise
96%	    white-space in the output
97%	    $ =number= :
98%	    Extract an integer or float from the value.  Ignores
99%	    leading and trailing white-space
100%	    $ =|@|=Attribute :
101%	    Evaluates to the value of the given attribute
102%
103%	In addition, the argument-list can be _conditions_:
104%
105%	    $ Left = Right :
106%	    Succeeds if the left-hand unifies with the right-hand.
107%	    E.g. normalize_space = 'euro'
108%	    $ contains(Haystack, Needle) :
109%	    Succeeds if Needle is a sub-string of Haystack.
110%
111%	Examples:
112%
113%	Match each table-row in DOM:
114%
115%	    ==
116%	    xpath(DOM, //tr, TR)
117%	    ==
118%
119%	Match the last cell  of  each   tablerow  in  DOM.  This example
120%	illustrates that a result can be the input of subsequent xpath/3
121%	queries. Using multiple queries  on   the  intermediate  TR term
122%	guarantee that all results come from the same table-row:
123%
124%	    ==
125%	    xpath(DOM, //tr, TR),
126%	    xpath(TR,  /td(last), TD)
127%	    ==
128%
129%	Match each =href= attribute in an <a> element
130%
131%	    ==
132%	    xpath(DOM, //a(@href), HREF)
133%	    ==
134%
135%	Suppose we have a table containing  rows where each first column
136%	is the name of a product with a   link to details and the second
137%	is the price (a number).  The   following  predicate matches the
138%	name, URL and price:
139%
140%	    ==
141%	    product(DOM, Name, URL, Price) :-
142%	    	xpath(DOM, //tr, TR),
143%	    	xpath(TR, td(1), C1),
144%	    	xpath(C1, /self(normalize_space), Name),
145%	    	xpath(C1, a(@href), URL),
146%	    	xpath(TR, td(2, number), Price).
147%	    ==
148
149xpath(DOM, Spec, Content) :-
150	in_dom(Spec, DOM, Content).
151
152in_dom(//Spec, DOM, Value) :- !,
153	element_spec(Spec, Name, Modifiers),
154	sub_dom(I, Len, Name, E, DOM),
155	modifiers(Modifiers, I, Len, E, Value).
156in_dom(/Spec, E, Value) :- !,
157	element_spec(Spec, Name, Modifiers),
158	(   Name == self
159	->  true
160	;   element_name(E, Name)
161	),
162	modifiers(Modifiers, 1, 1, E, Value).
163in_dom(A/B, DOM, Value) :- !,
164	in_dom(A, DOM, Value0),
165	in_dom(B, Value0, Value).
166in_dom(A//B, DOM, Value) :- !,
167	in_dom(A, DOM, Value0),
168	in_dom(//B, Value0, Value).
169in_dom(Spec, element(_, _, Content), Value) :-
170	element_spec(Spec, Name, Modifiers),
171	count_named_elements(Content, Name, CLen),
172	CLen > 0,
173	nth_element(N, Name, E, Content),
174	modifiers(Modifiers, N, CLen, E, Value).
175
176element_spec(Var, _, _) :-
177	var(Var), !,
178	instantiation_error(Var).
179element_spec(NS:Term, NS:Name, Modifiers) :- !,
180	Term =.. [Name0|Modifiers],
181	star(Name0, Name).
182element_spec(Term, Name, Modifiers) :- !,
183	Term =.. [Name0|Modifiers],
184	star(Name0, Name).
185
186star(*, _) :- !.
187star(Name, Name).
188
189
190%%	sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet.
191%
192%	Sub is a node in DOM with Name.
193%
194%	@param Count	is the total number of nodes in the content
195%			list Sub appears that have the same name.
196%	@param Index	is the 1-based index of Sub of nodes with
197%			Name.
198
199sub_dom(1, 1, Name, DOM, DOM) :-
200	element_name(DOM, Name).
201sub_dom(N, Len, Name, E, element(_,_,Content)) :- !,
202	sub_dom_2(N, Len, Name, E, Content).
203sub_dom(N, Len, Name, E, Content) :-
204	is_list(Content),
205	sub_dom_2(N, Len, Name, E, Content).
206
207sub_dom_2(N, Len, Name, Element, Content) :-
208	(   count_named_elements(Content, Name, Len),
209	    nth_element(N, Name, Element, Content)
210	;   member(element(_,_,C2), Content),
211	    sub_dom_2(N, Len, Name, Element, C2)
212	).
213
214
215%%	count_named_elements(+Content, +Name, -Count) is det.
216%
217%	Count is the number of nodes with Name in Content.
218
219count_named_elements(Content, Name, Count) :-
220	count_named_elements(Content, Name, 0, Count).
221
222count_named_elements([], _, Count, Count).
223count_named_elements([element(Name,_,_)|T], Name, C0, C) :- !,
224	C1 is C0+1,
225	count_named_elements(T, Name, C1, C).
226count_named_elements([_|T], Name, C0, C) :-
227	count_named_elements(T, Name, C0, C).
228
229
230%%	nth_element(?N, +Name, -Element, +Content:list) is nondet.
231%
232%	True if Element is the N-th element with name in Content.
233
234nth_element(N, Name, Element, Content) :-
235	nth_element_(1, N, Name, Element, Content).
236
237nth_element_(I, N, Name, E, [H|T]) :-
238	element_name(H, Name), !,
239	(   N = I,
240	    E = H
241	;   I2 is I + 1,
242	    (	nonvar(N), I2 > N
243	    ->	!, fail
244	    ;	true
245	    ),
246	    nth_element_(I2, N, Name, E, T)
247	).
248nth_element_(I, N, Name, E, [_|T]) :-
249	nth_element_(I, N, Name, E, T).
250
251
252%%	modifiers(+Modifiers, +I, +Clen, +DOM, -Value)
253%
254%
255
256modifiers([], _, _, Value, Value).
257modifiers([H|T], I, L, Value0, Value) :-
258	modifier(H, I, L, Value0, Value1),
259	modifiers(T, I, L, Value1, Value).
260
261modifier(N, I, _, Value, Value) :-				% Integer
262	integer(N), !,
263	N =:= I.
264modifier(last, I, L, Value, Value) :- !,			% last
265	I =:= L.
266modifier(last-Expr, I, L, Value, Value) :- !,			% last-Expr
267	I =:= L-Expr.
268modifier(Function, _, _, In, Out) :-
269	xpath_function(Function, In, Out).
270
271xpath_function(self, DOM, Value) :- !,				% self
272	Value = DOM.
273xpath_function(text, DOM, Text) :- !,				% text
274	text_of_dom(DOM, Text).
275xpath_function(normalize_space, DOM, Text) :- !,		% normalize_space
276	text_of_dom(DOM, Text0),
277	normalize_space(atom(Text), Text0).
278xpath_function(number, DOM, Number) :- !,			% number
279	text_of_dom(DOM, Text0),
280	normalize_space(string(Text), Text0),
281	catch(atom_number(Text, Number), _, fail).
282xpath_function(@Name, element(_, Attrs, _), Value) :- !,	% @Name
283	memberchk(Name=Value, Attrs).
284xpath_function(Left = Right, Value, Value) :- !,		% =
285	var_or_function(Left, Value, LeftValue),
286	var_or_function(Right, Value, RightValue),
287	LeftValue = RightValue.
288xpath_function(contains(Haystack, Needle), Value, Value) :- !,	% contains(Haystack, Needle)
289	val_or_function(Haystack, Value, HaystackValue),
290	val_or_function(Needle, Value, NeedleValue),
291	atom(HaystackValue), atom(NeedleValue),
292	(   sub_atom(HaystackValue, _, _, _, NeedleValue)
293	->  true
294	).
295
296var_or_function(Arg, _, Arg) :-
297	var(Arg), !.
298var_or_function(Func, Value0, Value) :-
299	xpath_function(Func, Value0, Value).
300
301val_or_function(Arg, _, Arg) :-
302	var(Arg), !,
303	instantiation_error(Arg).
304val_or_function(Func, Value0, Value) :-				% TBD
305	xpath_function(Func, Value0, Value).
306
307
308%%	text_of_dom(+DOM, -Text:atom) is det.
309%
310%	Text is the joined textual content of DOM.
311
312text_of_dom(DOM, Text) :-
313	phrase(text_of(DOM), Tokens),
314	concat_atom(Tokens, Text).
315
316text_of(element(_,_,Content)) -->
317	text_of_list(Content).
318text_of([]) -->
319	[].
320text_of([H|T]) -->
321	text_of(H),
322	text_of(T).
323
324
325text_of_list([]) -->
326	[].
327text_of_list([H|T]) -->
328	text_of_1(H),
329	text_of_list(T).
330
331
332text_of_1(element(_,_,Content)) --> !,
333	text_of_list(Content).
334text_of_1(Data) -->
335	{ assertion(atom(Data)) },
336	[Data].
337