1/* This file is part of ClioPatria. 2 3 Author: Jan Wielemaker <J.Wielemaker@cs.vu.nl> 4 HTTP: http://e-culture.multimedian.nl/ 5 Copyright: 2007, E-Culture/MultimediaN 6 7 ClioPatria is free software: you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation, either version 2 of the License, or 10 (at your option) any later version. 11 12 ClioPatria is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with ClioPatria. If not, see <http://www.gnu.org/licenses/>. 19*/ 20 21:- module(xpath, 22 [ xpath/3, % +DOM, +Spec, -Value 23 xpath_chk/3, % +DOM, +Spec, -Value 24 25 op(400, fx, //), 26 op(400, fx, /), 27 op(200, fy, @) 28 ]). 29:- use_module(library(record)). 30:- use_module(library(lists)). 31:- use_module(library(occurs)). 32:- use_module(library(debug)). 33 34/** <module> Select nodes in an XML DOM 35 36The library xpath.pl provides predicates to select nodes from an XML DOM 37tree as produced by library(sgml) based on descriptions inspired by the 38XPATH language. 39 40The predicate xpath/3 selects a sub-structure of the DOM 41non-deterministically based on an xpath-like specification. Not all 42selectors of XPATH are implemented, but the ability to mix xpath/3 calls 43with arbitrary Prolog code provides a powerful tool for extracting 44information from XML parse-trees. 45 46@see http://www.w3.org/TR/xpath 47*/ 48 49:- record 50 element(name, attributes, content). 51 52%% xpath_chk(+DOM, +Spec, ?Content) is semidet. 53% 54% Semi-deterministic version of xpath/3. 55 56xpath_chk(DOM, Spec, Content) :- 57 xpath(DOM, Spec, Content), !. 58 59%% xpath(+DOM, +Spec, ?Content) is nondet. 60% 61% Match an element in a DOM structure. The syntax is inspired by 62% XPath, using () rather than [] to select inside an element. 63% First we can construct paths using / and //: 64% 65% $ =|//|=Term : 66% Select any node in the DOM matching term. 67% $ =|/|=Term : 68% Match the root against Term. 69% $ Term : 70% Select the immediate children of the root matching Term. 71% 72% The Terms above are of type _callable_. The functor specifies 73% the element name. The element name '*' refers to any element. 74% The name =self= refers to the top-element itself and is often 75% used for processing matches of an earlier xpath/3 query. A term 76% NS:Term refers to an XML name in the namespace NS. Optional 77% arguments specify additional constraints and functions. The 78% arguments are processed from left to right. Defined conditional 79% argument values are: 80% 81% $ Integer : 82% The N-th element with the given name 83% $ =last= : 84% The last element with the given name. 85% $ =last= - IntExpr : 86% The IntExpr-th element counting from the last (0-based) 87% 88% Defined function argument values are: 89% 90% $ =self= : 91% Evaluate to the entire element 92% $ =text= : 93% Evaluates to all text from the sub-tree as an atom 94% $ =normalize_space= : 95% As =text=, but uses normalize_space/2 to normalise 96% white-space in the output 97% $ =number= : 98% Extract an integer or float from the value. Ignores 99% leading and trailing white-space 100% $ =|@|=Attribute : 101% Evaluates to the value of the given attribute 102% 103% In addition, the argument-list can be _conditions_: 104% 105% $ Left = Right : 106% Succeeds if the left-hand unifies with the right-hand. 107% E.g. normalize_space = 'euro' 108% $ contains(Haystack, Needle) : 109% Succeeds if Needle is a sub-string of Haystack. 110% 111% Examples: 112% 113% Match each table-row in DOM: 114% 115% == 116% xpath(DOM, //tr, TR) 117% == 118% 119% Match the last cell of each tablerow in DOM. This example 120% illustrates that a result can be the input of subsequent xpath/3 121% queries. Using multiple queries on the intermediate TR term 122% guarantee that all results come from the same table-row: 123% 124% == 125% xpath(DOM, //tr, TR), 126% xpath(TR, /td(last), TD) 127% == 128% 129% Match each =href= attribute in an <a> element 130% 131% == 132% xpath(DOM, //a(@href), HREF) 133% == 134% 135% Suppose we have a table containing rows where each first column 136% is the name of a product with a link to details and the second 137% is the price (a number). The following predicate matches the 138% name, URL and price: 139% 140% == 141% product(DOM, Name, URL, Price) :- 142% xpath(DOM, //tr, TR), 143% xpath(TR, td(1), C1), 144% xpath(C1, /self(normalize_space), Name), 145% xpath(C1, a(@href), URL), 146% xpath(TR, td(2, number), Price). 147% == 148 149xpath(DOM, Spec, Content) :- 150 in_dom(Spec, DOM, Content). 151 152in_dom(//Spec, DOM, Value) :- !, 153 element_spec(Spec, Name, Modifiers), 154 sub_dom(I, Len, Name, E, DOM), 155 modifiers(Modifiers, I, Len, E, Value). 156in_dom(/Spec, E, Value) :- !, 157 element_spec(Spec, Name, Modifiers), 158 ( Name == self 159 -> true 160 ; element_name(E, Name) 161 ), 162 modifiers(Modifiers, 1, 1, E, Value). 163in_dom(A/B, DOM, Value) :- !, 164 in_dom(A, DOM, Value0), 165 in_dom(B, Value0, Value). 166in_dom(A//B, DOM, Value) :- !, 167 in_dom(A, DOM, Value0), 168 in_dom(//B, Value0, Value). 169in_dom(Spec, element(_, _, Content), Value) :- 170 element_spec(Spec, Name, Modifiers), 171 count_named_elements(Content, Name, CLen), 172 CLen > 0, 173 nth_element(N, Name, E, Content), 174 modifiers(Modifiers, N, CLen, E, Value). 175 176element_spec(Var, _, _) :- 177 var(Var), !, 178 instantiation_error(Var). 179element_spec(NS:Term, NS:Name, Modifiers) :- !, 180 Term =.. [Name0|Modifiers], 181 star(Name0, Name). 182element_spec(Term, Name, Modifiers) :- !, 183 Term =.. [Name0|Modifiers], 184 star(Name0, Name). 185 186star(*, _) :- !. 187star(Name, Name). 188 189 190%% sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet. 191% 192% Sub is a node in DOM with Name. 193% 194% @param Count is the total number of nodes in the content 195% list Sub appears that have the same name. 196% @param Index is the 1-based index of Sub of nodes with 197% Name. 198 199sub_dom(1, 1, Name, DOM, DOM) :- 200 element_name(DOM, Name). 201sub_dom(N, Len, Name, E, element(_,_,Content)) :- !, 202 sub_dom_2(N, Len, Name, E, Content). 203sub_dom(N, Len, Name, E, Content) :- 204 is_list(Content), 205 sub_dom_2(N, Len, Name, E, Content). 206 207sub_dom_2(N, Len, Name, Element, Content) :- 208 ( count_named_elements(Content, Name, Len), 209 nth_element(N, Name, Element, Content) 210 ; member(element(_,_,C2), Content), 211 sub_dom_2(N, Len, Name, Element, C2) 212 ). 213 214 215%% count_named_elements(+Content, +Name, -Count) is det. 216% 217% Count is the number of nodes with Name in Content. 218 219count_named_elements(Content, Name, Count) :- 220 count_named_elements(Content, Name, 0, Count). 221 222count_named_elements([], _, Count, Count). 223count_named_elements([element(Name,_,_)|T], Name, C0, C) :- !, 224 C1 is C0+1, 225 count_named_elements(T, Name, C1, C). 226count_named_elements([_|T], Name, C0, C) :- 227 count_named_elements(T, Name, C0, C). 228 229 230%% nth_element(?N, +Name, -Element, +Content:list) is nondet. 231% 232% True if Element is the N-th element with name in Content. 233 234nth_element(N, Name, Element, Content) :- 235 nth_element_(1, N, Name, Element, Content). 236 237nth_element_(I, N, Name, E, [H|T]) :- 238 element_name(H, Name), !, 239 ( N = I, 240 E = H 241 ; I2 is I + 1, 242 ( nonvar(N), I2 > N 243 -> !, fail 244 ; true 245 ), 246 nth_element_(I2, N, Name, E, T) 247 ). 248nth_element_(I, N, Name, E, [_|T]) :- 249 nth_element_(I, N, Name, E, T). 250 251 252%% modifiers(+Modifiers, +I, +Clen, +DOM, -Value) 253% 254% 255 256modifiers([], _, _, Value, Value). 257modifiers([H|T], I, L, Value0, Value) :- 258 modifier(H, I, L, Value0, Value1), 259 modifiers(T, I, L, Value1, Value). 260 261modifier(N, I, _, Value, Value) :- % Integer 262 integer(N), !, 263 N =:= I. 264modifier(last, I, L, Value, Value) :- !, % last 265 I =:= L. 266modifier(last-Expr, I, L, Value, Value) :- !, % last-Expr 267 I =:= L-Expr. 268modifier(Function, _, _, In, Out) :- 269 xpath_function(Function, In, Out). 270 271xpath_function(self, DOM, Value) :- !, % self 272 Value = DOM. 273xpath_function(text, DOM, Text) :- !, % text 274 text_of_dom(DOM, Text). 275xpath_function(normalize_space, DOM, Text) :- !, % normalize_space 276 text_of_dom(DOM, Text0), 277 normalize_space(atom(Text), Text0). 278xpath_function(number, DOM, Number) :- !, % number 279 text_of_dom(DOM, Text0), 280 normalize_space(string(Text), Text0), 281 catch(atom_number(Text, Number), _, fail). 282xpath_function(@Name, element(_, Attrs, _), Value) :- !, % @Name 283 memberchk(Name=Value, Attrs). 284xpath_function(Left = Right, Value, Value) :- !, % = 285 var_or_function(Left, Value, LeftValue), 286 var_or_function(Right, Value, RightValue), 287 LeftValue = RightValue. 288xpath_function(contains(Haystack, Needle), Value, Value) :- !, % contains(Haystack, Needle) 289 val_or_function(Haystack, Value, HaystackValue), 290 val_or_function(Needle, Value, NeedleValue), 291 atom(HaystackValue), atom(NeedleValue), 292 ( sub_atom(HaystackValue, _, _, _, NeedleValue) 293 -> true 294 ). 295 296var_or_function(Arg, _, Arg) :- 297 var(Arg), !. 298var_or_function(Func, Value0, Value) :- 299 xpath_function(Func, Value0, Value). 300 301val_or_function(Arg, _, Arg) :- 302 var(Arg), !, 303 instantiation_error(Arg). 304val_or_function(Func, Value0, Value) :- % TBD 305 xpath_function(Func, Value0, Value). 306 307 308%% text_of_dom(+DOM, -Text:atom) is det. 309% 310% Text is the joined textual content of DOM. 311 312text_of_dom(DOM, Text) :- 313 phrase(text_of(DOM), Tokens), 314 concat_atom(Tokens, Text). 315 316text_of(element(_,_,Content)) --> 317 text_of_list(Content). 318text_of([]) --> 319 []. 320text_of([H|T]) --> 321 text_of(H), 322 text_of(T). 323 324 325text_of_list([]) --> 326 []. 327text_of_list([H|T]) --> 328 text_of_1(H), 329 text_of_list(T). 330 331 332text_of_1(element(_,_,Content)) --> !, 333 text_of_list(Content). 334text_of_1(Data) --> 335 { assertion(atom(Data)) }, 336 [Data]. 337