1%% mochiweb_html_xpath.erl
2%% @author Pablo Polvorin
3%% created on <2008-04-29>
4%%
5%% XPath interpreter, navigate mochiweb's html structs
6%% Only a subset of xpath is implemented, see what is supported in test.erl
7-module(mochiweb_xpath).
8
9-export([execute/2,execute/3,compile_xpath/1]).
10
11-export_type([xpath_return/0, html_node/0]).
12-export_type([xpath_fun_spec/0, xpath_fun/0, xpath_func_argspec/0,
13              xpath_func_context/0]).
14
15-export_type([indexed_xpath_return/0, indexed_html_node/0]). % internal!!!
16
17%internal data
18-record(ctx, {
19        root :: indexed_html_node(),
20        ctx :: [indexed_html_node()],
21        functions :: [xpath_fun_spec()],
22        position :: integer(),
23        size :: integer()
24    }).
25
26%% HTML tree specs
27-type html_comment() :: {comment, binary()}.
28%% -type html_doctype() :: {doctype, [binary()]}.
29-type html_pi() :: {pi, binary(), binary()} | {pi, binary()}.
30-type html_attr() :: {binary(), binary()}.
31-type html_node() :: {binary(), [html_attr()], [html_node()
32                                                | binary()
33                                                | html_comment()
34                                                | html_pi()]}.
35
36-type indexed_html_node() ::
37        {binary(),
38         [html_attr()],
39         [indexed_html_node()
40          | binary()
41          | html_comment()
42          | html_pi()],
43         [non_neg_integer()]}.
44
45%% XPath return results specs
46-type xpath_return_item() :: boolean() | number() | binary() | html_node().
47-type xpath_return() :: boolean() | number() | [xpath_return_item()].
48
49-type indexed_return_item() :: boolean() | number() | binary() | indexed_html_node().
50-type indexed_xpath_return() :: boolean() | number() | [indexed_return_item()].
51
52-type compiled_xpath() :: tuple().
53
54%% XPath functions specs
55-type xpath_func_context() :: #ctx{}.
56
57-type xpath_type() :: node_set | string | number | boolean.
58-type xpath_func_argspec() :: [xpath_type() | {'*', xpath_type()}].
59-type xpath_fun() ::
60        fun((FuncCtx :: xpath_func_context(),
61             FuncArgs :: indexed_xpath_return()) ->
62                   FuncReturn :: indexed_xpath_return()).
63-type xpath_fun_spec() :: {atom(), xpath_fun(), xpath_func_argspec()}.
64
65
66%%
67%% API
68%%
69
70-spec compile_xpath( string() ) -> compiled_xpath().
71compile_xpath(Expr) ->
72    mochiweb_xpath_parser:compile_xpath(Expr).
73
74%% @doc Execute the given XPath expression against the given document, using
75%% the default set of functions.
76%% @spec execute(XPath,Doc) -> Results
77%% @type XPath =  compiled_xpath() | string()
78%% @type Doc = node()
79%% @type Results = [node()] | binary() | boolean() | number()
80-spec execute(XPath, Doc) -> Results
81                                 when
82      XPath :: compiled_xpath() | string(),
83      Doc :: html_node(),
84      Results :: xpath_return().
85execute(XPath,Root) ->
86    execute(XPath,Root,[]).
87
88%% @doc Execute the given XPath expression against the given document,
89%%      using the default set of functions plus the user-supplied ones.
90%%
91%% @see mochiweb_xpath_functions.erl to see how to write functions
92%%
93%% @spec execute(XPath,Doc,Functions) -> Results
94%% @type XPath =  compiled_xpath() | string()
95%% @type Doc = node()
96%% @type Functions = [FunctionDefinition]
97%% @type FunctionDefinition = {FunName,Fun,Signature}
98%% @type FunName = atom()
99%% @type Fun = fun/2
100%% @type Signature = [ArgType]
101%% @type ArgType = node_set | string | number | boolean
102%% @type Results = [node()] | binary() | boolean() | number()
103%% TODO: should pass the user-defined functions when compiling
104%%       the xpath expression (compile_xpath/1). Then the
105%%       compiled expression would have all its functions
106%%       resolved, and no function lookup would occur when
107%%       the expression is executed
108-spec execute(XPath, Doc, Functions) -> Result
109                                            when
110      XPath :: string() | compiled_xpath(),
111      Doc :: html_node(),
112      Functions :: [xpath_fun_spec()],
113      Result :: xpath_return().
114execute(XPathString,Doc,Functions) when is_list(XPathString) ->
115    XPath = mochiweb_xpath_parser:compile_xpath(XPathString),
116    execute(XPath,Doc,Functions);
117
118execute(XPath,Doc,Functions) ->
119    R0 = {<<0>>,[],[Doc]},
120    %% TODO: set parent instead of positions list, or some lazy-positioning?
121    R1 = add_positions(R0),
122    Result = execute_expr(XPath,#ctx{ctx=[R1],
123                                     root=R1,
124                                     functions=Functions,
125                                     position=0}),
126    remove_positions(Result).
127
128%%
129%% XPath tree traversing, top-level XPath interpreter
130%%
131
132%% xmerl_xpath:match_expr/2
133execute_expr({path, Type, Arg}, S) ->
134    eval_path(Type, Arg, S);
135execute_expr(PrimExpr, S) ->
136    eval_primary_expr(PrimExpr, S).
137
138
139eval_path(union, {PathExpr1, PathExpr2}, C) ->
140    %% in XPath 1.0 union doesn't necessary must return nodes in document
141    %% order (but must in XPath 2.0)
142    S1 = execute_expr(PathExpr1, C),
143    S2 = execute_expr(PathExpr2, C),
144    ordsets:to_list(ordsets:union(ordsets:from_list(S1),
145                                  ordsets:from_list(S2)));
146eval_path(abs, Path ,Ctx = #ctx{root=Root}) ->
147    do_path_expr(Path, Ctx#ctx{ctx=[Root]});
148eval_path(rel, Path, Ctx) ->
149    do_path_expr(Path, Ctx);
150eval_path(filter, {_PathExpr, {pred, _Pred}}, _C) ->
151    throw({not_implemented, "filter"}).      % Who needs them?
152
153
154eval_primary_expr({comp,Comp,A,B},Ctx) ->
155    %% for predicates
156    CompFun = comp_fun(Comp),
157    L = execute_expr(A,Ctx),
158    R = execute_expr(B,Ctx),
159    comp(CompFun,L,R);
160eval_primary_expr({arith, Op, Arg1, Arg2}, Ctx) ->
161    %% for predicates
162    L = execute_expr(Arg1,Ctx),
163    R = execute_expr(Arg2,Ctx),
164    arith(Op, L, R);
165eval_primary_expr({bool,Comp,A,B},Ctx) ->
166    CompFun = bool_fun(Comp),
167    L = execute_expr(A,Ctx),
168    R = execute_expr(B,Ctx),
169    comp(CompFun,L,R);
170eval_primary_expr({literal,L},_Ctx) ->
171    [L];
172eval_primary_expr({number,N},_Ctx) ->
173    [N];
174eval_primary_expr({negative, A}, Ctx) ->
175    R = execute_expr(A, Ctx),
176    [-mochiweb_xpath_utils:number_value(R)];
177eval_primary_expr({function_call, Fun, Args}, Ctx=#ctx{functions=Funs}) ->
178    %% TODO: refactor double-case
179    case mochiweb_xpath_functions:lookup_function(Fun) of
180        {Fun, F, FormalSignature} ->
181            call_xpath_function(F, Args, FormalSignature, Ctx);
182        false ->
183            case lists:keysearch(Fun,1,Funs) of
184                {value, {Fun, F, FormalSignature}} ->
185                    call_xpath_function(F, Args, FormalSignature, Ctx);
186                false ->
187                    throw({efun_not_found, Fun})
188            end
189    end.
190
191
192call_xpath_function(F, Args, FormalSignature, Ctx) ->
193    TypedArgs = prepare_xpath_function_args(Args, FormalSignature, Ctx),
194    F(Ctx, TypedArgs).
195
196%% execute function args expressions and convert them using formal
197%% signatures
198prepare_xpath_function_args(Args, Specs, Ctx) ->
199    RealArgs = [execute_expr(Arg, Ctx) || Arg <- Args],
200    convert_xpath_function_args(RealArgs, Specs, []).
201
202convert_xpath_function_args([], [], Acc) ->
203    lists:reverse(Acc);
204convert_xpath_function_args(Args, [{'*', Spec}], Acc) ->
205    NewArgs = [mochiweb_xpath_utils:convert(Arg,Spec) || Arg <- Args],
206    lists:reverse(Acc) ++ NewArgs;
207convert_xpath_function_args([Arg | Args], [Spec | Specs], Acc) ->
208    NewAcc = [mochiweb_xpath_utils:convert(Arg,Spec) | Acc],
209    convert_xpath_function_args(Args, Specs, NewAcc).
210
211
212
213do_path_expr({step,{Axis,NodeTest,Predicates}}=_S,Ctx=#ctx{}) ->
214    NewNodeList = axis(Axis, NodeTest, Ctx),
215    apply_predicates(Predicates,NewNodeList,Ctx);
216do_path_expr({refine,Step1,Step2},Ctx) ->
217    S1 = do_path_expr(Step1,Ctx),
218    do_path_expr(Step2,Ctx#ctx{ctx=S1}).
219
220
221%%
222%% Axes
223%%
224%% TODO: port all axes to use test_node/3
225
226axis('self', NodeTest, #ctx{ctx=Context}) ->
227    [N || N <- Context, test_node(NodeTest, N, Context)];
228axis('descendant', NodeTest, #ctx{ctx=Context}) ->
229    [N || {_,_,Children,_} <- Context,
230          N <- descendant_or_self(Children, NodeTest, [], Context)];
231axis('descendant_or_self', NodeTest, #ctx{ctx=Context}) ->
232    descendant_or_self(Context, NodeTest, [], Context);
233axis('child', NodeTest, #ctx{ctx=Context}) ->
234    %% Flat list of all child nodes of Context that pass NodeTest
235    [N || {_,_,Children,_} <- Context,
236          N <- Children,
237          test_node(NodeTest, N, Context)];
238axis('parent', NodeTest, #ctx{root=Root, ctx=Context}) ->
239    L = lists:foldl(
240          fun({_,_,_,Position}, Acc) ->
241                  ParentPosition = get_parent_position(Position),
242                  ParentNode = get_node_at(Root, ParentPosition),
243                  maybe_add_node(ParentNode, NodeTest, Acc, Context);
244             (Smth, _Acc) ->
245                  throw({not_implemented, "parent for non-nodes", Smth})
246        end, [], Context),
247    ordsets:to_list(ordsets:from_list(lists:reverse(L)));
248axis('ancestor', _Test, _Ctx) ->
249    throw({not_implemented, "ancestor axis"});
250
251axis('following_sibling', NodeTest, #ctx{root=Root, ctx=Context}) ->
252    %% TODO: alerts for non-elements (like for `text()/parent::`)
253    [N || {_,_,_,Position} <- Context,
254          N <- begin
255                   ParentPosition = get_parent_position(Position),
256                   MyPosition = get_position_in_parent(Position),
257                   {_,_,Children,_} = get_node_at(Root, ParentPosition),
258                   lists:sublist(Children,
259                                 MyPosition + 1,
260                                 length(Children) - MyPosition)
261               end,
262          test_node(NodeTest, N, Context)];
263axis('preceding_sibling', NodeTest, #ctx{root=Root, ctx=Context}) ->
264    %% TODO: alerts for non-elements (like for `text()/parent::`)
265    [N || {_,_,_,Position} <- Context,
266          N <- begin
267                   ParentPosition = get_parent_position(Position),
268                   MyPosition = get_position_in_parent(Position),
269                   {_,_,Children,_} = get_node_at(Root, ParentPosition),
270                   lists:sublist(Children, MyPosition - 1)
271               end,
272          test_node(NodeTest, N, Context)];
273axis('following', _Test, _Ctx) ->
274    throw({not_implemented, "following axis"});
275axis('preceeding', _Test, _Ctx) ->
276    throw({not_implemented, "preceeding axis"});
277
278axis('attribute', NodeTest, #ctx{ctx=Context}) ->
279    %% Flat list of *attribute values* of Context, that pass NodeTest
280    %% TODO: maybe return attribute {Name, Value} will be better then
281    %% value only?
282    [Value || {_,Attributes,_,_} <- Context,
283          {_Name, Value} = A <- Attributes,
284          test_node(NodeTest, A, Context)];
285axis('namespace', _Test, _Ctx) ->
286    throw({not_implemented, "namespace axis"});
287axis('ancestor_or_self', _Test, _Ctx) ->
288    throw({not_implemented, "ancestor-or-self axis"}).
289
290
291descendant_or_self(Nodes, NodeTest, Acc, Ctx) ->
292    lists:reverse(do_descendant_or_self(Nodes, NodeTest, Acc, Ctx)).
293
294do_descendant_or_self([], _, Acc, _) ->
295    Acc;
296do_descendant_or_self([Node = {_, _, Children, _} | Rest], NodeTest, Acc, Ctx) ->
297    %% depth-first (document order)
298    NewAcc1 = maybe_add_node(Node, NodeTest, Acc, Ctx),
299    NewAcc2 = do_descendant_or_self(Children, NodeTest, NewAcc1, Ctx),
300    do_descendant_or_self(Rest, NodeTest, NewAcc2, Ctx);
301do_descendant_or_self([_Smth | Rest], NodeTest, Acc, Ctx) ->
302    %% NewAcc = maybe_add_node(Smth, NodeTest, Acc, Ctx), - no attribs or texts
303    do_descendant_or_self(Rest, NodeTest, Acc, Ctx).
304
305
306%% Except text nodes
307test_node({wildcard, wildcard}, Element, _Ctx) when not is_binary(Element) ->
308    true;
309test_node({prefix_test, Prefix}, {Tag, _, _, _}, _Ctx) ->
310    test_ns_prefix(Tag, Prefix);
311test_node({prefix_test, Prefix}, {AttrName, _}, _Ctx) ->
312    test_ns_prefix(AttrName, Prefix);
313test_node({name, {Tag, _, _}}, {Tag, _, _, _}, _Ctx) ->
314    true;
315test_node({name, {AttrName, _, _}}, {AttrName, _}, _Ctx) ->
316    %% XXX: check this!
317    true;
318test_node({node_type, text}, Text, _Ctx) when is_binary(Text) ->
319    true;
320test_node({node_type, node}, {_, _, _, _}, _Ctx) ->
321    true;
322test_node({node_type, node}, Text, _Ctx) when is_binary(Text) ->
323    true;
324test_node({node_type, node}, {_, _}, _Ctx) ->
325    true;
326%% test_node({node_type, attribute}, {_, _}, _Ctx) ->
327%%     true; [38] - attribute() not exists!
328test_node({node_type, comment}, {comment, _}, _Ctx) ->
329    true;
330test_node({node_type, processing_instruction}, {pi, _}, _Ctx) ->
331    true;
332test_node({processing_instruction, Name}, {pi, Node}, _Ctx) ->
333    NSize = size(Name),
334    case Node of
335        <<Name:NSize/binary, " ", _/binary>> ->
336            true;
337        _ ->
338            false
339    end;
340test_node(_Other, _N, _Ctx) ->
341    false.
342
343test_ns_prefix(Name, Prefix) ->
344    PSize = size(Prefix),
345    case Name of
346        <<Prefix:PSize/binary, ":", _/binary>> ->
347            true;
348        _ ->
349            false
350    end.
351
352%% Append Node to Acc only when NodeTest passed
353maybe_add_node(Node, NodeTest, Acc, Ctx) ->
354    case test_node(NodeTest, Node, Ctx) of
355        true ->
356            [Node | Acc];
357        false ->
358            Acc
359    end.
360
361%% used for predicate indexing
362%% is_reverse_axis(ancestor) ->
363%%     true;
364%% is_reverse_axis(ancestor_or_self) ->
365%%     true;
366%% is_reverse_axis(preceding) ->
367%%     true;
368%% is_reverse_axis(preceding_sibling) ->
369%%     true;
370%% is_reverse_axis(_) ->
371%%     flase.
372
373
374%%
375%% Predicates
376%%
377apply_predicates(Predicates,NodeList,Ctx) ->
378    lists:foldl(fun({pred, Pred} ,Nodes) ->
379                 apply_predicate(Pred,Nodes,Ctx)
380                end, NodeList,Predicates).
381
382% special case: indexing
383apply_predicate({number,N}, NodeList, _Ctx) when length(NodeList) >= N ->
384    [lists:nth(N,NodeList)];
385
386apply_predicate(Pred, NodeList,Ctx) ->
387    Size = length(NodeList),
388    Filter = fun(Node, {AccPosition, AccNodes0}) ->
389            Predicate = mochiweb_xpath_utils:boolean_value(
390                execute_expr(Pred,Ctx#ctx{ctx=[Node], position=AccPosition, size = Size})),
391            AccNodes1 = if Predicate -> [Node|AccNodes0];
392                true -> AccNodes0
393            end,
394            {AccPosition+1, AccNodes1}
395    end,
396    {_, L} = lists:foldl(Filter,{1,[]},NodeList),
397    lists:reverse(L).
398
399
400%%
401%% Compare functions
402%%
403
404%% @see http://www.w3.org/TR/1999/REC-xpath-19991116 , section 3.4
405comp(CompFun,L,R) when is_list(L), is_list(R) ->
406    lists:any(fun(LeftValue) ->
407                     lists:any(fun(RightValue)->
408                                 CompFun(LeftValue,RightValue)
409                               end, R)
410              end, L);
411comp(CompFun,L,R) when is_list(L) ->
412    lists:any(fun(LeftValue) -> CompFun(LeftValue,R) end,L);
413comp(CompFun,L,R) when is_list(R) ->
414    lists:any(fun(RightValue) -> CompFun(L,RightValue) end,R);
415comp(CompFun,L,R) ->
416    CompFun(L,R).
417
418-spec comp_fun(atom()) -> fun((indexed_xpath_return(), indexed_xpath_return()) -> boolean()).
419comp_fun('=') ->
420    fun
421        (A,B) when is_number(A) -> A == mochiweb_xpath_utils:number_value(B);
422        (A,B) when is_number(B) -> mochiweb_xpath_utils:number_value(A) == B;
423        (A,B) when is_boolean(A) -> A == mochiweb_xpath_utils:boolean_value(B);
424        (A,B) when is_boolean(B) -> mochiweb_xpath_utils:boolean_value(A) == B;
425        (A,B) -> mochiweb_xpath_utils:string_value(A) == mochiweb_xpath_utils:string_value(B)
426    end;
427
428comp_fun('!=') ->
429    fun(A,B) -> F = comp_fun('='),
430                not F(A,B)
431    end;
432
433comp_fun('>') ->
434  fun(A,B) ->
435    mochiweb_xpath_utils:number_value(A) > mochiweb_xpath_utils:number_value(B)
436  end;
437comp_fun('<') ->
438  fun(A,B) ->
439    mochiweb_xpath_utils:number_value(A) < mochiweb_xpath_utils:number_value(B)
440   end;
441comp_fun('<=') ->
442  fun(A,B) ->
443    mochiweb_xpath_utils:number_value(A) =< mochiweb_xpath_utils:number_value(B)
444  end;
445comp_fun('>=') ->
446  fun(A,B) ->
447    mochiweb_xpath_utils:number_value(A) >= mochiweb_xpath_utils:number_value(B)
448  end.
449
450%%
451%% Boolean functions
452%%
453
454bool_fun('and') ->
455    fun(A, B) ->
456            mochiweb_xpath_utils:boolean_value(A)
457                andalso mochiweb_xpath_utils:boolean_value(B)
458    end;
459bool_fun('or') ->
460    fun(A, B) ->
461            mochiweb_xpath_utils:boolean_value(A)
462                orelse mochiweb_xpath_utils:boolean_value(B)
463    end.
464%% TODO more boolean operators
465
466%%
467%% Arithmetic functions
468%%
469-spec arith(atom(), indexed_xpath_return(), indexed_xpath_return()) -> number().
470arith('+', Arg1, Arg2) ->
471    mochiweb_xpath_utils:number_value(Arg1)
472		+ mochiweb_xpath_utils:number_value(Arg2);
473arith('-', Arg1, Arg2) ->
474	mochiweb_xpath_utils:number_value(Arg1)
475		- mochiweb_xpath_utils:number_value(Arg2);
476arith('*', Arg1, Arg2) ->
477	mochiweb_xpath_utils:number_value(Arg1)
478		* mochiweb_xpath_utils:number_value(Arg2);
479arith('div', Arg1, Arg2) ->
480	mochiweb_xpath_utils:number_value(Arg1)
481		/ mochiweb_xpath_utils:number_value(Arg2);
482arith('mod', Arg1, Arg2) ->
483	mochiweb_xpath_utils:number_value(Arg1)
484		rem mochiweb_xpath_utils:number_value(Arg2).
485
486%%
487%% Helpers
488%%
489
490%% @doc Add a position to each node
491%% @spec add_positions(Doc) -> ExtendedDoc
492%% @type ExtendedDoc = {atom(), [{binary(), any()}], [extended_node()], [non_neg_integer()]}
493-spec add_positions(html_node()) -> indexed_html_node().
494add_positions(Node) ->
495    R = add_positions_aux(Node, []),
496    R.
497
498add_positions_aux({Tag,Attrs,Children}, Position) ->
499    {_, NewChildren} = lists:foldl(fun(Child, {Count, AccChildren}) ->
500                NewChild = add_positions_aux(Child, [Count | Position]),
501                {Count+1, [NewChild|AccChildren]}
502        end, {1, []}, Children),
503    {Tag, Attrs, lists:reverse(NewChildren), Position};
504add_positions_aux(Data, _) ->
505    Data.
506
507%% @doc Remove position from each node
508%% @spec remove_positions(ExtendedDoc) -> Doc
509%% @type ExtendedDoc = {atom(), [{binary(), any()}], [extended_node()], [non_neg_integer()]}
510-spec remove_positions(indexed_xpath_return()) -> xpath_return().
511remove_positions(Nodes) when is_list(Nodes) ->
512    [ remove_positions(SubNode) || SubNode <- Nodes ];
513remove_positions({Tag, Attrs, Children, _}) ->
514    {Tag, Attrs, remove_positions(Children)};
515remove_positions(Data) ->
516    Data.
517
518%% @doc Get node according to a position relative to root node
519%% @spec get_node_at(ExtendedDoc, Position) -> ExtendedDoc
520%% @type Position = [non_neg_integer()]
521%% @type ExtendedDoc = {atom(), [{binary(), any()}], [extended_node()], [non_neg_integer()]}
522get_node_at(Node, Position) ->
523    get_node_at_aux(Node, lists:reverse(Position)).
524
525get_node_at_aux(Node, []) ->
526    Node;
527get_node_at_aux({_,_,Children,_}, [Pos|Next]) ->
528    get_node_at_aux(lists:nth(Pos, Children), Next).
529
530%% @doc Get parent position
531%% @spec get_parent_position(Position) -> Position
532%% @type Position = [non_neg_integer()]
533get_parent_position([_|ParentPosition]) ->
534    ParentPosition.
535
536%% @doc Get position relative to my parent
537%% @spec get_self_position(Position) -> non_neg_integer()
538%% @type Position = [non_neg_integer()]
539get_position_in_parent([MyPosition|_]) ->
540    MyPosition.
541