1%% mochiweb_html_xpath.erl 2%% @author Pablo Polvorin 3%% created on <2008-04-29> 4%% 5%% XPath interpreter, navigate mochiweb's html structs 6%% Only a subset of xpath is implemented, see what is supported in test.erl 7-module(mochiweb_xpath). 8 9-export([execute/2,execute/3,compile_xpath/1]). 10 11-export_type([xpath_return/0, html_node/0]). 12-export_type([xpath_fun_spec/0, xpath_fun/0, xpath_func_argspec/0, 13 xpath_func_context/0]). 14 15-export_type([indexed_xpath_return/0, indexed_html_node/0]). % internal!!! 16 17%internal data 18-record(ctx, { 19 root :: indexed_html_node(), 20 ctx :: [indexed_html_node()], 21 functions :: [xpath_fun_spec()], 22 position :: integer(), 23 size :: integer() 24 }). 25 26%% HTML tree specs 27-type html_comment() :: {comment, binary()}. 28%% -type html_doctype() :: {doctype, [binary()]}. 29-type html_pi() :: {pi, binary(), binary()} | {pi, binary()}. 30-type html_attr() :: {binary(), binary()}. 31-type html_node() :: {binary(), [html_attr()], [html_node() 32 | binary() 33 | html_comment() 34 | html_pi()]}. 35 36-type indexed_html_node() :: 37 {binary(), 38 [html_attr()], 39 [indexed_html_node() 40 | binary() 41 | html_comment() 42 | html_pi()], 43 [non_neg_integer()]}. 44 45%% XPath return results specs 46-type xpath_return_item() :: boolean() | number() | binary() | html_node(). 47-type xpath_return() :: boolean() | number() | [xpath_return_item()]. 48 49-type indexed_return_item() :: boolean() | number() | binary() | indexed_html_node(). 50-type indexed_xpath_return() :: boolean() | number() | [indexed_return_item()]. 51 52-type compiled_xpath() :: tuple(). 53 54%% XPath functions specs 55-type xpath_func_context() :: #ctx{}. 56 57-type xpath_type() :: node_set | string | number | boolean. 58-type xpath_func_argspec() :: [xpath_type() | {'*', xpath_type()}]. 59-type xpath_fun() :: 60 fun((FuncCtx :: xpath_func_context(), 61 FuncArgs :: indexed_xpath_return()) -> 62 FuncReturn :: indexed_xpath_return()). 63-type xpath_fun_spec() :: {atom(), xpath_fun(), xpath_func_argspec()}. 64 65 66%% 67%% API 68%% 69 70-spec compile_xpath( string() ) -> compiled_xpath(). 71compile_xpath(Expr) -> 72 mochiweb_xpath_parser:compile_xpath(Expr). 73 74%% @doc Execute the given XPath expression against the given document, using 75%% the default set of functions. 76%% @spec execute(XPath,Doc) -> Results 77%% @type XPath = compiled_xpath() | string() 78%% @type Doc = node() 79%% @type Results = [node()] | binary() | boolean() | number() 80-spec execute(XPath, Doc) -> Results 81 when 82 XPath :: compiled_xpath() | string(), 83 Doc :: html_node(), 84 Results :: xpath_return(). 85execute(XPath,Root) -> 86 execute(XPath,Root,[]). 87 88%% @doc Execute the given XPath expression against the given document, 89%% using the default set of functions plus the user-supplied ones. 90%% 91%% @see mochiweb_xpath_functions.erl to see how to write functions 92%% 93%% @spec execute(XPath,Doc,Functions) -> Results 94%% @type XPath = compiled_xpath() | string() 95%% @type Doc = node() 96%% @type Functions = [FunctionDefinition] 97%% @type FunctionDefinition = {FunName,Fun,Signature} 98%% @type FunName = atom() 99%% @type Fun = fun/2 100%% @type Signature = [ArgType] 101%% @type ArgType = node_set | string | number | boolean 102%% @type Results = [node()] | binary() | boolean() | number() 103%% TODO: should pass the user-defined functions when compiling 104%% the xpath expression (compile_xpath/1). Then the 105%% compiled expression would have all its functions 106%% resolved, and no function lookup would occur when 107%% the expression is executed 108-spec execute(XPath, Doc, Functions) -> Result 109 when 110 XPath :: string() | compiled_xpath(), 111 Doc :: html_node(), 112 Functions :: [xpath_fun_spec()], 113 Result :: xpath_return(). 114execute(XPathString,Doc,Functions) when is_list(XPathString) -> 115 XPath = mochiweb_xpath_parser:compile_xpath(XPathString), 116 execute(XPath,Doc,Functions); 117 118execute(XPath,Doc,Functions) -> 119 R0 = {<<0>>,[],[Doc]}, 120 %% TODO: set parent instead of positions list, or some lazy-positioning? 121 R1 = add_positions(R0), 122 Result = execute_expr(XPath,#ctx{ctx=[R1], 123 root=R1, 124 functions=Functions, 125 position=0}), 126 remove_positions(Result). 127 128%% 129%% XPath tree traversing, top-level XPath interpreter 130%% 131 132%% xmerl_xpath:match_expr/2 133execute_expr({path, Type, Arg}, S) -> 134 eval_path(Type, Arg, S); 135execute_expr(PrimExpr, S) -> 136 eval_primary_expr(PrimExpr, S). 137 138 139eval_path(union, {PathExpr1, PathExpr2}, C) -> 140 %% in XPath 1.0 union doesn't necessary must return nodes in document 141 %% order (but must in XPath 2.0) 142 S1 = execute_expr(PathExpr1, C), 143 S2 = execute_expr(PathExpr2, C), 144 ordsets:to_list(ordsets:union(ordsets:from_list(S1), 145 ordsets:from_list(S2))); 146eval_path(abs, Path ,Ctx = #ctx{root=Root}) -> 147 do_path_expr(Path, Ctx#ctx{ctx=[Root]}); 148eval_path(rel, Path, Ctx) -> 149 do_path_expr(Path, Ctx); 150eval_path(filter, {_PathExpr, {pred, _Pred}}, _C) -> 151 throw({not_implemented, "filter"}). % Who needs them? 152 153 154eval_primary_expr({comp,Comp,A,B},Ctx) -> 155 %% for predicates 156 CompFun = comp_fun(Comp), 157 L = execute_expr(A,Ctx), 158 R = execute_expr(B,Ctx), 159 comp(CompFun,L,R); 160eval_primary_expr({arith, Op, Arg1, Arg2}, Ctx) -> 161 %% for predicates 162 L = execute_expr(Arg1,Ctx), 163 R = execute_expr(Arg2,Ctx), 164 arith(Op, L, R); 165eval_primary_expr({bool,Comp,A,B},Ctx) -> 166 CompFun = bool_fun(Comp), 167 L = execute_expr(A,Ctx), 168 R = execute_expr(B,Ctx), 169 comp(CompFun,L,R); 170eval_primary_expr({literal,L},_Ctx) -> 171 [L]; 172eval_primary_expr({number,N},_Ctx) -> 173 [N]; 174eval_primary_expr({negative, A}, Ctx) -> 175 R = execute_expr(A, Ctx), 176 [-mochiweb_xpath_utils:number_value(R)]; 177eval_primary_expr({function_call, Fun, Args}, Ctx=#ctx{functions=Funs}) -> 178 %% TODO: refactor double-case 179 case mochiweb_xpath_functions:lookup_function(Fun) of 180 {Fun, F, FormalSignature} -> 181 call_xpath_function(F, Args, FormalSignature, Ctx); 182 false -> 183 case lists:keysearch(Fun,1,Funs) of 184 {value, {Fun, F, FormalSignature}} -> 185 call_xpath_function(F, Args, FormalSignature, Ctx); 186 false -> 187 throw({efun_not_found, Fun}) 188 end 189 end. 190 191 192call_xpath_function(F, Args, FormalSignature, Ctx) -> 193 TypedArgs = prepare_xpath_function_args(Args, FormalSignature, Ctx), 194 F(Ctx, TypedArgs). 195 196%% execute function args expressions and convert them using formal 197%% signatures 198prepare_xpath_function_args(Args, Specs, Ctx) -> 199 RealArgs = [execute_expr(Arg, Ctx) || Arg <- Args], 200 convert_xpath_function_args(RealArgs, Specs, []). 201 202convert_xpath_function_args([], [], Acc) -> 203 lists:reverse(Acc); 204convert_xpath_function_args(Args, [{'*', Spec}], Acc) -> 205 NewArgs = [mochiweb_xpath_utils:convert(Arg,Spec) || Arg <- Args], 206 lists:reverse(Acc) ++ NewArgs; 207convert_xpath_function_args([Arg | Args], [Spec | Specs], Acc) -> 208 NewAcc = [mochiweb_xpath_utils:convert(Arg,Spec) | Acc], 209 convert_xpath_function_args(Args, Specs, NewAcc). 210 211 212 213do_path_expr({step,{Axis,NodeTest,Predicates}}=_S,Ctx=#ctx{}) -> 214 NewNodeList = axis(Axis, NodeTest, Ctx), 215 apply_predicates(Predicates,NewNodeList,Ctx); 216do_path_expr({refine,Step1,Step2},Ctx) -> 217 S1 = do_path_expr(Step1,Ctx), 218 do_path_expr(Step2,Ctx#ctx{ctx=S1}). 219 220 221%% 222%% Axes 223%% 224%% TODO: port all axes to use test_node/3 225 226axis('self', NodeTest, #ctx{ctx=Context}) -> 227 [N || N <- Context, test_node(NodeTest, N, Context)]; 228axis('descendant', NodeTest, #ctx{ctx=Context}) -> 229 [N || {_,_,Children,_} <- Context, 230 N <- descendant_or_self(Children, NodeTest, [], Context)]; 231axis('descendant_or_self', NodeTest, #ctx{ctx=Context}) -> 232 descendant_or_self(Context, NodeTest, [], Context); 233axis('child', NodeTest, #ctx{ctx=Context}) -> 234 %% Flat list of all child nodes of Context that pass NodeTest 235 [N || {_,_,Children,_} <- Context, 236 N <- Children, 237 test_node(NodeTest, N, Context)]; 238axis('parent', NodeTest, #ctx{root=Root, ctx=Context}) -> 239 L = lists:foldl( 240 fun({_,_,_,Position}, Acc) -> 241 ParentPosition = get_parent_position(Position), 242 ParentNode = get_node_at(Root, ParentPosition), 243 maybe_add_node(ParentNode, NodeTest, Acc, Context); 244 (Smth, _Acc) -> 245 throw({not_implemented, "parent for non-nodes", Smth}) 246 end, [], Context), 247 ordsets:to_list(ordsets:from_list(lists:reverse(L))); 248axis('ancestor', _Test, _Ctx) -> 249 throw({not_implemented, "ancestor axis"}); 250 251axis('following_sibling', NodeTest, #ctx{root=Root, ctx=Context}) -> 252 %% TODO: alerts for non-elements (like for `text()/parent::`) 253 [N || {_,_,_,Position} <- Context, 254 N <- begin 255 ParentPosition = get_parent_position(Position), 256 MyPosition = get_position_in_parent(Position), 257 {_,_,Children,_} = get_node_at(Root, ParentPosition), 258 lists:sublist(Children, 259 MyPosition + 1, 260 length(Children) - MyPosition) 261 end, 262 test_node(NodeTest, N, Context)]; 263axis('preceding_sibling', NodeTest, #ctx{root=Root, ctx=Context}) -> 264 %% TODO: alerts for non-elements (like for `text()/parent::`) 265 [N || {_,_,_,Position} <- Context, 266 N <- begin 267 ParentPosition = get_parent_position(Position), 268 MyPosition = get_position_in_parent(Position), 269 {_,_,Children,_} = get_node_at(Root, ParentPosition), 270 lists:sublist(Children, MyPosition - 1) 271 end, 272 test_node(NodeTest, N, Context)]; 273axis('following', _Test, _Ctx) -> 274 throw({not_implemented, "following axis"}); 275axis('preceeding', _Test, _Ctx) -> 276 throw({not_implemented, "preceeding axis"}); 277 278axis('attribute', NodeTest, #ctx{ctx=Context}) -> 279 %% Flat list of *attribute values* of Context, that pass NodeTest 280 %% TODO: maybe return attribute {Name, Value} will be better then 281 %% value only? 282 [Value || {_,Attributes,_,_} <- Context, 283 {_Name, Value} = A <- Attributes, 284 test_node(NodeTest, A, Context)]; 285axis('namespace', _Test, _Ctx) -> 286 throw({not_implemented, "namespace axis"}); 287axis('ancestor_or_self', _Test, _Ctx) -> 288 throw({not_implemented, "ancestor-or-self axis"}). 289 290 291descendant_or_self(Nodes, NodeTest, Acc, Ctx) -> 292 lists:reverse(do_descendant_or_self(Nodes, NodeTest, Acc, Ctx)). 293 294do_descendant_or_self([], _, Acc, _) -> 295 Acc; 296do_descendant_or_self([Node = {_, _, Children, _} | Rest], NodeTest, Acc, Ctx) -> 297 %% depth-first (document order) 298 NewAcc1 = maybe_add_node(Node, NodeTest, Acc, Ctx), 299 NewAcc2 = do_descendant_or_self(Children, NodeTest, NewAcc1, Ctx), 300 do_descendant_or_self(Rest, NodeTest, NewAcc2, Ctx); 301do_descendant_or_self([_Smth | Rest], NodeTest, Acc, Ctx) -> 302 %% NewAcc = maybe_add_node(Smth, NodeTest, Acc, Ctx), - no attribs or texts 303 do_descendant_or_self(Rest, NodeTest, Acc, Ctx). 304 305 306%% Except text nodes 307test_node({wildcard, wildcard}, Element, _Ctx) when not is_binary(Element) -> 308 true; 309test_node({prefix_test, Prefix}, {Tag, _, _, _}, _Ctx) -> 310 test_ns_prefix(Tag, Prefix); 311test_node({prefix_test, Prefix}, {AttrName, _}, _Ctx) -> 312 test_ns_prefix(AttrName, Prefix); 313test_node({name, {Tag, _, _}}, {Tag, _, _, _}, _Ctx) -> 314 true; 315test_node({name, {AttrName, _, _}}, {AttrName, _}, _Ctx) -> 316 %% XXX: check this! 317 true; 318test_node({node_type, text}, Text, _Ctx) when is_binary(Text) -> 319 true; 320test_node({node_type, node}, {_, _, _, _}, _Ctx) -> 321 true; 322test_node({node_type, node}, Text, _Ctx) when is_binary(Text) -> 323 true; 324test_node({node_type, node}, {_, _}, _Ctx) -> 325 true; 326%% test_node({node_type, attribute}, {_, _}, _Ctx) -> 327%% true; [38] - attribute() not exists! 328test_node({node_type, comment}, {comment, _}, _Ctx) -> 329 true; 330test_node({node_type, processing_instruction}, {pi, _}, _Ctx) -> 331 true; 332test_node({processing_instruction, Name}, {pi, Node}, _Ctx) -> 333 NSize = size(Name), 334 case Node of 335 <<Name:NSize/binary, " ", _/binary>> -> 336 true; 337 _ -> 338 false 339 end; 340test_node(_Other, _N, _Ctx) -> 341 false. 342 343test_ns_prefix(Name, Prefix) -> 344 PSize = size(Prefix), 345 case Name of 346 <<Prefix:PSize/binary, ":", _/binary>> -> 347 true; 348 _ -> 349 false 350 end. 351 352%% Append Node to Acc only when NodeTest passed 353maybe_add_node(Node, NodeTest, Acc, Ctx) -> 354 case test_node(NodeTest, Node, Ctx) of 355 true -> 356 [Node | Acc]; 357 false -> 358 Acc 359 end. 360 361%% used for predicate indexing 362%% is_reverse_axis(ancestor) -> 363%% true; 364%% is_reverse_axis(ancestor_or_self) -> 365%% true; 366%% is_reverse_axis(preceding) -> 367%% true; 368%% is_reverse_axis(preceding_sibling) -> 369%% true; 370%% is_reverse_axis(_) -> 371%% flase. 372 373 374%% 375%% Predicates 376%% 377apply_predicates(Predicates,NodeList,Ctx) -> 378 lists:foldl(fun({pred, Pred} ,Nodes) -> 379 apply_predicate(Pred,Nodes,Ctx) 380 end, NodeList,Predicates). 381 382% special case: indexing 383apply_predicate({number,N}, NodeList, _Ctx) when length(NodeList) >= N -> 384 [lists:nth(N,NodeList)]; 385 386apply_predicate(Pred, NodeList,Ctx) -> 387 Size = length(NodeList), 388 Filter = fun(Node, {AccPosition, AccNodes0}) -> 389 Predicate = mochiweb_xpath_utils:boolean_value( 390 execute_expr(Pred,Ctx#ctx{ctx=[Node], position=AccPosition, size = Size})), 391 AccNodes1 = if Predicate -> [Node|AccNodes0]; 392 true -> AccNodes0 393 end, 394 {AccPosition+1, AccNodes1} 395 end, 396 {_, L} = lists:foldl(Filter,{1,[]},NodeList), 397 lists:reverse(L). 398 399 400%% 401%% Compare functions 402%% 403 404%% @see http://www.w3.org/TR/1999/REC-xpath-19991116 , section 3.4 405comp(CompFun,L,R) when is_list(L), is_list(R) -> 406 lists:any(fun(LeftValue) -> 407 lists:any(fun(RightValue)-> 408 CompFun(LeftValue,RightValue) 409 end, R) 410 end, L); 411comp(CompFun,L,R) when is_list(L) -> 412 lists:any(fun(LeftValue) -> CompFun(LeftValue,R) end,L); 413comp(CompFun,L,R) when is_list(R) -> 414 lists:any(fun(RightValue) -> CompFun(L,RightValue) end,R); 415comp(CompFun,L,R) -> 416 CompFun(L,R). 417 418-spec comp_fun(atom()) -> fun((indexed_xpath_return(), indexed_xpath_return()) -> boolean()). 419comp_fun('=') -> 420 fun 421 (A,B) when is_number(A) -> A == mochiweb_xpath_utils:number_value(B); 422 (A,B) when is_number(B) -> mochiweb_xpath_utils:number_value(A) == B; 423 (A,B) when is_boolean(A) -> A == mochiweb_xpath_utils:boolean_value(B); 424 (A,B) when is_boolean(B) -> mochiweb_xpath_utils:boolean_value(A) == B; 425 (A,B) -> mochiweb_xpath_utils:string_value(A) == mochiweb_xpath_utils:string_value(B) 426 end; 427 428comp_fun('!=') -> 429 fun(A,B) -> F = comp_fun('='), 430 not F(A,B) 431 end; 432 433comp_fun('>') -> 434 fun(A,B) -> 435 mochiweb_xpath_utils:number_value(A) > mochiweb_xpath_utils:number_value(B) 436 end; 437comp_fun('<') -> 438 fun(A,B) -> 439 mochiweb_xpath_utils:number_value(A) < mochiweb_xpath_utils:number_value(B) 440 end; 441comp_fun('<=') -> 442 fun(A,B) -> 443 mochiweb_xpath_utils:number_value(A) =< mochiweb_xpath_utils:number_value(B) 444 end; 445comp_fun('>=') -> 446 fun(A,B) -> 447 mochiweb_xpath_utils:number_value(A) >= mochiweb_xpath_utils:number_value(B) 448 end. 449 450%% 451%% Boolean functions 452%% 453 454bool_fun('and') -> 455 fun(A, B) -> 456 mochiweb_xpath_utils:boolean_value(A) 457 andalso mochiweb_xpath_utils:boolean_value(B) 458 end; 459bool_fun('or') -> 460 fun(A, B) -> 461 mochiweb_xpath_utils:boolean_value(A) 462 orelse mochiweb_xpath_utils:boolean_value(B) 463 end. 464%% TODO more boolean operators 465 466%% 467%% Arithmetic functions 468%% 469-spec arith(atom(), indexed_xpath_return(), indexed_xpath_return()) -> number(). 470arith('+', Arg1, Arg2) -> 471 mochiweb_xpath_utils:number_value(Arg1) 472 + mochiweb_xpath_utils:number_value(Arg2); 473arith('-', Arg1, Arg2) -> 474 mochiweb_xpath_utils:number_value(Arg1) 475 - mochiweb_xpath_utils:number_value(Arg2); 476arith('*', Arg1, Arg2) -> 477 mochiweb_xpath_utils:number_value(Arg1) 478 * mochiweb_xpath_utils:number_value(Arg2); 479arith('div', Arg1, Arg2) -> 480 mochiweb_xpath_utils:number_value(Arg1) 481 / mochiweb_xpath_utils:number_value(Arg2); 482arith('mod', Arg1, Arg2) -> 483 mochiweb_xpath_utils:number_value(Arg1) 484 rem mochiweb_xpath_utils:number_value(Arg2). 485 486%% 487%% Helpers 488%% 489 490%% @doc Add a position to each node 491%% @spec add_positions(Doc) -> ExtendedDoc 492%% @type ExtendedDoc = {atom(), [{binary(), any()}], [extended_node()], [non_neg_integer()]} 493-spec add_positions(html_node()) -> indexed_html_node(). 494add_positions(Node) -> 495 R = add_positions_aux(Node, []), 496 R. 497 498add_positions_aux({Tag,Attrs,Children}, Position) -> 499 {_, NewChildren} = lists:foldl(fun(Child, {Count, AccChildren}) -> 500 NewChild = add_positions_aux(Child, [Count | Position]), 501 {Count+1, [NewChild|AccChildren]} 502 end, {1, []}, Children), 503 {Tag, Attrs, lists:reverse(NewChildren), Position}; 504add_positions_aux(Data, _) -> 505 Data. 506 507%% @doc Remove position from each node 508%% @spec remove_positions(ExtendedDoc) -> Doc 509%% @type ExtendedDoc = {atom(), [{binary(), any()}], [extended_node()], [non_neg_integer()]} 510-spec remove_positions(indexed_xpath_return()) -> xpath_return(). 511remove_positions(Nodes) when is_list(Nodes) -> 512 [ remove_positions(SubNode) || SubNode <- Nodes ]; 513remove_positions({Tag, Attrs, Children, _}) -> 514 {Tag, Attrs, remove_positions(Children)}; 515remove_positions(Data) -> 516 Data. 517 518%% @doc Get node according to a position relative to root node 519%% @spec get_node_at(ExtendedDoc, Position) -> ExtendedDoc 520%% @type Position = [non_neg_integer()] 521%% @type ExtendedDoc = {atom(), [{binary(), any()}], [extended_node()], [non_neg_integer()]} 522get_node_at(Node, Position) -> 523 get_node_at_aux(Node, lists:reverse(Position)). 524 525get_node_at_aux(Node, []) -> 526 Node; 527get_node_at_aux({_,_,Children,_}, [Pos|Next]) -> 528 get_node_at_aux(lists:nth(Pos, Children), Next). 529 530%% @doc Get parent position 531%% @spec get_parent_position(Position) -> Position 532%% @type Position = [non_neg_integer()] 533get_parent_position([_|ParentPosition]) -> 534 ParentPosition. 535 536%% @doc Get position relative to my parent 537%% @spec get_self_position(Position) -> non_neg_integer() 538%% @type Position = [non_neg_integer()] 539get_position_in_parent([MyPosition|_]) -> 540 MyPosition. 541