%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2003-2016. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%

%% Description  : Token scanner for XPATH grammar

%%%----------------------------------------------------------------------
%%%
%%% The XPATH grammar is a bit tricky, due to operator overloading.
%%% This version of the scanner is based on the XPATH spec:
%%% http://www.w3.org/TR/1999/REC-xpath-19991116 (XPATH version 1.0)
%%%
%%% Quote from the spec:
%%%
%%% "The following special tokenization rules must be applied in the order
%%%  specified to disambiguate the ExprToken grammar:
%%%
%%%  o If there is a preceding token and the preceding token is not one of
%%%    @, ::, (, [, , or an Operator, then a * must be recognized as a
%%%    MultiplyOperator and an NCName must be recognized as an OperatorName
%%%  o If the character following an NCName (possibly after intervening
%%%    ExprWhiteSpace) is (, then the token must be recognized as a NodeType
%%%    or a FunctionName.
%%%  o If the two characters following an NCName (possibly after intervening
%%%    ExprWhiteSpace) are ::, then the token must be recognized as an
%%%    AxisName.
%%%  o Otherwise, the token must not be recognized as a MultiplyOperator, an
%%%    OperatorName, a NodeType, a FunctionName, or an AxisName."
%%%----------------------------------------------------------------------

-module(xmerl_xpath_scan).


%% main API
-export([tokens/1]).

%% exported helper functions
-export([scan_number/1]).

-include("xmerl.hrl").

%% Every token carries this constant in its second (line) field.
-define(L, 1).


%% tokens(Str) -> [Token]
%%
%% Scans the XPath expression string Str into a list of
%% {Category, ?L, Value} tuples.  The list always ends with
%% {'$end', ?L, '$end'}.  Leading whitespace and whitespace between
%% tokens is skipped.
tokens(Str) ->
    tokens(strip_ws(Str), []).

%% Acc holds the tokens scanned so far, most recent first; scan_token/2
%% inspects its head to apply the disambiguation rules.
tokens([], Acc) ->
    lists:reverse([{'$end', ?L, '$end'}|Acc]);
tokens(Str, Acc) ->
    case scan_token(Str, Acc) of
        {rescan, NewStr} ->
            %% An abbreviation was expanded in place; scan the expanded
            %% text without emitting a token for the abbreviation itself.
            tokens(NewStr, Acc);
        {Token, T} ->
            tokens(strip_ws(T), [Token|Acc])
    end.

%% scan_token(Str, Acc) -> {Token, Rest} | {rescan, NewStr}
%%
%% Scans one token from the head of Str.  Acc is the reversed list of
%% tokens already produced.  Clause order matters: longer lexemes
%% ("..", "//", "<=", ">=") must be tried before their one-character
%% prefixes.

%% Expr Tokens
scan_token("(" ++ T, _A) -> {{'(', ?L, '('}, T};
scan_token(")" ++ T, _A) -> {{')', ?L, ')'}, T};
scan_token("[" ++ T, _A) -> {{'[', ?L, '['}, T};
scan_token("]" ++ T, _A) -> {{']', ?L, ']'}, T};
%% Abbreviated steps are rewritten to their unabbreviated form and rescanned.
scan_token(".." ++ T, _A) -> {rescan, "parent::node()" ++ T};
scan_token("@" ++ T, _A) -> {rescan, "attribute::" ++ T};
scan_token("," ++ T, _A) -> {{',', ?L, ','}, T};
scan_token("::" ++ T, _A) -> {{'::', ?L, '::'}, T};

%% operators
scan_token("//" ++ T, _A) -> {rescan, "/descendant-or-self::node()/" ++ T};
scan_token("/" ++ T, _A) -> {{'/', ?L, '/'}, T};
scan_token("|" ++ T, _A) -> {{'|', ?L, '|'}, T};
scan_token("+" ++ T, _A) -> {{'+', ?L, '+'}, T};
scan_token("-" ++ T, _A) -> {{'-', ?L, '-'}, T};
scan_token("=" ++ T, _A) -> {{'=', ?L, '='}, T};
scan_token("!=" ++ T, _A) -> {{'!=', ?L, '!='}, T};
scan_token("<=" ++ T, _A) -> {{'<=', ?L, '<='}, T};
scan_token("<" ++ T, _A) -> {{'<', ?L, '<'}, T};
scan_token(">=" ++ T, _A) -> {{'>=', ?L, '>='}, T};
scan_token(">" ++ T, _A) -> {{'>', ?L, '>'}, T};

%% "*" is the multiply operator only where an operator is expected;
%% everywhere else it is the wildcard name test.
scan_token("*" ++ T, A) ->
    Tok = case operator_expected(A) of
              true -> {'*', ?L, '*'};
              false -> {'wildcard', ?L, 'wildcard'}
          end,
    {Tok, T};

%% numbers
scan_token(Str = [H|_], _A) when H >= $0, H =< $9 ->
    scan_number(Str);
scan_token(Str = [$., H|_], _A) when H >= $0, H =< $9 ->
    %% Leading-dot form ('.' Digits).  The digit accumulator must start
    %% empty; passing the token accumulator here made any ".5" that was
    %% not the first token crash in list_to_float/1.
    scan_number(Str);
scan_token("." ++ T, _A) ->
    {rescan, "self::node()" ++ T};

%% Variable Reference
%% NOTE(review): list_to_atom/1 is applied to text taken from the
%% expression here and in other_name/3 -- confirm that callers do not
%% feed unbounded untrusted expressions.
scan_token([$$|T], _A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case Prefix of
        [] ->
            {{var_reference, ?L, list_to_atom(Local)}, T1};
        _ ->
            {{var_reference, ?L, list_to_atom(Prefix++":"++Local)}, T1}
    end;

%% Literal, delimited by either quote character.
scan_token([H|T], _A) when H == $" ; H == $' ->
    {Literal, T1} = scan_literal(T, H, []),
    {{literal, ?L, Literal}, T1};

%% Anything else must start a name.
scan_token(T, A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case operator_expected(A) of
        true ->
            operator_name(Prefix, Local, T1);
        false ->
            %% "(" and "::" may follow the name after intervening
            %% whitespace, so strip it before classifying the name.  This
            %% now also happens for the very first token.
            other_name(Prefix, Local, strip_ws(T1))
    end.

%% operator_expected(Acc) -> boolean()
%%
%% True when there is a preceding token and special_token/1 does not
%% accept its category; in that position "*" and names are operators.
operator_expected([{Kind, _, _}|_]) -> not special_token(Kind);
operator_expected(_) -> false.

%% operator_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Maps an unprefixed operator name to its token.  There is deliberately
%% no catch-all clause: any other name in operator position fails with
%% function_clause.
operator_name([], "and", T) -> {{'and', ?L, 'and'}, T};
operator_name([], "or", T) -> {{'or', ?L, 'or'}, T};
operator_name([], "mod", T) -> {{'mod', ?L, 'mod'}, T};
operator_name([], "div", T) -> {{'div', ?L, 'div'}, T}.


%% other_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Classifies a name that is not in operator position by looking at what
%% follows it: "*" after an empty local part is a prefix test, "(" makes
%% it a node type or function name, "::" makes it an axis name, and
%% anything else leaves it a plain name test.
other_name(Prefix, [], "*" ++ T) ->
    %% [37] NameTest ::= '*' | NCName ':' '*' | QName
    {{prefix_test, ?L, Prefix}, T};
other_name(Prefix, Local, T = "(" ++ _) ->
    node_type_or_function_name(Prefix, Local, T);
other_name(Prefix, Local, T = "::" ++ _) ->
    axis(Prefix, Local, T);
other_name([], Local, T) ->
    {{name, ?L, {list_to_atom(Local), [], Local}}, T};
other_name(Prefix, Local, T) ->
    {{name, ?L, {list_to_atom(Prefix++":"++Local), Prefix, Local}}, T}.



%% node types
%%
%% node_type_or_function_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Classifies a name that is followed by "(".  The unprefixed names
%% comment, text and node become node_type tokens,
%% processing-instruction gets a token category of its own, and every
%% other name becomes a function_name token whose atom is Prefix and
%% Local concatenated without a separator.
node_type_or_function_name([], "processing-instruction", Rest) ->
    {{'processing-instruction', ?L, 'processing-instruction'}, Rest};
node_type_or_function_name([], Local, Rest)
  when Local =:= "comment"; Local =:= "text"; Local =:= "node" ->
    {{node_type, ?L, list_to_atom(Local)}, Rest};
node_type_or_function_name(Prefix, Local, Rest) ->
    {{function_name, ?L, list_to_atom(Prefix ++ Local)}, Rest}.


%% axis names
%%
%% axis(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Maps an unprefixed axis name to an axis token; hyphens in the name
%% become underscores in the atom.  Any other name fails with
%% function_clause.
axis([], "ancestor", Rest) ->
    {{axis, ?L, ancestor}, Rest};
axis([], "ancestor-or-self", Rest) ->
    {{axis, ?L, ancestor_or_self}, Rest};
axis([], "attribute", Rest) ->
    {{axis, ?L, attribute}, Rest};
axis([], "child", Rest) ->
    {{axis, ?L, child}, Rest};
axis([], "descendant", Rest) ->
    {{axis, ?L, descendant}, Rest};
axis([], "descendant-or-self", Rest) ->
    {{axis, ?L, descendant_or_self}, Rest};
axis([], "following", Rest) ->
    {{axis, ?L, following}, Rest};
axis([], "following-sibling", Rest) ->
    {{axis, ?L, following_sibling}, Rest};
axis([], "namespace", Rest) ->
    {{axis, ?L, namespace}, Rest};
axis([], "parent", Rest) ->
    {{axis, ?L, parent}, Rest};
axis([], "preceding", Rest) ->
    {{axis, ?L, preceding}, Rest};
axis([], "preceding-sibling", Rest) ->
    {{axis, ?L, preceding_sibling}, Rest};
axis([], "self", Rest) ->
    {{axis, ?L, self}, Rest}.


%% scan_literal(Str, Delim, Acc) -> {Literal, Rest}
%%
%% Collects characters up to the closing delimiter Delim, which is
%% consumed but not included in Literal.  There is no clause for
%% running out of input, so an unterminated literal fails with
%% function_clause.
scan_literal([Delim|Rest], Delim, Seen) ->
    {lists:reverse(Seen), Rest};
scan_literal([Char|Rest], Delim, Seen) ->
    scan_literal(Rest, Delim, [Char|Seen]).


%% scan_name(Str) -> {{Prefix, Local}, Rest}
%%
%% Scans a possibly prefixed name from the head of Str.  The name must
%% start with a letter, or with ":" or "_" followed by at least one more
%% character that is not whitespace; otherwise the process exits with
%% {invalid_name, Excerpt}.
scan_name([First, Second|Rest]) when First == $: ; First == $_ ->
    scan_name_after_punct(First, Second, Rest);
scan_name([First|Rest] = Str) ->
    case xmerl_lib:is_letter(First) of
        false ->
            exit({invalid_name, lists:sublist(Str, 1, 6)});
        true ->
            scan_prefix(Rest, [First])
    end;
scan_name(Str) ->
    exit({invalid_name, lists:sublist(Str, 1, 6)}).

%% The name started with ":" or "_".  The second character is accepted
%% as part of the name unless it is whitespace.
scan_name_after_punct(First, Second, _Rest) when ?whitespace(Second) ->
    exit({invalid_name, [First, Second, '...']});
scan_name_after_punct(First, Second, Rest) ->
    scan_prefix(Rest, [Second, First]).


%% scan_prefix(Str, Acc) -> {{Prefix, Local}, Rest}
%%
%% Continues scanning a name whose first characters are already in Acc
%% (reversed).  If a single ":" is met, the characters so far are the
%% prefix and the local part is scanned after it.  If the name ends
%% first (end of input, whitespace, "::" or a non-name character) the
%% prefix is [] and the characters so far are the local part.
scan_prefix([], Acc) ->
    {{[], lists:reverse(Acc)}, []};
scan_prefix(Str = [H|_], Acc) when ?whitespace(H) ->
    {{[], lists:reverse(Acc)}, Str};
scan_prefix(T = "::" ++ _, Acc) ->
    %% "::" is the next token (axis separator), not a prefix separator.
    {{[], lists:reverse(Acc)}, T};
scan_prefix(":" ++ T, Acc) ->
    {LocalPart, T1} = scan_local_part(T, []),
    Prefix = lists:reverse(Acc),
    {{Prefix, LocalPart}, T1};
scan_prefix(Str = [H|T], Acc) ->
    case xmerl_lib:is_namechar(H) of
        true ->
            scan_prefix(T, [H|Acc]);
        false ->
            {{[], lists:reverse(Acc)}, Str}
    end.

%% scan_local_part(Str, Acc) -> {Local, Rest}
%%
%% Collects name characters until end of input, whitespace or the first
%% character xmerl_lib:is_namechar/1 rejects.  Local may be empty.
scan_local_part([], Acc) ->
    {lists:reverse(Acc), []};
scan_local_part(Str = [H|_], Acc) when ?whitespace(H) ->
    {lists:reverse(Acc), Str};
scan_local_part(Str = [H|T], Acc) ->
    case xmerl_lib:is_namechar(H) of
        true ->
            scan_local_part(T, [H|Acc]);
        false ->
            {lists:reverse(Acc), Str}
    end.


%% scan_number(Str) -> {{number, ?L, Number}, Rest}
%%
%% Scans an XPath number from the head of Str:
%%   Number ::= Digits ('.' Digits?)? | '.' Digits
%% Number is an integer when no "." is present and a float otherwise.
%% Input that does not start with a digit or with "." followed by a
%% digit fails with badarg.
scan_number(T) ->
    scan_number(T, []).

%% Acc holds the integer-part digits seen so far, most recent first.
scan_number([], Acc) ->
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, []};
scan_number("." ++ T, Acc) ->
    {Fraction, T1} = scan_digits(T, []),
    {{number, ?L, decimal_to_float(lists:reverse(Acc), Fraction)}, T1};
scan_number([H|T], Acc) when H >= $0, H =< $9 ->
    scan_number(T, [H|Acc]);
scan_number(T, Acc) ->
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, T}.

%% decimal_to_float(IntegerDigits, FractionDigits) -> float()
%%
%% list_to_float/1 needs digits on both sides of the point, so the
%% missing side is padded with "0".  A bare "." (both sides empty) still
%% fails with badarg, as before.  "1." previously crashed; it now
%% yields 1.0.
decimal_to_float([], Fraction) ->
    list_to_float("0." ++ Fraction);
decimal_to_float(Integer, []) ->
    list_to_float(Integer ++ ".0");
decimal_to_float(Integer, Fraction) ->
    list_to_float(Integer ++ "." ++ Fraction).

%% scan_digits(Str, Acc) -> {Digits, Rest}
%%
%% Prepends the leading run of decimal digits in Str to Acc and returns
%% the reversed accumulator together with the unconsumed input.
scan_digits([], Acc) ->
    {lists:reverse(Acc), []};
scan_digits([H|T], Acc) when H >= $0, H =< $9 ->
    scan_digits(T, [H|Acc]);
scan_digits(T, Acc) ->
    {lists:reverse(Acc), T}.


%% strip_ws(Str) -> Rest
%%
%% Drops leading characters that the ?whitespace macro classifies as
%% whitespace.
strip_ws([H|T]) when ?whitespace(H) ->
    strip_ws(T);
strip_ws(T) ->
    T.


%% special_token(TokenCategory) -> boolean()
%%
%% True for the token categories after which "*" is a wildcard and a
%% name is NOT an operator name: "::", ",", "(", "[" and every operator.
%%
%% "@" and "//" are not listed because scan_token/2 rewrites them to
%% "attribute::" and "/descendant-or-self::node()/" before any token is
%% produced, so those categories never appear in the token list.
%%
%% The multiply operator '*' is an operator like any other and must be
%% listed too.  Without it, the name after a multiplication ("a * b")
%% was sent to operator_name/3 and failed with function_clause.  The
%% wildcard has its own category ('wildcard') and stays non-special.
special_token('::') -> true;
special_token(',') -> true;
special_token('(') -> true;
special_token('[') -> true;
special_token('/') -> true;
special_token('|') -> true;
special_token('+') -> true;
special_token('-') -> true;
special_token('*') -> true;
special_token('=') -> true;
special_token('!=') -> true;
special_token('<') -> true;
special_token('<=') -> true;
special_token('>') -> true;
special_token('>=') -> true;
special_token('and') -> true;
special_token('or') -> true;
special_token('mod') -> true;
special_token('div') -> true;
special_token(_) -> false.