1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2003-2016. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
21%% Description  : Token scanner for XPATH grammar
22
23%%%----------------------------------------------------------------------
24%%%
25%%% The XPATH grammar is a bit tricky, due to operator overloading.
26%%% This version of the scanner is based on the XPATH spec:
27%%% http://www.w3.org/TR/1999/REC-xpath-19991116 (XPATH version 1.0)
28%%%
29%%% Quote from the spec:
30%%%
31%%%  "The following special tokenization rules must be applied in the order
32%%%  specified to disambiguate the ExprToken grammar:
33%%%
34%%%  o If there is a preceding token and the preceding token is not one of
%%%    @, ::, (, [, , or an Operator, then a * must be recognized as a
36%%%    MultiplyOperator and an NCName must be recognized as an OperatorName
%%%  o If the character following an NCName (possibly after intervening
38%%%    ExprWhiteSpace) is (, then the token must be recognized as a NodeType
39%%%    or a FunctionName.
%%%  o If the two characters following an NCName (possibly after intervening
41%%%    ExprWhiteSpace) are ::, then the token must be recognized as an
42%%%    AxisName.
43%%%  o Otherwise, the token must not be recognized as a MultiplyOperator, an
44%%%    OperatorName, a NodeType, a FunctionName, or an AxisName."
45%%%----------------------------------------------------------------------
46
47-module(xmerl_xpath_scan).
48
49
50%% main API
51-export([tokens/1]).
52
53%% exported helper functions
54-export([scan_number/1]).
55
56-include("xmerl.hrl").
57
58-define(L, 1).
59
60
%% tokens(Str) -> [Token]
%%
%% Entry point of the scanner. Leading whitespace is removed from Str and
%% the remainder is scanned by tokens/2, starting with an empty token
%% accumulator.
tokens(Str) ->
    Stripped = strip_ws(Str),
    tokens(Stripped, []).
63
%% tokens(Str, Acc) -> [Token]
%%
%% Scanner loop. Acc holds the tokens scanned so far, most recent first.
%% When the input is exhausted the accumulator is reversed and terminated
%% with the '$end' token. A {rescan, NewStr} result from scan_token/2 adds
%% no token; scanning simply continues on NewStr with the same accumulator.
tokens([], Acc) ->
    %% Same result as reversing [EndToken | Acc].
    lists:reverse(Acc, [{'$end', ?L, '$end'}]);
tokens(Str, Acc) ->
    case scan_token(Str, Acc) of
        {rescan, Expanded} ->
            tokens(Expanded, Acc);
        {Token, Rest} ->
            tokens(strip_ws(Rest), [Token | Acc])
    end.
73
%% Expr Tokens
%%
%% scan_token(Str, Acc) -> {Token, Rest} | {rescan, NewStr}
%%
%% Scans one token from the front of Str. tokens/1,2 always strip leading
%% whitespace before calling this function. Acc is the list of tokens
%% scanned so far, most recent first; only its head is inspected, to decide
%% how a '*' or a name must be interpreted.
%%
%% Token is a 3-tuple {Category, ?L, Value}. {rescan, NewStr} is returned
%% for abbreviated syntax ("..", ".", "@", "//"): the abbreviation is
%% replaced by its unabbreviated form and NewStr is scanned instead of Str.
%%
%% The clause order matters: longer prefixes ("..", "//", "<=", ">=") must
%% be tried before their one-character counterparts.
scan_token("(" ++ T, _A) ->  {{'(', ?L, '('}, T};
scan_token(")" ++ T, _A) ->  {{')', ?L, ')'}, T};
scan_token("[" ++ T, _A) ->  {{'[', ?L, '['}, T};
scan_token("]" ++ T, _A) ->  {{']', ?L, ']'}, T};
%% ".." abbreviates the parent step.
scan_token(".." ++ T, _A) -> {rescan, "parent::node()" ++ T};
%% "@" abbreviates the attribute axis.
scan_token("@" ++ T, _A) ->  {rescan, "attribute::" ++ T};
scan_token("," ++ T, _A) ->  {{',', ?L, ','}, T};
scan_token("::" ++ T, _A) -> {{'::', ?L, '::'}, T};

%% operators
%% "//" abbreviates a descendant-or-self step between two "/".
scan_token("//" ++ T, _A) -> {rescan, "/descendant-or-self::node()/" ++ T};
scan_token("/" ++ T, _A) ->  {{'/', ?L, '/'}, T};
scan_token("|" ++ T, _A) ->  {{'|', ?L, '|'}, T};
scan_token("+" ++ T, _A) ->  {{'+', ?L, '+'}, T};
scan_token("-" ++ T, _A) ->  {{'-', ?L, '-'}, T};
scan_token("=" ++ T, _A) ->  {{'=', ?L, '='}, T};
scan_token("!=" ++ T, _A) -> {{'!=', ?L, '!='}, T};
scan_token("<=" ++ T, _A) -> {{'<=', ?L, '<='}, T};
scan_token("<" ++ T, _A) ->  {{'<', ?L, '<'}, T};
scan_token(">=" ++ T, _A) -> {{'>=', ?L, '>='}, T};
scan_token(">" ++ T, _A) ->  {{'>', ?L, '>'}, T};

%% "*" is the multiply operator only when it follows an operand; at the
%% start of the expression or after a special token it is a name test.
scan_token("*" ++ T, A) ->
    Tok = case operand_expected(A) of
              true  -> {'wildcard', ?L, 'wildcard'};
              false -> {'*', ?L, '*'}
          end,
    {Tok, T};

%% numbers
scan_token(Str = [H|_], _A) when H >= $0, H =< $9 ->
    scan_number(Str);
scan_token(Str = [$., H|_], _A) when H >= $0, H =< $9 ->
    %% The token accumulator must not be passed on here: the second
    %% argument of scan_number/2 is a digit accumulator, and handing it the
    %% token list made every ".5"-style number that was not the very first
    %% token crash in list_to_float/1.
    scan_number(Str);
%% A lone "." abbreviates the self step.
scan_token("." ++ T, _A) ->
    {rescan, "self::node()" ++ T};

%% Variable Reference
%% NOTE: the variable name is turned into an atom with list_to_atom/1, so
%% scanning expressions from untrusted sources can grow the atom table.
scan_token([$$|T], _A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case Prefix of
        [] ->
            {{var_reference, ?L, list_to_atom(Local)}, T1};
        _ ->
            {{var_reference, ?L, list_to_atom(Prefix++":"++Local)}, T1}
    end;

%% Literal, delimited by a pair of single or double quotes.
scan_token([H|T], _A) when H == $" ; H == $' ->
    {Literal, T1} = scan_literal(T, H, []),
    {{literal, ?L, Literal}, T1};

%% Anything else must be a name. After an operand it can only be an
%% operator name; otherwise it is classified by what follows it.
scan_token(T, A) ->
    {{Prefix, Local}, T1} = scan_name(T),
    case operand_expected(A) of
        false ->
            operator_name(Prefix, Local, T1);
        true ->
            %% The "(" or "::" that turns the name into a node type,
            %% function name or axis name may be preceded by whitespace, so
            %% strip it before classifying. This also applies when the name
            %% is the first token of the expression.
            other_name(Prefix, Local, strip_ws(T1))
    end.

%% operand_expected(Acc) -> boolean()
%%
%% True when there is no preceding token, or the preceding token is one
%% that special_token/1 recognizes. In that position a "*" is a name test
%% and a name is not an operator name.
operand_expected([{Prev, _, _}|_]) -> special_token(Prev);
operand_expected(_) -> true.
151
%% operator_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Builds the token for an unprefixed operator name. Only "and", "or",
%% "mod" and "div" are accepted; any other name, or any name with a
%% non-empty prefix, fails with function_clause.
operator_name([], Local, Rest)
  when Local =:= "and"; Local =:= "or"; Local =:= "mod"; Local =:= "div" ->
    %% Local is one of four fixed strings here, so the atom set is bounded.
    Op = list_to_atom(Local),
    {{Op, ?L, Op}, Rest}.
156
157
%% other_name(Prefix, Local, Rest) -> {Token, Rest1}
%%
%% Classifies a name that is not an operator name by looking at the input
%% that follows it:
%%   - empty local part followed by "*"  -> prefix test (NCName ':' '*')
%%   - followed by "("                   -> node type or function name
%%   - followed by "::"                  -> axis name
%%   - anything else                     -> plain (possibly prefixed) name
%% The "(" and "::" are left in the returned rest; only the "*" of a prefix
%% test is consumed.
other_name(Prefix, [], [$* | Rest]) ->
    %% [37] NameTest ::= '*' | NCName ':' '*' | QName
    {{prefix_test, ?L, Prefix}, Rest};
other_name(Prefix, Local, [$( | _] = Rest) ->
    node_type_or_function_name(Prefix, Local, Rest);
other_name(Prefix, Local, [$:, $: | _] = Rest) ->
    axis(Prefix, Local, Rest);
other_name(Prefix, Local, Rest) ->
    %% The atom is the qualified name; an empty prefix adds no ":".
    %% NOTE: list_to_atom/1 on scanned input can grow the atom table.
    QName = case Prefix of
                [] -> Local;
                _  -> Prefix ++ ":" ++ Local
            end,
    {{name, ?L, {list_to_atom(QName), Prefix, Local}}, Rest}.
169
170
171
%% node types
%%
%% node_type_or_function_name(Prefix, Local, Rest) -> {Token, Rest}
%%
%% Called by other_name/3 for a name that is followed by "(". The
%% unprefixed names "comment", "text" and "node" become node_type tokens,
%% "processing-instruction" gets a token category of its own, and every
%% other name becomes a function_name token. Rest is returned unchanged.
node_type_or_function_name([], "processing-instruction", Rest) ->
    {{'processing-instruction', ?L, 'processing-instruction'}, Rest};
node_type_or_function_name([], Local, Rest)
  when Local =:= "comment"; Local =:= "text"; Local =:= "node" ->
    {{node_type, ?L, list_to_atom(Local)}, Rest};
node_type_or_function_name(Prefix, Local, Rest) ->
    %% NOTE(review): prefix and local part are joined without a ":" here,
    %% unlike in other_name/3 - confirm that this is intended.
    {{function_name, ?L, list_to_atom(Prefix ++ Local)}, Rest}.
183
184
%% axis names
%%
%% axis(Prefix, Local, Rest) -> {{axis, ?L, Axis}, Rest}
%%
%% Called by other_name/3 for a name that is followed by "::". Maps the
%% unprefixed axis name to its atom, with "-" replaced by "_". Rest is
%% returned unchanged. An unknown or prefixed axis name fails with
%% function_clause.

%% the node itself and its attributes / namespaces
axis([], "self", Rest) ->               {{axis, ?L, self}, Rest};
axis([], "attribute", Rest) ->          {{axis, ?L, attribute}, Rest};
axis([], "namespace", Rest) ->          {{axis, ?L, namespace}, Rest};
%% upwards
axis([], "parent", Rest) ->             {{axis, ?L, parent}, Rest};
axis([], "ancestor", Rest) ->           {{axis, ?L, ancestor}, Rest};
axis([], "ancestor-or-self", Rest) ->   {{axis, ?L, ancestor_or_self}, Rest};
%% downwards
axis([], "child", Rest) ->              {{axis, ?L, child}, Rest};
axis([], "descendant", Rest) ->         {{axis, ?L, descendant}, Rest};
axis([], "descendant-or-self", Rest) -> {{axis, ?L, descendant_or_self}, Rest};
%% sideways
axis([], "following", Rest) ->          {{axis, ?L, following}, Rest};
axis([], "following-sibling", Rest) ->  {{axis, ?L, following_sibling}, Rest};
axis([], "preceding", Rest) ->          {{axis, ?L, preceding}, Rest};
axis([], "preceding-sibling", Rest) ->  {{axis, ?L, preceding_sibling}, Rest}.
199
200
201
202
%% scan_literal(Str, Delim, Acc) -> {Literal, Rest}
%%
%% Collects characters from Str up to the first occurrence of the
%% delimiter Delim. Acc holds the characters seen so far in reverse order.
%% The delimiter itself is consumed and is not part of Literal. There is no
%% clause for the end of input, so an unterminated literal fails with
%% function_clause.
scan_literal([Delim | Rest], Delim, Seen) ->
    {lists:reverse(Seen), Rest};
scan_literal([Char | Rest], Delim, Seen) ->
    scan_literal(Rest, Delim, [Char | Seen]).
207
208
%% scan_name(Str) -> {{Prefix, Local}, Rest}
%%
%% Scans a name from the front of Str and hands the remainder to
%% scan_prefix/2. A name must start with a letter (as decided by
%% xmerl_lib:is_letter/1), or with ":" or "_" followed by at least one more
%% character that is not whitespace. Anything else terminates the process
%% with exit({invalid_name, Excerpt}).
scan_name([H1, H2 | T]) when H1 == $: ; H1 == $_ ->
    name_after_punct(H1, H2, T);
scan_name([H | T] = Str) ->
    case xmerl_lib:is_letter(H) of
        false ->
            %% Report at most the first six characters of the input.
            exit({invalid_name, lists:sublist(Str, 1, 6)});
        true ->
            scan_prefix(T, [H])
    end;
scan_name(Str) ->
    exit({invalid_name, lists:sublist(Str, 1, 6)}).

%% Continues a name whose first character H1 is ":" or "_". The second
%% character H2 must not be whitespace; both characters seed the reversed
%% accumulator of scan_prefix/2.
name_after_punct(H1, H2, _T) when ?whitespace(H2) ->
    exit({invalid_name, [H1, H2, '...']});
name_after_punct(H1, H2, T) ->
    scan_prefix(T, [H2, H1]).
224
%% scan_prefix(Str, Acc) -> {{Prefix, Local}, Rest}
%%
%% Continues scanning a name whose first characters are in Acc (reversed).
%% The characters read are tentatively the prefix. If a single ":" follows,
%% they really are the prefix and the local part is scanned by
%% scan_local_part/2. In all other cases (end of input, whitespace, "::" or
%% a character that is not a name character) they form the local part of
%% an unprefixed name and the prefix is [].
scan_prefix([], Seen) ->
    unprefixed(Seen, []);
scan_prefix([C | _] = Rest, Seen) when ?whitespace(C) ->
    unprefixed(Seen, Rest);
scan_prefix([$:, $: | _] = Rest, Seen) ->
    %% "::" is the next token; it is left in the input.
    unprefixed(Seen, Rest);
scan_prefix([$: | AfterColon], Seen) ->
    {Local, Rest} = scan_local_part(AfterColon, []),
    {{lists:reverse(Seen), Local}, Rest};
scan_prefix([C | More] = Rest, Seen) ->
    case xmerl_lib:is_namechar(C) of
        false ->
            unprefixed(Seen, Rest);
        true ->
            scan_prefix(More, [C | Seen])
    end.

%% Result for a name without prefix: everything seen is the local part.
unprefixed(Seen, Rest) ->
    {{[], lists:reverse(Seen)}, Rest}.
243
%% scan_local_part(Str, Acc) -> {Local, Rest}
%%
%% Scans the local part of a prefixed name. Characters are collected in
%% Acc (reversed) until the end of input, whitespace, or a character for
%% which xmerl_lib:is_namechar/1 returns false. The terminating character
%% is left in Rest. Local may be empty.
scan_local_part([], Seen) ->
    {lists:reverse(Seen), []};
scan_local_part([C | _] = Rest, Seen) when ?whitespace(C) ->
    {lists:reverse(Seen), Rest};
scan_local_part([C | More] = Rest, Seen) ->
    case xmerl_lib:is_namechar(C) of
        false ->
            {lists:reverse(Seen), Rest};
        true ->
            scan_local_part(More, [C | Seen])
    end.
255
256
%% scan_number(Str) -> {{number, ?L, Number}, Rest}
%%
%% Scans an XPath number from the front of Str:
%%
%%   Number ::= Digits ('.' Digits?)? | '.' Digits
%%
%% Number is an integer when no "." was read and a float otherwise. Rest
%% is the input after the number. The call fails with badarg when Str does
%% not start with a number (no digits at all, or a "." without any digit
%% on either side).
scan_number(T) ->
    scan_number(T, []).

%% scan_number(Str, Acc) -> {{number, ?L, Number}, Rest}
%%
%% Acc holds the digits of the integer part read so far, in reverse order.
scan_number("." ++ T, Acc) ->
    {Fraction, T1} = scan_digits(T, []),
    {{number, ?L, digits_to_float(lists:reverse(Acc), Fraction)}, T1};
scan_number([H|T], Acc) when H >= $0, H =< $9 ->
    scan_number(T, [H|Acc]);
scan_number(T, Acc) ->
    %% End of input or a character that is not part of the number.
    {{number, ?L, list_to_integer(lists:reverse(Acc))}, T}.

%% digits_to_float(IntDigits, FractionDigits) -> float()
%%
%% list_to_float/1 needs at least one digit on both sides of the ".", but
%% XPath allows either side to be missing (".5", "1."), so the missing
%% side is filled with "0". With no digit on either side the input is not
%% a number and list_to_float/1 fails with badarg.
digits_to_float([], Fraction = [_|_]) ->
    list_to_float("0." ++ Fraction);
digits_to_float(Int = [_|_], []) ->
    list_to_float(Int ++ ".0");
digits_to_float(Int, Fraction) ->
    list_to_float(Int ++ "." ++ Fraction).
274
%% scan_digits(Str, Acc) -> {Digits, Rest}
%%
%% Takes the longest run of decimal digits from the front of Str. Acc is
%% given in reverse order; Digits is the reversed Acc followed by the
%% digits that were read. Rest is the input after the last digit.
scan_digits(Str, Acc) ->
    IsDigit = fun(C) -> C >= $0 andalso C =< $9 end,
    {Run, Rest} = lists:splitwith(IsDigit, Str),
    {lists:reverse(Acc, Run), Rest}.
281
282
%% strip_ws(Str) -> Rest
%%
%% Removes every leading character of Str that satisfies the ?whitespace
%% guard macro and returns what is left.
strip_ws(Str) ->
    lists:dropwhile(fun(C) when ?whitespace(C) -> true;
                       (_) -> false
                    end,
                    Str).
287
288
%% special_token(Category) -> boolean()
%%
%% Category is the first element of the previously scanned token. The
%% result is true for the tokens after which, according to the XPath
%% tokenization rules, a "*" must not be read as the multiply operator and
%% a name must not be read as an operator name: "::", ",", "(", "[" and all
%% operators.
%%
%% "@" and "//" need no clauses of their own: scan_token/2 rewrites them to
%% "attribute::" and "/descendant-or-self::node()/", so the preceding token
%% is then '::' or '/'.
special_token('::') -> true;
special_token(',') -> true;
special_token('(') -> true;
special_token('[') -> true;
special_token('/') -> true;
special_token('|') -> true;
special_token('+') -> true;
special_token('-') -> true;
%% The multiply operator is an operator too. The '*' category is only used
%% for multiplication; a name test "*" is scanned as 'wildcard'.
special_token('*') -> true;
special_token('=') -> true;
special_token('!=') -> true;
special_token('<') -> true;
special_token('<=') -> true;
special_token('>') -> true;
special_token('>=') -> true;
special_token('and') -> true;
special_token('or') -> true;
special_token('mod') -> true;
special_token('div') -> true;
special_token(_) -> false.
310