1%% ``Licensed under the Apache License, Version 2.0 (the "License");
2%% you may not use this file except in compliance with the License.
3%% You may obtain a copy of the License at
4%%
5%%     http://www.apache.org/licenses/LICENSE-2.0
6%%
7%% Unless required by applicable law or agreed to in writing, software
8%% distributed under the License is distributed on an "AS IS" BASIS,
9%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10%% See the License for the specific language governing permissions and
11%% limitations under the License.
12%%
13%% The Initial Developer of the Original Code is Ericsson Utvecklings
14%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
15%% Utvecklings AB. All Rights Reserved.''
16%%
17%% @private
18%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
19%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
20%% @author Richard Carlsson <carlsson.richard@gmail.com>
21%% @see edoc
22%% @end
23
24%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
25%% {@link //stdlib/erl_scan}.
26
27-module(edoc_scanner).
28
29%% NOTE: the interface to this module is ancient and should be updated.
30%% Please do not regard these exported functions as stable. Their
31%% behaviour is described in the documentation of the module `erl_scan'.
32%%
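%% Since there are no `full stop' tokens in EDoc specifications, a
%% `tokens' function in the style of `erl_scan' would *always* return
%% `{more, Continuation}' unless an error occurs. (No such function is
%% exported from this module; only `string/1', `string/2' and
%% `format_error/1' are.)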
36
37-export([string/1,string/2,format_error/1]).
38
39-import(lists, [reverse/1]).
40
%% string(CharList)
%%  Tokenise CharList with the start position set to 1; the result is
%%  whatever string/2 returns for that position.
string(Chars) ->
    string(Chars, 1).
42
%% string(CharList, StartPos)
%%  Tokenise CharList, counting positions from StartPos.
%%  Returns {ok,Tokens,StartPos} or {error,ErrorInfo,StartPos}. Note that
%%  the third element is always the start position that was passed in,
%%  not the position reached by the scan.
string(Chars, StartPos) ->
    Outcome = scan(Chars, StartPos),
    case Outcome of
        {error, ErrorInfo} -> {error, ErrorInfo, StartPos};
        {ok, Tokens} -> {ok, Tokens, StartPos}
    end.
48
%% format_error(ErrorDescriptor)
%%  Turn the descriptor part of a scan error (the third element of the
%%  {Pos,edoc_scanner,Descriptor} tuple built by scan_error/2) into
%%  printable character data. The result may be a deep list, so flatten
%%  it or pass it to an io function before comparing it with a string.
%%  Descriptors that are not recognised are printed as plain terms.

format_error(Descriptor) ->
    case Descriptor of
        {string, Quote, Head} ->
            ["unterminated string starting with "
             ++ io_lib:write_string(Head, Quote)];
        {illegal, Type} ->
            io_lib:format("illegal ~w", [Type]);
        char ->
            "unterminated character";
        scan ->
            "premature end";
        {base, Base} ->
            io_lib:format("illegal base '~w'", [Base]);
        float ->
            "bad float";
        Other ->
            io_lib:write(Other)
    end.
61
%% Reserved words: these are scanned as keyword tokens of the form
%% {Word,Pos} instead of as atoms. `where' is the only one.
reserved(Word) ->
    Word =:= 'where'.
65
%% scan(CharList, StartPos)
%%  Tokenise a whole list of characters, starting from an empty token
%%  stack at position StartPos. All the work is done by scan1/3, which
%%  collects the tokens in reverse order and reverses them at the end.
%%
%%  Returns:
%%	{ok,[Tok]}
%%	{error,{ErrorPos,edoc_scanner,What}}

scan(Chars, StartPos) ->
    scan1(Chars, [], StartPos).
79
%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.
%%
%%  Tokens are pushed onto TokenStack (so the stack is in reverse order)
%%  and the stack is reversed once the input is exhausted. Position is
%%  advanced by one for every newline seen between tokens and is recorded
%%  in each token.
%%
%%  Returns {ok,[Tok]} or {error,{ErrorPos,edoc_scanner,What}}. Every
%%  failure reported by the string and character helpers is turned into
%%  such an error tuple; this includes bad escape sequences and errors
%%  that occur somewhere after a character constant.

scan1([$\n|Cs], Toks, Pos) ->                           % Newline
    scan1(Cs, Toks, Pos+1);
scan1([C|Cs], Toks, Pos) when C >= 0, C =< $\s ->       % Skip blanks
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->       % Unquoted atom
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\337, C =< $\377, C /= $\367 ->
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       % Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    % Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    % Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->       % Variables
    scan_variable(C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                            % Variables
    scan_variable($_, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\300, C =< $\336, C /= $\327 ->
    scan_variable(C, Cs, Toks, Pos);
scan1([$$|Cs], Toks, Pos) ->                            % Character constant
    %% scan_char_const/3 goes on to scan the rest of the input itself, so
    %% what comes back is either the outcome of the whole remaining scan
    %% (passed through untouched, whether it succeeded or failed) or a
    %% bare reason from reading the character.
    case scan_char_const(Cs, Toks, Pos) of
        {ok, _} = Done ->
            Done;
        {error, {_, _, _}} = ScanError ->
            ScanError;
        {error, Reason} ->
            scan_error(scan1_char_error(Reason, char), Pos)
    end;
scan1([$'|Cs0], Toks, Pos) ->                           % Quoted atom
    case scan_string(Cs0, $', Pos) of
        {S,Cs1,Pos1} ->
            %% Only the conversion is protected; the recursive call in
            %% the `of' part is outside the try.
            try list_to_atom(S) of
                A ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
            catch
                error:_ ->
                    scan_error({illegal,atom}, Pos)
            end;
        {error, premature_end} ->
            scan_error({string,$',Cs0}, Pos);
        {error, Reason} ->
            scan_error(scan1_char_error(Reason, atom), Pos)
    end;
scan1([$"|Cs0], Toks, Pos) ->                           % String
    case scan_string(Cs0, $", Pos) of
        {S,Cs1,Pos1} ->
            case Toks of
                [{string,Pos0,S0}|Toks1] ->
                    %% A string directly following another string token
                    %% is merged into it; the merged token keeps the
                    %% position of the first string.
                    scan1(Cs1, [{string,Pos0,S0 ++ S}|Toks1], Pos1);
                _ ->
                    scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
            end;
        {error, premature_end} ->
            scan_error({string,$",Cs0}, Pos);
        {error, Reason} ->
            scan_error(scan1_char_error(Reason, string), Pos)
    end;
%% Punctuation characters and operators, first recognise multiples.
scan1([$=,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'=>',Pos}|Toks], Pos);
scan1([$<,$<|Cs], Toks, Pos) ->
    scan1(Cs, [{'<<',Pos}|Toks], Pos);
scan1([$>,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'>>',Pos}|Toks], Pos);
scan1([$-,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1([$:,$=|Cs], Toks, Pos) ->
    scan1(Cs, [{':=',Pos}|Toks], Pos);
scan1([$:,$:|Cs], Toks, Pos) ->
    scan1(Cs, [{'::',Pos}|Toks], Pos);
scan1([$/,$/|Cs], Toks, Pos) ->
    scan1(Cs, [{'//',Pos}|Toks], Pos);
scan1([$.,$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'...',Pos}|Toks], Pos);
scan1([$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'..',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                             % Punctuation character
    %% Any other single character becomes a token named after itself.
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _Pos) ->
    {ok, lists:reverse(Toks0)}.

%% scan1_char_error(Reason, Type)
%%  Translate a bare failure reason from scan_char/2 or scan_escape/2
%%  into the error descriptor for a token of kind Type (char, atom or
%%  string). A truncated character is reported as `char' whatever the
%%  token kind; everything else makes the token illegal.
scan1_char_error(truncated_char, _Type) -> char;
scan1_char_error(illegal_character, Type) -> {illegal, Type};
scan1_char_error(illegal_control_character, Type) -> {illegal, Type};
scan1_char_error(undefined_escape_sequence, Type) -> {illegal, Type}.
169
%% scan_variable(FirstChar, CharList, TokenStack, Pos)
%%  Scan a name that starts like a variable and carry on scanning.
%%  Note that a lone `_' is not accepted as a variable token: it becomes
%%  {an_var,Pos,'_'}. Any other name becomes {var,Pos,Name}. If the name
%%  cannot be turned into an atom, the result is an {illegal,variable}
%%  scan error.
scan_variable(C, Cs, Toks, Pos) ->
    {RevTail, Rest} = scan_name(Cs, []),
    case [C | lists:reverse(RevTail)] of
        "_" ->
            scan1(Rest, [{an_var, Pos, '_'} | Toks], Pos);
        Name ->
            try list_to_atom(Name) of
                Var ->
                    scan1(Rest, [{var, Pos, Var} | Toks], Pos)
            catch
                error:_ ->
                    scan_error({illegal, variable}, Pos)
            end
    end.
185
%% scan_atom(FirstChar, CharList, TokenStack, Pos)
%%  Scan an unquoted atom and carry on scanning. A reserved word (see
%%  reserved/1) becomes a {Word,Pos} token, any other name becomes
%%  {atom,Pos,Name}. If the name cannot be turned into an atom, the
%%  result is an {illegal,token} scan error.
scan_atom(C, Cs, Toks, Pos) ->
    {RevTail, Rest} = scan_name(Cs, []),
    Name = [C | lists:reverse(RevTail)],
    try list_to_atom(Name) of
        Word ->
            Tok = case reserved(Word) of
                      true -> {Word, Pos};
                      false -> {atom, Pos, Word}
                  end,
            scan1(Rest, [Tok | Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal, token}, Pos)
    end.
200
%% scan_name(CharList, RevAcc)
%%  Take the longest prefix of CharList that consists of name characters
%%  (as decided by name_char/1). Returns {RevName,Rest}, where RevName is
%%  the consumed characters in reverse order, stacked on top of RevAcc,
%%  and Rest is the input that was not consumed.

scan_name(Cs, Ncs) ->
    {Name, Rest} = lists:splitwith(fun name_char/1, Cs),
    {lists:reverse(Name, Ncs), Rest}.
212
%% name_char(Char)
%%  True if Char may appear inside an unquoted atom or a variable name:
%%  ASCII letters and digits, `_', `@', and the codes 16#C0-16#FF except
%%  for 16#D7 and 16#F7.
name_char(C) when C >= $a, C =< $z;
                  C >= $A, C =< $Z;
                  C >= $0, C =< $9;
                  C >= 16#DF, C =< 16#FF, C /= 16#F7;
                  C >= 16#C0, C =< 16#DE, C /= 16#D7;
                  C =:= $_;
                  C =:= $@ ->
    true;
name_char(_) ->
    false.
221
%% scan_string(CharList, QuoteChar, Pos)
%%  Read the contents of a quoted string or quoted atom up to the closing
%%  QuoteChar, starting from an empty accumulator. CharList is the input
%%  after the opening quote.
%%  Returns {StringChars,RestChars,NewPos}, or the {error,Reason} tuple
%%  produced by scan_string/4.

scan_string(Chars, Quote, Pos) ->
    scan_string(Chars, [], Quote, Pos).
227
%% scan_string(CharList, RevAcc, QuoteChar, Pos)
%%  Worker for scan_string/3. Characters are read one at a time with
%%  scan_char/2 and stacked on RevAcc until an unescaped QuoteChar is
%%  found. Returns {StringChars,RestChars,Pos}; {error,premature_end} if
%%  the input runs out before the closing quote; or, unchanged, any
%%  failure returned by scan_char/2.
scan_string([], _RevAcc, _Quote, _Pos) ->
    {error, premature_end};
scan_string([Quote | Rest], RevAcc, Quote, Pos) ->
    {lists:reverse(RevAcc), Rest, Pos};
scan_string(Chars, RevAcc, Quote, Pos) ->
    case scan_char(Chars, Pos) of
        {Char, Rest, Pos1} ->
            %% The string is only built up here.
            scan_string(Rest, [Char | RevAcc], Quote, Pos1);
        Failure ->
            Failure
    end.
240
%% scan_char_const(CharList, TokenStack, Pos)
%%  Read the character that follows a `$', push a {char,Pos,Value} token
%%  and carry on scanning the rest of the input with scan1/3, whose
%%  result is returned. A literal space right after the `$' is not
%%  allowed and gives {error,illegal_character}; a failure from
%%  scan_char/2 is returned unchanged.
scan_char_const([$\s | _Rest], _Toks, _Pos) ->
    {error, illegal_character};
scan_char_const(Chars, Toks, Pos) ->
    case scan_char(Chars, Pos) of
        {Char, Rest, Pos1} ->
            scan1(Rest, [{char, Pos, Char} | Toks], Pos1);
        Failure ->
            Failure
    end.
251
%% {Character,RestChars,NewPos} = scan_char(Chars, Pos)
%%  Read a single character from a string or character constant.
%%  A backslash introduces an escape sequence, which is decoded by
%%  scan_escape/2. Control characters (codes up to and including 16#1f)
%%  are not allowed and give {error,illegal_character}; empty input gives
%%  {error,truncated_char}. The position is returned as it was passed in.

scan_char([], _Pos) ->
    {error, truncated_char};
scan_char([$\\ | Rest], Pos) ->
    scan_escape(Rest, Pos);
scan_char([Char | Rest], Pos) when Char > 16#1f ->
    {Char, Rest, Pos};
scan_char([_Control | _Rest], _Pos) ->
    {error, illegal_character}.
265
%% The following conforms to Standard Erlang escape sequences.

%% ?HEX(C): true if C is the character code of a hexadecimal digit
%% (0-9, A-F or a-f). The expansion uses only comparisons combined with
%% andalso/orelse, so it can be used in guards. Both the argument and the
%% whole expansion are parenthesised so that the macro keeps its meaning
%% when it is combined with other operators (for example `not ?HEX(C)')
%% or given a compound argument.
-define(HEX(C), ((C) >= $0 andalso (C) =< $9 orelse
                 (C) >= $A andalso (C) =< $F orelse
                 (C) >= $a andalso (C) =< $f)).

%% ?UNICODE(C): true if C is accepted as a character code by the
%% \x{...} escape: 0 up to 16#10FFFF, excluding the range 16#D800-16#DFFF
%% and the two codes 16#FFFE and 16#FFFF.
-define(UNICODE(C),
         ((C) >= 0 andalso (C) < 16#D800 orelse
          (C) > 16#DFFF andalso (C) < 16#FFFE orelse
          (C) > 16#FFFF andalso (C) =< 16#10FFFF)).
276
%% scan_escape(CharList, Pos)
%%  Decode the escape sequence that follows a backslash. CharList is the
%%  input after the backslash. Recognised forms:
%%    \ooo, \oo, \o   - one to three octal digits (a three digit form
%%                      must start with 0-3)
%%    \x{H...}        - hexadecimal character code, decoded by scan_hex/3
%%    \xHH            - exactly two hexadecimal digits
%%    \^X             - control character, X in the range `@' to `_'
%%    \b \d \e ...    - the single character escapes of escape_char/1
%%  Returns {Value,RestChars,Pos}, or {error,Reason} where Reason is
%%  truncated_char (nothing follows the backslash),
%%  illegal_control_character or undefined_escape_sequence.

scan_escape([O1, O2, O3 | Cs], Pos) when        % \ooo
  O1 >= $0, O1 =< $3, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
    {(O1 - $0)*64 + (O2 - $0)*8 + (O3 - $0), Cs, Pos};
scan_escape([O1, O2 | Cs], Pos) when            % \oo
  O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7 ->
    {(O1 - $0)*8 + (O2 - $0), Cs, Pos};
scan_escape([O1 | Cs], Pos) when                % \o
  O1 >= $0, O1 =< $7 ->
    {O1 - $0, Cs, Pos};
scan_escape([$x, ${ | Cs], Pos) ->              % \x{H...}
    scan_hex(Cs, Pos, []);
scan_escape([$x, H1, H2 | Cs], Pos) when ?HEX(H1), ?HEX(H2) ->  % \xHH
    %% Convert with list_to_integer/2 so that the letter digits A-F and
    %% a-f get their proper values; arithmetic relative to $0 is only
    %% right for the digits 0-9. The guard guarantees that both
    %% characters are valid hexadecimal digits.
    {erlang:list_to_integer([H1, H2], 16), Cs, Pos};
scan_escape([$^, C | Cs], Pos) when C >= $\100, C =< $\137 ->   % \^X -> CTL-X
    {C - $\100, Cs, Pos};
scan_escape([$^, _C | _Cs], _Pos) ->
    {error, illegal_control_character};
scan_escape([C | Cs], Pos) ->
    %% escape_char/1 returns 0 for characters that do not name an escape.
    case escape_char(C) of
        C1 when C1 > 0 -> {C1, Cs, Pos};
        _ -> {error, undefined_escape_sequence}
    end;
scan_escape([], _Pos) ->
    {error, truncated_char}.
305
%% scan_hex(CharList, Pos, RevDigits)
%%  Read the hexadecimal digits of a \x{...} escape up to the closing
%%  brace; CharList is the input after the opening brace. Returns
%%  {Value,RestChars,Pos} if the digits denote a code accepted by
%%  ?UNICODE. Otherwise (no digits, a character that is neither a
%%  hexadecimal digit nor the closing brace, no closing brace at all, or
%%  a value out of range) it returns {error,undefined_escape_sequence}.
scan_hex([$} | Rest], Pos, RevDigits) ->
    try erlang:list_to_integer(lists:reverse(RevDigits), 16) of
        Code when ?UNICODE(Code) ->
            {Code, Rest, Pos};
        _OutOfRange ->
            {error, undefined_escape_sequence}
    catch
        error:_ ->
            {error, undefined_escape_sequence}
    end;
scan_hex([Digit | Rest], Pos, RevDigits) when ?HEX(Digit) ->
    scan_hex(Rest, Pos, [Digit | RevDigits]);
scan_hex(_Chars, _Pos, _RevDigits) ->
    {error, undefined_escape_sequence}.
317
%% escape_char(Char)
%%  Map the character that follows a backslash to the character code it
%%  stands for. Note that 0 is returned for undefined escapes.
escape_char(Char) ->
    case Char of
        $b  -> 8;       % \b = BS
        $d  -> 127;     % \d = DEL
        $e  -> 27;      % \e = ESC
        $f  -> 12;      % \f = FF
        $n  -> 10;      % \n = LF
        $r  -> 13;      % \r = CR
        $s  -> 32;      % \s = SPC
        $t  -> 9;       % \t = HT
        $v  -> 11;      % \v = VT
        $\\ -> 92;      % \\ = \
        $'  -> 39;      % \' = '
        $"  -> 34;      % \" = "
        _   -> 0
    end.
332
%% scan_number(Char, CharList, TokenStack, Pos)
%%  Scan an unsigned number whose first digit, Char, has already been
%%  taken off the input. Together with its helpers this recognises:
%%    <digits>                          - decimal integer
%%    <digits>.<digits>                 - float
%%    <digits>.<digits>E[+-]<digits>    - float with exponent (E or e)
%%  The same forms with a leading sign go through scan_signed_number/5.
%%  Radix notation (<base>#<digits>) is not handled by these functions.
%%
%%  The characters of the number are collected in reverse order and then
%%  converted with list_to_integer/1 or list_to_float/1.
%%
%%  In the helper functions:
%%  SPos == Start position
%%  CPos == Current position

scan_number(Digit, Cs0, Toks, Pos) ->
    {RevNum, Rest, Pos1} = scan_integer(Cs0, [Digit], Pos),
    scan_after_int(Rest, RevNum, Toks, Pos, Pos1).
350
%% scan_signed_number(Sign, Digit, CharList, TokenStack, Pos)
%%  Like scan_number/4, but for a number that starts with the sign
%%  character Sign followed by the digit Digit; both have already been
%%  taken off the input. The sign is kept as part of the number.
scan_signed_number(Sign, Digit, Cs0, Toks, Pos) ->
    {RevNum, Rest, Pos1} = scan_integer(Cs0, [Digit, Sign], Pos),
    scan_after_int(Rest, RevNum, Toks, Pos, Pos1).
354
%% scan_integer(CharList, RevAcc, Pos)
%%  Move the leading decimal digits of CharList onto RevAcc (so they end
%%  up in reverse order). Returns {RevAcc1,RestChars,Pos}; the position
%%  is returned as it was passed in.
scan_integer(Cs, Stack, Pos) ->
    case Cs of
        [Digit | Rest] when Digit >= $0, Digit =< $9 ->
            scan_integer(Rest, [Digit | Stack], Pos);
        _ ->
            {Stack, Cs, Pos}
    end.
359
%% scan_after_int(CharList, RevNumChars, TokenStack, SPos, CPos)
%%  Decide what follows the integer part of a number. A `.' directly
%%  followed by a digit starts a fraction, which is handed on to
%%  scan_after_fraction/5. Otherwise the collected characters are
%%  converted to an {integer,SPos,Value} token and scanning continues.
scan_after_int([$., Digit | Cs0], RevInt, Toks, SPos, CPos)
  when Digit >= $0, Digit =< $9 ->
    {RevNum, Rest, CPos1} = scan_integer(Cs0, [Digit, $. | RevInt], CPos),
    scan_after_fraction(Rest, RevNum, Toks, SPos, CPos1);
scan_after_int(Rest, RevInt, Toks, SPos, CPos) ->
    Tok = {integer, SPos, list_to_integer(lists:reverse(RevInt))},
    scan1(Rest, [Tok | Toks], CPos).
366
%% scan_after_fraction(CharList, RevNumChars, TokenStack, SPos, CPos)
%%  Decide what follows the fraction part of a float. An `E' or `e'
%%  starts an exponent, which is handed on to scan_exponent/5. Otherwise
%%  the collected characters are converted to a {float,SPos,Value} token
%%  and scanning continues; if the conversion fails, the result is an
%%  {illegal,float} scan error at the start position.
scan_after_fraction([E | Cs], Ncs, Toks, SPos, CPos) when E =:= $E; E =:= $e ->
    scan_exponent(Cs, [E | Ncs], Toks, SPos, CPos);
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
    try list_to_float(lists:reverse(Ncs)) of
        Float ->
            scan1(Cs, [{float, SPos, Float} | Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal, float}, SPos)
    end.
377
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Take an optional `+' or `-' sign off the input, push it on the number
%%  character stack, and let scan_exponent1/5 read the exponent digits.
%%  scan_exponent1/5 generates the error if no digits follow.

scan_exponent([Sign | Cs], Ncs, Toks, SPos, CPos) when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Cs, [Sign | Ncs], Toks, SPos, CPos);
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
387
%% scan_exponent1(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Read the digits of an exponent. At least one digit is required; if
%%  there is none, the result is a `float' scan error at the current
%%  position. Otherwise the collected characters are converted to a
%%  {float,StartPos,Value} token and scanning continues; if the
%%  conversion fails, the result is an {illegal,float} scan error at the
%%  start position.
scan_exponent1([Digit | Cs0], Ncs0, Toks, SPos, CPos)
  when Digit >= $0, Digit =< $9 ->
    {RevNum, Rest, CPos1} = scan_integer(Cs0, [Digit | Ncs0], CPos),
    try list_to_float(lists:reverse(RevNum)) of
        Float ->
            scan1(Rest, [{float, SPos, Float} | Toks], CPos1)
    catch
        error:_ ->
            scan_error({illegal, float}, SPos)
    end;
scan_exponent1(_Cs, _Ncs, _Toks, _SPos, CPos) ->
    scan_error(float, CPos).
397
%% scan_error(What, Pos)
%%  Build the error value returned by the scanner: the descriptor What
%%  is wrapped together with the position and the name of this module,
%%  giving {error,{Pos,edoc_scanner,What}}.
scan_error(What, Pos) ->
    ErrorInfo = {Pos, edoc_scanner, What},
    {error, ErrorInfo}.
400