1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2000-2017. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20%% Purpose: Scanner for Core Erlang.
21
22%% For handling ISO 8859-1 (Latin-1) we use the following type
23%% information:
24%%
25%% 000 - 037	NUL - US	control
26%% 040 - 057	SPC - /		punctuation
27%% 060 - 071	0 - 9		digit
28%% 072 - 100	: - @		punctuation
29%% 101 - 132	A - Z		uppercase
30%% 133 - 140	[ - `		punctuation
31%% 141 - 172	a - z		lowercase
32%% 173 - 176	{ - ~		punctuation
33%% 177		DEL		control
34%% 200 - 237			control
35%% 240 - 277	NBSP - ¿	punctuation
36%% 300 - 326	À - Ö		uppercase
37%% 327		×		punctuation
38%% 330 - 336	Ø - Þ		uppercase
39%% 337 - 366	ß - ö		lowercase
40%% 367		÷		punctuation
41%% 370 - 377	ø - ÿ		lowercase
42%%
%% Many characters in the punctuation regions have special meaning.
%% Take care when using × \327, which looks very close to x \170.
45
46-module(core_scan).
47
48-export([string/1, string/2, format_error/1]).
49
50-import(lists, [reverse/1]).
51
52-type location() :: integer().
53-type category() :: atom().
54-type symbol() :: atom() | float() | integer() | string().
55-type token() :: {category(), Anno :: location(), symbol()}
56               | {category(), Anno :: location()}.
57-type tokens() :: [token()].
58-type error_description() :: term().
59-type error_info() :: {erl_anno:location(), module(), error_description()}.
60
%% string(Chars) ->
%% string(Chars, StartPos) ->
%%    {ok,[Token],EndPos} |
%%    {error,{ErrorPos,core_scan,What},EndPos}
%%  Tokenise a complete string of Core Erlang source.  Positions are
%%  line numbers; string/1 starts counting at line 1.

-spec string(String) -> Return when
      String :: string(),
      Return :: {'ok', Tokens :: tokens(), EndLocation}
              | {'error', ErrorInfo :: error_info(), ErrorLocation},
      EndLocation :: location(),
      ErrorLocation :: location().

string(Chars) ->
    string(Chars, 1).
74
-spec string(String, StartLocation) -> Return when
      String :: string(),
      Return :: {'ok', Tokens :: tokens(), EndLocation}
              | {'error', ErrorInfo :: error_info(), ErrorLocation},
      StartLocation :: location(),
      EndLocation :: location(),
      ErrorLocation :: location().

string(Chars, StartPos) ->
    %% First pass: the pre-scanner replaces comments and checks that
    %% quoted items are terminated.  It is handed an 'eof' when the
    %% characters run out, so it always finishes with done or error.
    case string_pre_scan(Chars, [], StartPos) of
        {done,_Rest,RevChars,EndPos} ->
            %% Second pass: the pre-scanned characters were stacked in
            %% reverse order, so restore them before tokenising.
            case scan(lists:reverse(RevChars), StartPos) of
                {ok,Tokens} -> {ok,Tokens,EndPos};
                {error,ErrorInfo} -> {error,ErrorInfo,EndPos}
            end;
        PreScanError ->                         %Error from the pre-scanner
            PreScanError
    end.
93
%% string_pre_scan(Chars, SoFar0, StartPos) ->
%%      {done,Rest,SoFar,EndPos} | {error,E,EndPos}.
%%  Run pre_scan/3 over a complete string.  No further input can be
%%  supplied, so a request for more characters is answered by ending
%%  the remaining characters with 'eof' and scanning again.

string_pre_scan(Chars, SoFar0, StartPos) ->
    case pre_scan(Chars, SoFar0, StartPos) of
        {more,Rest,SoFar,Pos} ->
            %% Rest holds whatever is left to rescan (possibly a
            %% re-entry token); '++' makes 'eof' its tail, which is
            %% the end-of-input marker the pre-scanner matches on.
            string_pre_scan(Rest ++ eof, SoFar, Pos);
        DoneOrError ->
            DoneOrError
    end.
105
%% format_error(ErrorDescriptor) -> Chars.
%%  Turn the descriptor part of an error tuple into readable text.
%%  Descriptors not produced by this module are printed as terms.

-spec format_error(term()) -> iolist().

format_error({string,Quote,Head}) ->
    Kind = string_thing(Quote),
    Start = io_lib:write_string(Head, Quote),
    ["unterminated " ++ Kind ++ " starting with " ++ Start];
format_error({illegal,What}) ->
    io_lib:fwrite("illegal ~w", [What]);
format_error(char) ->
    "unterminated character";
format_error(scan) ->
    "premature end";
format_error({base,Base}) ->
    io_lib:fwrite("illegal base '~w'", [Base]);
format_error(float) ->
    "bad float";
format_error(Other) ->
    io_lib:write(Other).
120
%% string_thing(QuoteChar) -> Name.
%%  Name the kind of item delimited by QuoteChar, for error messages.

string_thing($\') -> "atom";
string_thing($\") -> "string".
123
124%% Re-entrant pre-scanner.
125%%
126%% If the input list of characters is insufficient to build a term the
127%% scanner returns a request for more characters and a continuation to be
128%% used when trying to build a term with more characters. To indicate
129%% end-of-file the input character list should be replaced with 'eof'
130%% as an empty list has meaning.
131%%
%% When more characters are needed inside a comment, string or quoted
133%% atom, which can become rather long, instead of pushing the
134%% characters read so far back onto RestChars to be reread, a special
135%% reentry token is returned indicating the middle of a construct.
136%% The token is the start character as an atom, '%', '"' and '\''.
137
%% pre_scan(Chars, SoFar, StartPos) ->
%%	{done,RestChars,ScannedChars,NewPos} |
%%	{more,RestChars,ScannedChars,NewPos} |
%%	{error,{ErrorPos,core_scan,Description},NewPos}.
%%  Main pre-scan function.  Chars is a list of characters ending in
%%  either [] (input exhausted, ask for more) or 'eof' (end of input).
%%  The work is split between pre_scan/3 and pre_scan/4 for
%%  efficiency; with a good indexing compiler it would be unnecessary.

pre_scan(eof, SoFar, Pos) ->
    {done,eof,SoFar,Pos};
pre_scan([], SoFar, Pos) ->
    {more,[],SoFar,Pos};
pre_scan([C|Cs], SoFar, Pos) ->
    pre_scan(C, Cs, SoFar, Pos).
151
%% pre_scan(Char, RestChars, SoFar, Pos) ->
%%	{done,RestChars,ScannedChars,NewPos} |
%%	{more,RestChars,ScannedChars,NewPos} |
%%	{error,{ErrorPos,core_scan,Description},NewPos}.
%%  Dispatch on the first character, or on a re-entry token left by an
%%  earlier request for more characters.  Scanned characters are
%%  stacked on SoFar; Pos is the current line number.

pre_scan($$, Cs0, SoFar0, Pos) ->
    case pre_char(Cs0, [$$|SoFar0]) of
        {Cs,SoFar} ->
            %% The character constant may itself be a newline.
            %% scan_char/2 counts that as a line, so count it here as
            %% well; otherwise the end position is one line short.
            pre_scan(Cs, SoFar, pre_char_pos(Cs0, Pos));
        more ->
            %% Push the '$' back so the constant is rescanned whole.
            {more,[$$|Cs0],SoFar0,Pos};
        error ->
            pre_error(char, Pos, Pos)
    end;
pre_scan($\', Cs, SoFar, Pos) ->
    pre_string(Cs, $\', '\'', Pos, [$\'|SoFar], Pos);
pre_scan({'\'',Sp}, Cs, SoFar, Pos) ->          %Re-entering quoted atom
    pre_string(Cs, $\', '\'', Sp, SoFar, Pos);
pre_scan($\", Cs, SoFar, Pos) ->
    pre_string(Cs, $\", '"', Pos, [$\"|SoFar], Pos);
pre_scan({'"',Sp}, Cs, SoFar, Pos) ->           %Re-entering string
    pre_string(Cs, $\", '"', Sp, SoFar, Pos);
pre_scan($%, Cs, SoFar, Pos) ->
    pre_comment(Cs, SoFar, Pos);
pre_scan('%', Cs, SoFar, Pos) ->                %Re-entering comment
    pre_comment(Cs, SoFar, Pos);
pre_scan($\n, Cs, SoFar, Pos) ->
    pre_scan(Cs, [$\n|SoFar], Pos+1);
pre_scan(C, Cs, SoFar, Pos) ->
    pre_scan(Cs, [C|SoFar], Pos).

%% pre_char_pos(CharsAfterDollar, Pos) -> NewPos.
%%  Line number after a character constant: one more when the constant
%%  is a newline or a backslash-newline, matching what scan_char/2 and
%%  scan_escape/2 count.

pre_char_pos([$\n|_], Pos) -> Pos + 1;
pre_char_pos([$\\,$\n|_], Pos) -> Pos + 1;
pre_char_pos(_, Pos) -> Pos.
179
%% pre_string(Chars, Quote, Reent, StartPos, SoFar, Pos) ->
%%	{done,RestChars,ScannedChars,NewPos} |
%%	{more,RestChars,ScannedChars,NewPos} |
%%	{error,{ErrorPos,core_scan,Description},NewPos}.
%%  Pre-scan the inside of a string or quoted atom up to the closing
%%  Quote, then continue with pre_scan/3.  Reent is the re-entry atom
%%  returned together with StartPos when more characters are needed.
%%  StartPos is where the item began and is used for error reports.

pre_string([Q|Cs], Q, _, _, SoFar, Pos) ->      %Closing quote
    pre_scan(Cs, [Q|SoFar], Pos);
pre_string([$\n|Cs], Q, Reent, Sp, SoFar, Pos) ->
    pre_string(Cs, Q, Reent, Sp, [$\n|SoFar], Pos+1);
pre_string([$\\|Cs0], Q, Reent, Sp, SoFar0, Pos) ->
    case pre_escape(Cs0, SoFar0) of
        {Cs,SoFar} ->
            %% A backslash-newline is counted as a line by
            %% scan_escape/2, so count it here as well; otherwise the
            %% end position is one line short.
            pre_string(Cs, Q, Reent, Sp, SoFar, pre_escape_pos(Cs0, Pos));
        more ->
            %% Push the backslash back so the escape is rescanned whole.
            {more,[{Reent,Sp},$\\|Cs0],SoFar0,Pos};
        error ->
            pre_string_error(Q, Sp, SoFar0, Pos)
    end;
pre_string([C|Cs], Q, Reent, Sp, SoFar, Pos) ->
    pre_string(Cs, Q, Reent, Sp, [C|SoFar], Pos);
pre_string([], _, Reent, Sp, SoFar, Pos) ->
    {more,[{Reent,Sp}],SoFar,Pos};
pre_string(eof, Q, _, Sp, SoFar, Pos) ->
    pre_string_error(Q, Sp, SoFar, Pos).

%% pre_escape_pos(CharsAfterBackslash, Pos) -> NewPos.
%%  Line number after an escape sequence: one more when the escaped
%%  character is a newline.

pre_escape_pos([$\n|_], Pos) -> Pos + 1;
pre_escape_pos(_, Pos) -> Pos.
201
%% pre_string_error(Quote, StartPos, SoFar, Pos) -> {error,ErrorInfo,Pos}.
%%  Build the error for an unterminated string or quoted atom.  The
%%  descriptor carries at most the first 16 characters of the item.

pre_string_error(Quote, StartPos, SoFar, Pos) ->
    %% SoFar is in reverse order, so the part before the first Quote
    %% found in it is the most recently scanned text.
    [RevBody,_] = string:split(SoFar, [Quote]),
    Head = string:slice(string:reverse(RevBody), 0, 16),
    pre_error({string,Quote,Head}, StartPos, Pos).
205
%% pre_char(Chars, SoFar) -> {RestChars,SoFar1} | more | error.
%%  Pre-scan the character following a '$'.  Returns 'more' when the
%%  input is exhausted and 'error' when end of input is reached.

pre_char(eof, _SoFar) ->
    error;
pre_char([], _SoFar) ->
    more;
pre_char([C|Cs], SoFar) ->
    pre_char(C, Cs, SoFar).
209
%% pre_char(Char, RestChars, SoFar) -> {RestChars1,SoFar1} | more | error.
%%  A backslash starts an escape sequence; any other character is the
%%  constant itself and is stacked on SoFar.

pre_char(C, Cs, SoFar) when C =:= $\\ ->
    pre_escape(Cs, SoFar);
pre_char(C, Cs, SoFar) ->
    {Cs,[C|SoFar]}.
214
%% pre_escape(CharsAfterBackslash, SoFar) ->
%%	{RestChars,SoFar1} | more | error.
%%  Pre-scan an escape sequence whose backslash has been consumed.
%%  The backslash and the escaped character ("^" plus one character
%%  for a control escape) are stacked on SoFar.  Returns 'more' when
%%  the input is exhausted and 'error' at end of input.

pre_escape(eof, _SoFar) ->
    error;
pre_escape([], _SoFar) ->
    more;
pre_escape([$^|AfterCaret], SoFar) ->
    case AfterCaret of
        eof -> error;
        [] -> more;
        [Ctl|Rest] -> {Rest,[Ctl,$^,$\\|SoFar]}
    end;
pre_escape([C|Rest], SoFar) ->
    {Rest,[C,$\\|SoFar]}.
226
%% pre_comment(Chars, SoFar, Pos)
%%  Skip to the end of the line.  The whole comment is replaced by a
%%  single SPACE; the terminating newline is kept.  When the input
%%  runs out the re-entry token '%' is returned.

pre_comment(eof, SoFar, Pos) ->
    pre_scan(eof, [$\s|SoFar], Pos);
pre_comment([], SoFar, Pos) ->
    {more,['%'],SoFar,Pos};
pre_comment([$\n|Cs], SoFar, Pos) ->            %End of comment
    pre_scan(Cs, [$\n,$\s|SoFar], Pos + 1);
pre_comment([_Skipped|Cs], SoFar, Pos) ->
    pre_comment(Cs, SoFar, Pos).
238
%% pre_error(Descriptor, ErrorPos, Pos) -> {error,ErrorInfo,Pos}.
%%  Build the pre-scanner error return.  ErrorPos is where the
%%  offending item started, Pos is where scanning stopped.

pre_error(Descriptor, ErrorPos, Pos) ->
    ErrorInfo = {ErrorPos,core_scan,Descriptor},
    {error,ErrorInfo,Pos}.
241
%% scan(CharList, StartPos)
%%  Tokenise a list of pre-scanned characters.
%%
%%  Tokens are pushed on a stack as they are recognised, which avoids
%%  appending, and the stack is reversed once all characters have
%%  been consumed.  Most tokens are built in the same way.
%%
%%  Returns:
%%	{ok,[Tok]}
%%	{error,{ErrorPos,core_scan,What}}

scan(Chars, StartPos) ->
    scan1(Chars, [], StartPos).
255
%% scan1(Characters, TokenStack, Position) ->
%%	{ok,[Tok]} | {error,{ErrorPos,core_scan,What}}.
%%  Scan a list of characters into tokens.  The clause order matters:
%%  specific characters and multi-character operators must be tried
%%  before the final single-character punctuation clause.

scan1([], RevToks, _Pos) ->                     %All characters consumed
    {ok,lists:reverse(RevToks)};
scan1([$\n|Cs], Toks, Pos) ->                   %Skip newline, next line
    scan1(Cs, Toks, Pos + 1);
scan1([C|Cs], Toks, Pos)
  when C >= $\000, C =< $\s;                    %Skip control chars
       C >= $\200, C =< $\240 ->
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos)
  when C >= $a, C =< $z;                        %Keywords
       C >= $ß, C =< $ÿ, C /= $÷ ->
    scan_key_word(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos)
  when C >= $A, C =< $Z;                        %Variables
       C >= $À, C =< $Þ, C /= $× ->
    scan_variable(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       %Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([Sign,C|Cs], Toks, Pos)
  when Sign =:= $-, C >= $0, C =< $9;           %Signed numbers
       Sign =:= $+, C >= $0, C =< $9 ->
    scan_signed_number(Sign, C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                    %_ variables
    scan_variable($_, Cs, Toks, Pos);
scan1([$$|Cs0], Toks, Pos) ->                   %Character constant
    {Char,Cs,Pos1} = scan_char(Cs0, Pos),
    scan1(Cs, [{char,Pos,Char}|Toks], Pos1);
scan1([$\'|Cs0], Toks, Pos) ->                  %Atom (always quoted)
    {Name,Cs,Pos1} = scan_string(Cs0, $\', Pos),
    %% The scanned characters are taken as the UTF-8 encoded bytes
    %% of the atom name; anything that cannot be converted is an
    %% illegal atom.
    try binary_to_atom(list_to_binary(Name), utf8) of
        Atom when is_atom(Atom) ->
            scan1(Cs, [{atom,Pos,Atom}|Toks], Pos1)
    catch
        error:_ ->
            scan_error({illegal,atom}, Pos)
    end;
scan1([$\"|Cs0], Toks, Pos) ->                  %String
    {Str,Cs,Pos1} = scan_string(Cs0, $\", Pos),
    scan1(Cs, [{string,Pos,Str}|Toks], Pos1);
%% Punctuation characters and operators, first recognise multiples.
scan1("->" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1("-|" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'-|',Pos}|Toks], Pos);
scan1(":=" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{':=',Pos}|Toks], Pos);
scan1("=>" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'=>',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                     %Punctuation character
    scan1(Cs, [{list_to_atom([C]),Pos}|Toks], Pos).
311
312%% scan_key_word(FirstChar, CharList, Tokens, Pos)
313%% scan_variable(FirstChar, CharList, Tokens, Pos)
314
%%  Scan the rest of an unquoted name starting with C and push it as
%%  a {Name,Pos} token.  A name that cannot be made into an atom is
%%  reported as {illegal,atom}.

scan_key_word(C, Cs0, Toks, Pos) ->
    {Wcs,Cs} = scan_name(Cs0, []),
    %% Only the conversion is protected; the continued scan in the
    %% 'of' section stays a tail call and its errors are not masked.
    try list_to_atom([C|lists:reverse(Wcs)]) of
        Name ->
            scan1(Cs, [{Name,Pos}|Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,atom}, Pos)
    end.
322
%%  Scan the rest of a variable name starting with C and push it as a
%%  {var,Pos,Name} token.  A name that cannot be made into an atom is
%%  reported as {illegal,var}.

scan_variable(C, Cs0, Toks, Pos) ->
    {Wcs,Cs} = scan_name(Cs0, []),
    %% Only the conversion is protected; the continued scan in the
    %% 'of' section stays a tail call and its errors are not masked.
    try list_to_atom([C|lists:reverse(Wcs)]) of
        Name ->
            scan1(Cs, [{var,Pos,Name}|Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,var}, Pos)
    end.
330
%% scan_name(Chars, NameStack) -> {NameStack1,RestChars}.
%%  Stack the leading name characters of Chars on NameStack, so the
%%  name comes out in reverse order.  Apart from that this is
%%  lists:splitwith(fun (C) -> name_char(C) end, Chars).

scan_name([C|Cs]=Chars, Acc) ->
    case name_char(C) of
        true -> scan_name(Cs, [C|Acc]);
        false -> {Acc,Chars}                    %First non-name character
    end;
scan_name([], Acc) ->
    {Acc,[]}.
340
%% name_char(Char) -> boolean().
%%  True for the characters allowed inside an unquoted name: digits,
%%  ASCII and Latin-1 letters, "_" and "@".  The Latin-1 letter ranges
%%  exclude the punctuation characters × and ÷.

name_char($_) -> true;
name_char($@) -> true;
name_char(C) when C >= $0, C =< $9 -> true;
name_char(C) when C >= $A, C =< $Z -> true;
name_char(C) when C >= $a, C =< $z -> true;
name_char(C) when C >= $À, C =< $Þ, C /= $× -> true;
name_char(C) when C >= $ß, C =< $ÿ, C /= $÷ -> true;
name_char(_) -> false.
349
%% scan_string(CharList, QuoteChar, Pos) -> {StringChars,RestChars,NewPos}.
%%  Read the characters up to the closing QuoteChar.  The opening
%%  quote has already been consumed by the caller.

scan_string(Chars, Quote, Pos) ->
    scan_string(Chars, [], Quote, Pos).
354
%% scan_string(CharList, CharStack, QuoteChar, Pos) ->
%%	{StringChars,RestChars,NewPos}.
%%  Collect characters on CharStack until QuoteChar is found.  Escape
%%  sequences are translated and newlines advance the position.
%%  There is no clause for the end of the list; the pre-scan phase
%%  reports unterminated items.

scan_string([C|Cs], Acc, Quote, Pos) when C =:= Quote ->
    {lists:reverse(Acc),Cs,Pos};
scan_string([$\n|Cs], Acc, Quote, Pos) ->
    scan_string(Cs, [$\n|Acc], Quote, Pos + 1);
scan_string([$\\|Cs0], Acc, Quote, Pos0) ->
    {C,Cs,Pos} = scan_escape(Cs0, Pos0),
    scan_string(Cs, [C|Acc], Quote, Pos);
scan_string([C|Cs], Acc, Quote, Pos) ->
    scan_string(Cs, [C|Acc], Quote, Pos).
364
%% scan_char(Chars, Pos) -> {Char,RestChars,NewPos}.
%%  Read the single character of a character constant, translating
%%  an escape sequence if there is one.  The pre-scan phase has
%%  checked for errors here.

scan_char([C|Cs], Pos) ->
    case C of
        $\\ -> scan_escape(Cs, Pos);
        $\n -> {$\n,Cs,Pos + 1};                %Newline
        _ -> {C,Cs,Pos}
    end.
375
%% scan_escape(CharsAfterBackslash, Pos) -> {Char,RestChars,NewPos}.
%%  Translate an escape sequence whose backslash has been consumed.
%%  The longest octal form is tried first, so the clause order matters.

scan_escape([D1,D2,D3|Cs], Pos)                 %\<1-3> octal digits
  when D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7, D3 >= $0, D3 =< $7 ->
    {((D1 - $0) * 8 + (D2 - $0)) * 8 + (D3 - $0),Cs,Pos};
scan_escape([D1,D2|Cs], Pos)
  when D1 >= $0, D1 =< $7, D2 >= $0, D2 =< $7 ->
    {(D1 - $0) * 8 + (D2 - $0),Cs,Pos};
scan_escape([D1|Cs], Pos)
  when D1 >= $0, D1 =< $7 ->
    {D1 - $0,Cs,Pos};
scan_escape([$^,C|Cs], Pos) ->                  %\^X -> CTL-X
    {C band 31,Cs,Pos};
scan_escape([$\n|Cs], Pos) ->                   %Escaped newline
    {$\n,Cs,Pos + 1};
scan_escape([C|Cs], Pos) ->
    {escape_char(C),Cs,Pos}.
399
%% escape_char(Char) -> Char1.
%%  Translate the character after a backslash.  Characters without a
%%  special meaning stand for themselves.

escape_char(C) ->
    case C of
        $n -> $\n;                              %\n = LF
        $r -> $\r;                              %\r = CR
        $t -> $\t;                              %\t = TAB
        $v -> $\v;                              %\v = VT
        $b -> $\b;                              %\b = BS
        $f -> $\f;                              %\f = FF
        $e -> $\e;                              %\e = ESC
        $s -> $\s;                              %\s = SPC
        $d -> $\d;                              %\d = DEL
        _ -> C
    end.
410
%% scan_number(Char, CharList, TokenStack, Pos)
%%  Scan a number starting with the digit Char.  The forms handled are:
%%    <base>#<digits>		- the digits read in that base (2..16)
%%    <digits>			- the digits in base 10
%%    <digits>.<digits>
%%    <digits>.<digits>E+-<digits>
%%
%%  Except for explicitly based integers, all the characters are
%%  collected in a list and list_to_integer/1 or list_to_float/1 is
%%  used to generate the value.

%%  In the number scanning functions:
%%  SPos == Start position
%%  CPos == Current position

scan_number(C, Cs0, Toks, SPos) ->
    {Digits,Cs,CPos} = scan_integer(Cs0, [C], SPos),
    scan_after_int(Cs, Digits, Toks, SPos, CPos).
428
%% scan_signed_number(Sign, Char, CharList, TokenStack, Pos)
%%  As scan_number/4, but the number starts with the sign character
%%  Sign followed by the digit Char.

scan_signed_number(Sign, C, Cs0, Toks, SPos) ->
    {Digits,Cs,CPos} = scan_integer(Cs0, [C,Sign], SPos),
    scan_after_int(Cs, Digits, Toks, SPos, CPos).
432
%% scan_integer(CharList, CharStack, Pos) -> {CharStack1,RestChars,Pos}.
%%  Push the leading decimal digits of CharList on CharStack.  The
%%  position is passed through unchanged.

scan_integer([D|Cs], Stack, Pos) when $0 =< D, D =< $9 ->
    scan_integer(Cs, [D|Stack], Pos);
scan_integer(Rest, Stack, Pos) ->
    {Stack,Rest,Pos}.
437
%% scan_after_int(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Decide what the digits scanned so far are: the integer part of a
%%  float (a "." and a digit follow), the base of a based integer
%%  (a "#" follows) or a plain decimal integer.

scan_after_int([$.,C|Cs0], Ncs0, Toks, SPos, CPos0) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos} = scan_integer(Cs0, [C,$.|Ncs0], CPos0),
    scan_after_fraction(Cs, Ncs, Toks, SPos, CPos);
scan_after_int([$#|Cs], Ncs, Toks, SPos, CPos) ->
    Base = list_to_integer(lists:reverse(Ncs)),
    if
        Base >= 2, Base =< 16 ->
            scan_based_int(Cs, 0, Base, Toks, SPos, CPos);
        true ->
            scan_error({base,Base}, CPos)
    end;
scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
    Value = list_to_integer(lists:reverse(Ncs)),
    scan1(Cs, [{integer,SPos,Value}|Toks], CPos).
451
%% scan_based_int(CharList, ValueSoFar, Base, TokenStack, StartPos, CurPos)
%%  Accumulate the digits of a based integer.  Digits are 0-9 and
%%  a-f or A-F, and each must be less than Base.  The first character
%%  that is not such a digit ends the number.

scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos)
  when C >= $0, C =< $9, C - $0 < Base ->
    scan_based_int(Cs, SoFar * Base + (C - $0), Base, Toks, SPos, CPos);
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos)
  when C >= $a, C =< $f, C - $a + 10 < Base ->
    scan_based_int(Cs, SoFar * Base + (C - $a + 10), Base, Toks, SPos, CPos);
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos)
  when C >= $A, C =< $F, C - $A + 10 < Base ->
    scan_based_int(Cs, SoFar * Base + (C - $A + 10), Base, Toks, SPos, CPos);
scan_based_int(Cs, Value, _Base, Toks, SPos, CPos) ->
    scan1(Cs, [{integer,SPos,Value}|Toks], CPos).
466
%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  The fraction digits have been read.  Either an exponent follows,
%%  or the float is complete and is converted.  The exponent marker
%%  is always stacked as an upper-case "E".

scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
    %% Only the conversion is protected; the continued scan in the
    %% 'of' section stays a tail call and its errors are not masked.
    try list_to_float(lists:reverse(Ncs)) of
        N ->
            scan1(Cs, [{float,SPos,N}|Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end.
477
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Stack an optional exponent sign, then scan the exponent digits.
%%  An error is generated by scan_exponent1/5 if E{+|-} is not
%%  followed by any digits.

scan_exponent([Sign|Cs], Ncs, Toks, SPos, CPos) when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Cs, [Sign|Ncs], Toks, SPos, CPos);
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
487
%% scan_exponent1(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Scan the exponent digits and convert the complete float.  At
%%  least one digit is required, otherwise the error 'float' is
%%  returned.  A float that cannot be converted is {illegal,float}.

scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
    %% Only the conversion is protected; the continued scan in the
    %% 'of' section stays a tail call and its errors are not masked.
    try list_to_float(lists:reverse(Ncs)) of
        N ->
            scan1(Cs, [{float,SPos,N}|Toks], CPos1)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end;
scan_exponent1(_, _, _, _, CPos) ->
    scan_error(float, CPos).
497
%% scan_error(Descriptor, Pos) -> {error,{Pos,core_scan,Descriptor}}.
%%  Build the error return of the token scanner.

scan_error(Descriptor, Pos) ->
    ErrorInfo = {Pos,core_scan,Descriptor},
    {error,ErrorInfo}.
500