compiler/src/core_scan.erl

%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2000-2017. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
%% Purpose: Scanner for Core Erlang.

%% For handling ISO 8859-1 (Latin-1) we use the following type
%% information:
%%
%% 000 - 037	NUL - US	control
%% 040 - 057	SPC - /		punctuation
%% 060 - 071	0 - 9		digit
%% 072 - 100	: - @		punctuation
%% 101 - 132	A - Z		uppercase
%% 133 - 140	[ - `		punctuation
%% 141 - 172	a - z		lowercase
%% 173 - 176	{ - ~		punctuation
%% 177		DEL		control
%% 200 - 237			control
%% 240 - 277	NBSP - ¿	punctuation
%% 300 - 326	À - Ö		uppercase
%% 327		×		punctuation
%% 330 - 336	Ø - Þ		uppercase
%% 337 - 366	ß - ö		lowercase
%% 367		÷		punctuation
%% 370 - 377	ø - ÿ		lowercase
%%
%% Many punctuation characters region have special meaning.  Must
%% watch using × \327, bvery close to x \170

-module(core_scan).

-export([string/1, string/2, format_error/1]).

-import(lists, [reverse/1]).

-type location() :: integer().
-type category() :: atom().
-type symbol() :: atom() | float() | integer() | string().
-type token() :: {category(), Anno :: location(), symbol()}
               | {category(), Anno :: location()}.
-type tokens() :: [token()].
-type error_description() :: term().
-type error_info() :: {erl_anno:location(), module(), error_description()}.

%% string([Char]) ->
%% string([Char], StartPos) ->
%%    {ok, [Tok], EndPos} |
%%    {error, {Pos,core_scan,What}, EndPos}

-spec string(String) -> Return when
      String :: string(),
      Return :: {'ok', Tokens :: tokens(), EndLocation}
              | {'error', ErrorInfo :: error_info(), ErrorLocation},
      EndLocation :: location(),
      ErrorLocation :: location().

string(Cs) -> string(Cs, 1).

-spec string(String, StartLocation) -> Return when
      String :: string(),
      Return :: {'ok', Tokens :: tokens(), EndLocation}
              | {'error', ErrorInfo :: error_info(), ErrorLocation},
      StartLocation :: location(),
      EndLocation :: location(),
      ErrorLocation :: location().

string(Cs, Sp) ->
    %% Add an 'eof' to always get correct handling.
    case string_pre_scan(Cs, [], Sp) of
	{done,_,SoFar,Ep} ->			%Got tokens
	    case scan(reverse(SoFar), Sp) of
		{ok,Toks} -> {ok,Toks,Ep};
		{error,E} -> {error,E,Ep}
	    end;
	Other -> Other				%An error has occurred
    end.

%% string_pre_scan(Cs, SoFar0, StartPos) ->
%%      {done,Rest,SoFar,EndPos} | {error,E,EndPos}.

string_pre_scan(Cs, SoFar0, Sp) ->
    case pre_scan(Cs, SoFar0, Sp) of
	{done,Rest,SoFar1,Ep} ->		%Got complete tokens
	    {done,Rest,SoFar1,Ep};
	{more,Rest,SoFar1,Ep} ->		%Missing end token
	    string_pre_scan(Rest ++ eof, SoFar1, Ep);
	Other -> Other				%An error has occurred
    end.

%% format_error(Error)
%%  Return a string describing the error.

-spec format_error(term()) -> iolist().

format_error({string,Quote,Head}) ->
    ["unterminated " ++ string_thing(Quote) ++
     " starting with " ++ io_lib:write_string(Head,Quote)];
format_error({illegal,Type}) -> io_lib:fwrite("illegal ~w", [Type]);
format_error(char) -> "unterminated character";
format_error(scan) -> "premature end";
format_error({base,Base}) -> io_lib:fwrite("illegal base '~w'", [Base]);
format_error(float) -> "bad float";
format_error(Other) -> io_lib:write(Other).

string_thing($') -> "atom";    %' stupid emacs
string_thing($") -> "string".  %" stupid emacs

%% Re-entrant pre-scanner.
%%
%% If the input list of characters is insufficient to build a term the
%% scanner returns a request for more characters and a continuation to be
%% used when trying to build a term with more characters. To indicate
%% end-of-file the input character list should be replaced with 'eof'
%% as an empty list has meaning.
%%
%% When more characters are need inside a comment, string or quoted
%% atom, which can become rather long, instead of pushing the
%% characters read so far back onto RestChars to be reread, a special
%% reentry token is returned indicating the middle of a construct.
%% The token is the start character as an atom, '%', '"' and '\''.

%% pre_scan([Char], SoFar, StartPos) ->
%%	{done,RestChars,ScannedChars,NewPos} |
%%	{more,RestChars,ScannedChars,NewPos} |
%%	{error,{ErrorPos,core_scan,Description},NewPos}.
%%  Main pre-scan function. It has been split into 2 functions because of
%%  efficiency, with a good indexing compiler it would be unnecessary.

pre_scan([C|Cs], SoFar, Pos) ->
    pre_scan(C, Cs, SoFar, Pos);
pre_scan([], SoFar, Pos) ->
    {more,[],SoFar,Pos};
pre_scan(eof, SoFar, Pos) ->
    {done,eof,SoFar,Pos}.

%% pre_scan(Char, [Char], SoFar, Pos)

pre_scan($$, Cs0, SoFar0, Pos) ->
    case pre_char(Cs0, [$$|SoFar0]) of
	{Cs,SoFar} ->
	    pre_scan(Cs, SoFar, Pos);
	more ->
	    {more,[$$|Cs0],SoFar0, Pos};
	error ->
	    pre_error(char, Pos, Pos)
    end;
pre_scan($', Cs, SoFar, Pos) ->
    pre_string(Cs, $', '\'', Pos, [$'|SoFar], Pos);
pre_scan({'\'',Sp}, Cs, SoFar, Pos) ->		%Re-entering quoted atom
    pre_string(Cs, $', '\'', Sp, SoFar, Pos);
pre_scan($", Cs, SoFar, Pos) ->
    pre_string(Cs, $", '"', Pos, [$"|SoFar], Pos);
pre_scan({'"',Sp}, Cs, SoFar, Pos) ->		%Re-entering string
    pre_string(Cs, $", '"', Sp, SoFar, Pos);
pre_scan($%, Cs, SoFar, Pos) ->
    pre_comment(Cs, SoFar, Pos);
pre_scan('%', Cs, SoFar, Pos) ->		%Re-entering comment
    pre_comment(Cs, SoFar, Pos);
pre_scan($\n, Cs, SoFar, Pos) ->
    pre_scan(Cs, [$\n|SoFar], Pos+1);
pre_scan(C, Cs, SoFar, Pos) ->
    pre_scan(Cs, [C|SoFar], Pos).

%% pre_string([Char], Quote, Reent, StartPos, SoFar, Pos)

pre_string([Q|Cs], Q, _, _, SoFar, Pos) ->
    pre_scan(Cs, [Q|SoFar], Pos);
pre_string([$\n|Cs], Q, Reent, Sp, SoFar, Pos) ->
    pre_string(Cs, Q, Reent, Sp, [$\n|SoFar], Pos+1);
pre_string([$\\|Cs0], Q, Reent, Sp, SoFar0, Pos) ->
    case pre_escape(Cs0, SoFar0) of
	{Cs,SoFar} ->
	    pre_string(Cs, Q, Reent, Sp, SoFar, Pos);
	more ->
	    {more,[{Reent,Sp},$\\|Cs0],SoFar0,Pos};
	error ->
	    pre_string_error(Q, Sp, SoFar0, Pos)
    end;
pre_string([C|Cs], Q, Reent, Sp, SoFar, Pos) ->
    pre_string(Cs, Q, Reent, Sp, [C|SoFar], Pos);
pre_string([], _, Reent, Sp, SoFar, Pos) ->
    {more,[{Reent,Sp}],SoFar,Pos};
pre_string(eof, Q, _, Sp, SoFar, Pos) ->
    pre_string_error(Q, Sp, SoFar, Pos).

pre_string_error(Q, Sp, SoFar, Pos) ->
    [S,_] = string:split(SoFar, [Q]),
    pre_error({string,Q,string:slice(string:reverse(S), 0, 16)}, Sp, Pos).

pre_char([C|Cs], SoFar) -> pre_char(C, Cs, SoFar);
pre_char([], _) -> more;
pre_char(eof, _) -> error.

pre_char($\\, Cs, SoFar) ->
    pre_escape(Cs, SoFar);
pre_char(C, Cs, SoFar) ->
    {Cs,[C|SoFar]}.

pre_escape([$^|Cs0], SoFar) ->
    case Cs0 of
	[C3|Cs] ->
	    {Cs,[C3,$^,$\\|SoFar]};
	[] -> more;
	eof -> error
    end;
pre_escape([C|Cs], SoFar) ->
    {Cs,[C,$\\|SoFar]};
pre_escape([], _) -> more;
pre_escape(eof, _) -> error.

%% pre_comment([Char], SoFar, Pos)
%%  Comments are replaced by one SPACE.

pre_comment([$\n|Cs], SoFar, Pos) ->
    pre_scan(Cs, [$\n,$\s|SoFar], Pos+1);	%Terminate comment
pre_comment([_|Cs], SoFar, Pos) ->
    pre_comment(Cs, SoFar, Pos);
pre_comment([], SoFar, Pos) ->
    {more,['%'],SoFar,Pos};
pre_comment(eof, Sofar, Pos) ->
    pre_scan(eof, [$\s|Sofar], Pos).

pre_error(E, Epos, Pos) ->
    {error,{Epos,core_scan,E}, Pos}.

%% scan(CharList, StartPos)
%%  This takes a list of characters and tries to tokenise them.
%%
%%  The token list is built in reverse order (in a stack) to save appending
%%  and then reversed when all the tokens have been collected. Most tokens
%%  are built in the same way.
%%
%%  Returns:
%%	{ok,[Tok]}
%%	{error,{ErrorPos,core_scan,What}}

scan(Cs, Pos) ->
    scan1(Cs, [], Pos).

%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.

scan1([$\n|Cs], Toks, Pos) ->            	        %Skip newline
    scan1(Cs, Toks, Pos+1);
scan1([C|Cs], Toks, Pos) when C >= $\000, C =< $\s -> 	%Skip control chars
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\200, C =< $\240 ->
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->	%Keywords
    scan_key_word(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $ß, C =< $ÿ, C /= $÷ ->
    scan_key_word(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->	%Variables
    scan_variable(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $À, C =< $Þ, C /= $× ->
    scan_variable(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->	%Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->	%Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->	%Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->				%_ variables
    scan_variable($_, Cs, Toks, Pos);
scan1([$$|Cs0], Toks, Pos) ->				%Character constant
    {C,Cs,Pos1} = scan_char(Cs0, Pos),
    scan1(Cs, [{char,Pos,C}|Toks], Pos1);
scan1([$'|Cs0], Toks, Pos) ->				%Atom (always quoted)
    {S,Cs1,Pos1} = scan_string(Cs0, $', Pos),
    try binary_to_atom(list_to_binary(S), utf8) of
	A when is_atom(A) ->
	    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
    catch
        error:_ ->
            scan_error({illegal,atom}, Pos)
    end;
scan1([$"|Cs0], Toks, Pos) ->				%String
    {S,Cs1,Pos1} = scan_string(Cs0, $", Pos),
    scan1(Cs1, [{string,Pos,S}|Toks], Pos1);
%% Punctuation characters and operators, first recognise multiples.
scan1("->" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1("-|" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'-|',Pos}|Toks], Pos);
scan1(":=" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{':=',Pos}|Toks], Pos);
scan1("=>" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'=>',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->				%Punctuation character
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _) ->
    Toks = reverse(Toks0),
    {ok,Toks}.

%% scan_key_word(FirstChar, CharList, Tokens, Pos)
%% scan_variable(FirstChar, CharList, Tokens, Pos)

scan_key_word(C, Cs0, Toks, Pos) ->
    {Wcs,Cs} = scan_name(Cs0, []),
    case catch list_to_atom([C|reverse(Wcs)]) of
	Name when is_atom(Name) ->
	    scan1(Cs, [{Name,Pos}|Toks], Pos);
	_Error -> scan_error({illegal,atom}, Pos)
    end.

scan_variable(C, Cs0, Toks, Pos) ->
    {Wcs,Cs} = scan_name(Cs0, []),
    case catch list_to_atom([C|reverse(Wcs)]) of
	Name when is_atom(Name) ->
	    scan1(Cs, [{var,Pos,Name}|Toks], Pos);
	_Error -> scan_error({illegal,var}, Pos)
    end.

%% scan_name(Cs) -> lists:splitwith(fun (C) -> name_char(C) end, Cs).

scan_name([C|Cs], Ncs) ->
    case name_char(C) of
	true -> scan_name(Cs, [C|Ncs]);
	false -> {Ncs,[C|Cs]}			%Must rebuild here, sigh!
    end;
scan_name([], Ncs) ->
    {Ncs,[]}.

name_char(C) when C >= $a, C =< $z -> true;
name_char(C) when C >= $ß, C =< $ÿ, C /= $÷ -> true;
name_char(C) when C >= $A, C =< $Z -> true;
name_char(C) when C >= $À, C =< $Þ, C /= $× -> true;
name_char(C) when C >= $0, C =< $9 -> true;
name_char($_) -> true;
name_char($@) -> true;
name_char(_) -> false.

%% scan_string(CharList, QuoteChar, Pos) -> {StringChars,RestChars,NewPos}.

scan_string(Cs, Q, Pos) ->
    scan_string(Cs, [], Q, Pos).

scan_string([Q|Cs], Scs, Q, Pos) ->
    {reverse(Scs),Cs,Pos};
scan_string([$\n|Cs], Scs, Q, Pos) ->
    scan_string(Cs, [$\n|Scs], Q, Pos+1);
scan_string([$\\|Cs0], Scs, Q, Pos) ->
    {C,Cs,Pos1} = scan_escape(Cs0, Pos),
    scan_string(Cs, [C|Scs], Q, Pos1);
scan_string([C|Cs], Scs, Q, Pos) ->
    scan_string(Cs, [C|Scs], Q, Pos).

%% scan_char(Chars, Pos) -> {Char,RestChars,NewPos}.
%%  Read a single character from a character constant. The pre-scan
%%  phase has checked for errors here.

scan_char([$\\|Cs], Pos) ->
    scan_escape(Cs, Pos);
scan_char([$\n|Cs], Pos) ->                  %Newline
    {$\n,Cs,Pos+1};
scan_char([C|Cs], Pos) ->
    {C,Cs,Pos}.

scan_escape([O1,O2,O3|Cs], Pos) when            %\<1-3> octal digits
    O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
    Val = (O1*8 + O2)*8 + O3 - 73*$0,
    {Val,Cs,Pos};
scan_escape([O1,O2|Cs], Pos) when
    O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7 ->
    Val = (O1*8 + O2) - 9*$0,
    {Val,Cs,Pos};
scan_escape([O1|Cs], Pos) when
    O1 >= $0, O1 =< $7 ->
    {O1 - $0,Cs,Pos};
scan_escape([$^,C|Cs], Pos) ->			%\^X -> CTL-X
    Val = C band 31,
    {Val,Cs,Pos};
%scan_escape([$\n,C1|Cs],Pos) ->
%    {C1,Cs,Pos+1};
%scan_escape([C,C1|Cs],Pos) when C >= $\000, C =< $\s ->
%    {C1,Cs,Pos};
scan_escape([$\n|Cs],Pos) ->
    {$\n,Cs,Pos+1};
scan_escape([C0|Cs],Pos) ->
    C = escape_char(C0),
    {C,Cs,Pos}.

escape_char($n) -> $\n;				%\n = LF
escape_char($r) -> $\r;				%\r = CR
escape_char($t) -> $\t;				%\t = TAB
escape_char($v) -> $\v;				%\v = VT
escape_char($b) -> $\b;				%\b = BS
escape_char($f) -> $\f;				%\f = FF
escape_char($e) -> $\e;				%\e = ESC
escape_char($s) -> $\s;				%\s = SPC
escape_char($d) -> $\d;				%\d = DEL
escape_char(C) -> C.

%% scan_number(Char, CharList, TokenStack, Pos)
%%  We can handle simple radix notation:
%%    <digit>#<digits>		- the digits read in that base
%%    <digits>			- the digits in base 10
%%    <digits>.<digits>
%%    <digits>.<digits>E+-<digits>
%%
%%  Except for explicitly based integers we build a list of all the
%%  characters and then use list_to_integer/1 or list_to_float/1 to
%%  generate the value.

%%  SPos == Start position
%%  CPos == Current position

scan_number(C, Cs0, Toks, Pos) ->
    {Ncs,Cs,Pos1} = scan_integer(Cs0, [C], Pos),
    scan_after_int(Cs, Ncs, Toks, Pos, Pos1).

scan_signed_number(S, C, Cs0, Toks, Pos) ->
    {Ncs,Cs,Pos1} = scan_integer(Cs0, [C,S], Pos),
    scan_after_int(Cs, Ncs, Toks, Pos, Pos1).

scan_integer([C|Cs], Stack, Pos) when C >= $0, C =< $9 ->
    scan_integer(Cs, [C|Stack], Pos);
scan_integer(Cs, Stack, Pos) ->
    {Stack,Cs,Pos}.

scan_after_int([$.,C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C,$.|Ncs0], CPos),
    scan_after_fraction(Cs, Ncs, Toks, SPos, CPos1);
scan_after_int([$#|Cs], Ncs, Toks, SPos, CPos) ->
    case list_to_integer(reverse(Ncs)) of
	Base when Base >= 2, Base =< 16 ->
	    scan_based_int(Cs, 0, Base, Toks, SPos, CPos);
	Base ->
	    scan_error({base,Base}, CPos)
    end;
scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
    N = list_to_integer(reverse(Ncs)),
    scan1(Cs, [{integer,SPos,N}|Toks], CPos).

scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos) when
    C >= $0, C =< $9, C < Base + $0 ->
    Next = SoFar * Base + (C - $0),
    scan_based_int(Cs, Next, Base, Toks, SPos, CPos);
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos) when
    C >= $a, C =< $f, C < Base + $a - 10 ->
    Next = SoFar * Base + (C - $a + 10),
    scan_based_int(Cs, Next, Base, Toks, SPos, CPos);
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos) when
    C >= $A, C =< $F, C < Base + $A - 10 ->
    Next = SoFar * Base + (C - $A + 10),
    scan_based_int(Cs, Next, Base, Toks, SPos, CPos);
scan_based_int(Cs, SoFar, _, Toks, SPos, CPos) ->
    scan1(Cs, [{integer,SPos,SoFar}|Toks], CPos).

scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
    case catch list_to_float(reverse(Ncs)) of
	N when is_float(N) ->
	    scan1(Cs, [{float,SPos,N}|Toks], CPos);
	_Error -> scan_error({illegal,float}, SPos)
    end.

%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Generate an error here if E{+|-} not followed by any digits.

scan_exponent([$+|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, [$+|Ncs], Toks, SPos, CPos);
scan_exponent([$-|Cs], Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, [$-|Ncs], Toks, SPos, CPos);
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).

scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
    case catch list_to_float(reverse(Ncs)) of
	N when is_float(N) ->
	    scan1(Cs, [{float,SPos,N}|Toks], CPos1);
	_Error -> scan_error({illegal,float}, SPos)
    end;
scan_exponent1(_, _, _, _, CPos) ->
    scan_error(float, CPos).

scan_error(In, Pos) ->
    {error,{Pos,core_scan,In}}.