%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2000-2017. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
%% Purpose: Scanner for Core Erlang.

%% For handling ISO 8859-1 (Latin-1) we use the following type
%% information:
%%
%% 000 - 037    NUL - US        control
%% 040 - 057    SPC - /         punctuation
%% 060 - 071    0 - 9           digit
%% 072 - 100    : - @           punctuation
%% 101 - 132    A - Z           uppercase
%% 133 - 140    [ - `           punctuation
%% 141 - 172    a - z           lowercase
%% 173 - 176    { - ~           punctuation
%% 177          DEL             control
%% 200 - 237                    control
%% 240 - 277    NBSP - ¿        punctuation
%% 300 - 326    À - Ö           uppercase
%% 327          ×               punctuation
%% 330 - 336    Ø - Þ           uppercase
%% 337 - 366    ß - ö           lowercase
%% 367          ÷               punctuation
%% 370 - 377    ø - ÿ           lowercase
%%
%% Many punctuation characters in these ranges have special meaning.
%% Be careful when using × \327, which looks very similar to x \170.

-module(core_scan).

-export([string/1, string/2, format_error/1]).

-import(lists, [reverse/1]).

%% A position in the input. The scanner adds one to it for every
%% newline character it passes.
-type location() :: integer().
%% The first element of a token, e.g. 'var', 'atom', 'integer', or the
%% keyword/punctuation atom itself.
-type category() :: atom().
%% The value carried by a three-element token.
-type symbol() :: atom() | float() | integer() | string().
%% Tokens either carry a value (three elements) or are just a
%% category and a location (two elements).
-type token() :: {category(), Anno :: location(), symbol()}
               | {category(), Anno :: location()}.
-type tokens() :: [token()].
%% A term that format_error/1 can turn into a message.
-type error_description() :: term().
-type error_info() :: {erl_anno:location(), module(), error_description()}.
%% string(Chars) ->
%% string(Chars, StartPos) ->
%%    {ok,[Token],EndPos} |
%%    {error,{ErrorPos,core_scan,What},EndPos}
%%
%%  Tokenise a complete string. The work is done in two passes: the
%%  characters are first pre-scanned, and the pre-scanned characters
%%  are then handed to scan/2, which builds the tokens.

-spec string(String) -> Return when
      String :: string(),
      Return :: {'ok', Tokens :: tokens(), EndLocation}
              | {'error', ErrorInfo :: error_info(), ErrorLocation},
      EndLocation :: location(),
      ErrorLocation :: location().

string(Cs) ->
    %% Without an explicit start position, counting starts at 1.
    string(Cs, 1).

-spec string(String, StartLocation) -> Return when
      String :: string(),
      Return :: {'ok', Tokens :: tokens(), EndLocation}
              | {'error', ErrorInfo :: error_info(), ErrorLocation},
      StartLocation :: location(),
      EndLocation :: location(),
      ErrorLocation :: location().

string(Cs, Sp) ->
    case string_pre_scan(Cs, [], Sp) of
        {done,_Rest,RevChars,EndPos} ->
            %% The pre-scanner collects characters in reverse order.
            case scan(reverse(RevChars), Sp) of
                {ok,Tokens} -> {ok,Tokens,EndPos};
                {error,Reason} -> {error,Reason,EndPos}
            end;
        PreScanError ->
            %% Already of the form {error,ErrorInfo,EndPos}.
            PreScanError
    end.

%% string_pre_scan(Cs, SoFar0, StartPos) ->
%%      {done,Rest,SoFar,EndPos} | {error,E,EndPos}.
%%
%%  Run the re-entrant pre-scanner over a complete string. Since there
%%  are no more characters to supply, a request for more input is
%%  answered by re-running the pre-scanner with the atom 'eof' as the
%%  tail of the pending characters.

string_pre_scan(Cs, SoFar0, Sp) ->
    case pre_scan(Cs, SoFar0, Sp) of
        {more,Pending,SoFar,Ep} ->
            %% Pending ++ eof is deliberately an improper list (or just
            %% 'eof' when Pending is empty); the pre-scan functions all
            %% have a clause for an 'eof' tail.
            string_pre_scan(Pending ++ eof, SoFar, Ep);
        DoneOrError ->
            DoneOrError
    end.

%% format_error(Error)
%%  Return a printable description of an error term produced by this
%%  module. Unknown terms are printed as they are.

-spec format_error(term()) -> iolist().

format_error(char) -> "unterminated character";
format_error(scan) -> "premature end";
format_error(float) -> "bad float";
format_error({illegal,Type}) ->
    io_lib:fwrite("illegal ~w", [Type]);
format_error({base,Base}) ->
    io_lib:fwrite("illegal base '~w'", [Base]);
format_error({string,Quote,Head}) ->
    Kind = string_thing(Quote),
    Shown = io_lib:write_string(Head, Quote),
    ["unterminated " ++ Kind ++ " starting with " ++ Shown];
format_error(Other) ->
    io_lib:write(Other).
%% string_thing(QuoteChar) -> Name
%%  The name used in error messages for what a quote character starts.

string_thing($") -> "string";   %" keep editor highlighting balanced
string_thing($') -> "atom".     %' keep editor highlighting balanced

%% Re-entrant pre-scanner.
%%
%% If the input list of characters is insufficient to build a term, the
%% pre-scanner returns a request for more characters together with the
%% characters that must be put in front of the new input. To indicate
%% end-of-file the input character list should be replaced with 'eof',
%% as an empty list already has a meaning (more input is needed).
%%
%% When more characters are needed inside a comment, string or quoted
%% atom, which can become rather long, the characters read so far are
%% not pushed back onto RestChars to be reread. Instead a special
%% re-entry token is returned as the first element of RestChars to
%% indicate that we are in the middle of a construct: '%' for a
%% comment, and {'"',StartPos} or {'\'',StartPos} for a string or a
%% quoted atom.

%% pre_scan([Char], SoFar, StartPos) ->
%%      {done,RestChars,ScannedChars,NewPos} |
%%      {more,RestChars,ScannedChars,NewPos} |
%%      {error,{ErrorPos,core_scan,Description},NewPos}.
%%
%%  Main pre-scan function. ScannedChars is built in reverse order.
%%  The per-character work is done by pre_scan/4.

pre_scan(eof, SoFar, Pos) ->
    %% End of input reached outside of any construct.
    {done,eof,SoFar,Pos};
pre_scan([], SoFar, Pos) ->
    %% Ran out of characters without seeing 'eof'.
    {more,[],SoFar,Pos};
pre_scan([C|Cs], SoFar, Pos) ->
    pre_scan(C, Cs, SoFar, Pos).
%% pre_scan(Char, [Char], SoFar, Pos)
%%  Dispatch on the first character of the input, or on a re-entry
%%  token left there by an earlier request for more characters.

pre_scan($\n, Cs, SoFar, Pos) ->
    pre_scan(Cs, [$\n|SoFar], Pos + 1);
pre_scan($%, Cs, SoFar, Pos) ->
    pre_comment(Cs, SoFar, Pos);
pre_scan('%', Cs, SoFar, Pos) ->                %Re-entering comment
    pre_comment(Cs, SoFar, Pos);
pre_scan($", Cs, SoFar, Pos) ->
    pre_string(Cs, $", '"', Pos, [$"|SoFar], Pos);
pre_scan({'"',StartPos}, Cs, SoFar, Pos) ->     %Re-entering string
    pre_string(Cs, $", '"', StartPos, SoFar, Pos);
pre_scan($', Cs, SoFar, Pos) ->
    pre_string(Cs, $', '\'', Pos, [$'|SoFar], Pos);
pre_scan({'\'',StartPos}, Cs, SoFar, Pos) ->    %Re-entering quoted atom
    pre_string(Cs, $', '\'', StartPos, SoFar, Pos);
pre_scan($$, Cs0, SoFar0, Pos) ->
    case pre_char(Cs0, [$$|SoFar0]) of
        more ->
            %% Push the $ back so the character constant is rescanned
            %% from its start when more input arrives.
            {more,[$$|Cs0],SoFar0,Pos};
        error ->
            pre_error(char, Pos, Pos);
        {Cs,SoFar} ->
            pre_scan(Cs, SoFar, Pos)
    end;
pre_scan(C, Cs, SoFar, Pos) ->
    pre_scan(Cs, [C|SoFar], Pos).

%% pre_string([Char], Quote, Reent, StartPos, SoFar, Pos)
%%  Collect characters up to and including the closing Quote. Reent is
%%  the atom used in the re-entry token and StartPos the position at
%%  which the string or quoted atom started.

pre_string([], _Q, Reent, Sp, SoFar, Pos) ->
    {more,[{Reent,Sp}],SoFar,Pos};
pre_string(eof, Q, _Reent, Sp, SoFar, Pos) ->
    pre_string_error(Q, Sp, SoFar, Pos);
pre_string([Q|Cs], Q, _Reent, _Sp, SoFar, Pos) ->
    %% Closing quote found; go back to ordinary pre-scanning.
    pre_scan(Cs, [Q|SoFar], Pos);
pre_string([$\n|Cs], Q, Reent, Sp, SoFar, Pos) ->
    pre_string(Cs, Q, Reent, Sp, [$\n|SoFar], Pos + 1);
pre_string([$\\|Cs0], Q, Reent, Sp, SoFar0, Pos) ->
    case pre_escape(Cs0, SoFar0) of
        more ->
            %% Keep the backslash in the pending input so that the
            %% escape sequence is rescanned as a whole.
            {more,[{Reent,Sp},$\\|Cs0],SoFar0,Pos};
        error ->
            pre_string_error(Q, Sp, SoFar0, Pos);
        {Cs,SoFar} ->
            pre_string(Cs, Q, Reent, Sp, SoFar, Pos)
    end;
pre_string([C|Cs], Q, Reent, Sp, SoFar, Pos) ->
    pre_string(Cs, Q, Reent, Sp, [C|SoFar], Pos).

%% pre_string_error(Quote, StartPos, SoFar, Pos)
%%  Build the error for an unterminated string or quoted atom. The
%%  error description contains at most 16 characters of its beginning.

pre_string_error(Q, Sp, SoFar, Pos) ->
    %% SoFar is in reverse order, so the part before the first Q in it
    %% is the text collected after the most recently stored quote.
    [RevBody,_] = string:split(SoFar, [Q]),
    Head = string:slice(string:reverse(RevBody), 0, 16),
    pre_error({string,Q,Head}, Sp, Pos).
%% pre_char([Char], SoFar) -> {RestChars,SoFar1} | more | error
%%  Pre-scan the character following a $. An escaped character is
%%  handled by pre_escape/2.

pre_char([$\\|Cs], SoFar) -> pre_escape(Cs, SoFar);
pre_char([C|Cs], SoFar) -> {Cs,[C|SoFar]};
pre_char([], _SoFar) -> more;
pre_char(eof, _SoFar) -> error.

%% pre_escape([Char], SoFar) -> {RestChars,SoFar1} | more | error
%%  Pre-scan what follows a backslash. The backslash itself is pushed
%%  onto SoFar together with the escaped character; for \^X both the ^
%%  and the character after it are pushed.

pre_escape([$^,C|Cs], SoFar) -> {Cs,[C,$^,$\\|SoFar]};
pre_escape([$^], _SoFar) -> more;
pre_escape([$^|eof], _SoFar) -> error;
pre_escape([C|Cs], SoFar) -> {Cs,[C,$\\|SoFar]};
pre_escape([], _SoFar) -> more;
pre_escape(eof, _SoFar) -> error.

%% pre_comment([Char], SoFar, Pos)
%%  Skip the rest of a comment. Comments are replaced by one SPACE.

pre_comment([], SoFar, Pos) ->
    %% Leave a re-entry token so that scanning resumes in the comment.
    {more,['%'],SoFar,Pos};
pre_comment(eof, SoFar, Pos) ->
    pre_scan(eof, [$\s|SoFar], Pos);
pre_comment([$\n|Cs], SoFar, Pos) ->            %Terminate comment
    pre_scan(Cs, [$\n,$\s|SoFar], Pos + 1);
pre_comment([_|Cs], SoFar, Pos) ->
    pre_comment(Cs, SoFar, Pos).

%% pre_error(Description, ErrorPos, Pos)
%%  Build the error return value of the pre-scanner.

pre_error(E, Epos, Pos) ->
    ErrorInfo = {Epos,core_scan,E},
    {error,ErrorInfo,Pos}.

%% scan(CharList, StartPos)
%%  This takes a list of characters and tries to tokenise them.
%%
%%  The tokens are pushed onto a stack as they are found, which avoids
%%  appending, and the stack is reversed once all the tokens have been
%%  collected. Most tokens are built in the same way.
%%
%%  Returns:
%%      {ok,[Tok]}
%%      {error,{ErrorPos,core_scan,What}}

scan(Cs, Pos) ->
    EmptyTokenStack = [],
    scan1(Cs, EmptyTokenStack, Pos).

%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.
%% The order of the scan1/3 clauses is significant: the newline clause
%% must come before the clause skipping control characters, and the
%% multi-character operators before the single punctuation characters.

scan1([$\n|Cs], Toks, Pos) ->                           %Skip newline
    scan1(Cs, Toks, Pos + 1);
scan1([C|Cs], Toks, Pos)                                %Skip control chars
  when C >= $\000, C =< $\s; C >= $\200, C =< $\240 ->
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos)                                %Keywords
  when C >= $a, C =< $z; C >= $ß, C =< $ÿ, C /= $÷ ->
    scan_key_word(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos)                                %Variables
  when C >= $A, C =< $Z; C >= $À, C =< $Þ, C /= $× ->
    scan_variable(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       %Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    %Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    %Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                            %_ variables
    scan_variable($_, Cs, Toks, Pos);
scan1([$$|Cs0], Toks, Pos) ->                           %Character constant
    {C,Cs,Pos1} = scan_char(Cs0, Pos),
    scan1(Cs, [{char,Pos,C}|Toks], Pos1);
scan1([$'|Cs0], Toks, Pos) ->                           %Atom (always quoted)
    {S,Cs1,Pos1} = scan_string(Cs0, $', Pos),
    %% The characters between the quotes are treated as the bytes of a
    %% UTF-8 encoded atom name; anything that cannot be converted is
    %% reported as an illegal atom.
    try binary_to_atom(list_to_binary(S), utf8) of
        A when is_atom(A) ->
            scan1(Cs1, [{atom,Pos,A}|Toks], Pos1)
    catch
        error:_ ->
            scan_error({illegal,atom}, Pos)
    end;
scan1([$"|Cs0], Toks, Pos) ->                           %String
    {S,Cs1,Pos1} = scan_string(Cs0, $", Pos),
    scan1(Cs1, [{string,Pos,S}|Toks], Pos1);
%% Punctuation characters and operators, first recognise multiples.
scan1("->" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1("-|" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'-|',Pos}|Toks], Pos);
scan1(":=" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{':=',Pos}|Toks], Pos);
scan1("=>" ++ Cs, Toks, Pos) ->
    scan1(Cs, [{'=>',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                             %Punctuation character
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _) ->
    %% The token stack holds the tokens in reverse order.
    {ok,reverse(Toks0)}.

%% scan_key_word(FirstChar, CharList, Tokens, Pos)
%% scan_variable(FirstChar, CharList, Tokens, Pos)
%%  Scan the rest of a name and push a {Name,Pos} token (key word) or
%%  a {var,Pos,Name} token (variable). A name that cannot be converted
%%  to an atom gives an {illegal,atom} or {illegal,var} error.
%%
%%  The conversion is wrapped in 'try ... of' in the same way as for
%%  quoted atoms in scan1/3, so that only a failing list_to_atom/1 is
%%  turned into a scan error. The continued scanning is in the 'of'
%%  part and is therefore not covered by the 'catch'.

scan_key_word(C, Cs0, Toks, Pos) ->
    {Wcs,Cs} = scan_name(Cs0, []),
    try list_to_atom([C|reverse(Wcs)]) of
        Name ->
            scan1(Cs, [{Name,Pos}|Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,atom}, Pos)
    end.

scan_variable(C, Cs0, Toks, Pos) ->
    {Wcs,Cs} = scan_name(Cs0, []),
    try list_to_atom([C|reverse(Wcs)]) of
        Name ->
            scan1(Cs, [{var,Pos,Name}|Toks], Pos)
    catch
        error:_ ->
            scan_error({illegal,var}, Pos)
    end.

%% scan_name(CharList, NameChars) -> {NameChars1,RestChars}
%%  Collect the leading name characters of CharList. The collected
%%  characters are returned in reverse order, pushed onto NameChars.

scan_name([C|Cs] = All, Ncs) ->
    case name_char(C) of
        true -> scan_name(Cs, [C|Ncs]);
        false -> {Ncs,All}              %Reuse the matched list as is
    end;
scan_name([], Ncs) ->
    {Ncs,[]}.

%% name_char(Char) -> boolean()
%%  Test whether Char may occur in a name after its first character:
%%  Latin-1 letters, digits, _ and @.

name_char(C) when C >= $a, C =< $z -> true;
name_char(C) when C >= $ß, C =< $ÿ, C /= $÷ -> true;
name_char(C) when C >= $A, C =< $Z -> true;
name_char(C) when C >= $À, C =< $Þ, C /= $× -> true;
name_char(C) when C >= $0, C =< $9 -> true;
name_char($_) -> true;
name_char($@) -> true;
name_char(_) -> false.

%% scan_string(CharList, QuoteChar, Pos) -> {StringChars,RestChars,NewPos}.
%%  Scan the characters up to the closing QuoteChar. CharList starts
%%  directly after the opening quote.

scan_string(Cs, Q, Pos) ->
    scan_string(Cs, [], Q, Pos).
%% scan_string(CharList, StringCharStack, QuoteChar, Pos) ->
%%      {StringChars,RestChars,NewPos}.
%%  Collect characters until the closing quote character is found,
%%  translating escape sequences on the way. The position is increased
%%  by one for every newline passed.

scan_string([Q|Rest], Acc, Q, Pos) ->
    {reverse(Acc),Rest,Pos};
scan_string([$\n|Rest], Acc, Q, Pos) ->
    scan_string(Rest, [$\n|Acc], Q, Pos + 1);
scan_string([$\\|Rest0], Acc, Q, Pos0) ->
    {Char,Rest,Pos} = scan_escape(Rest0, Pos0),
    scan_string(Rest, [Char|Acc], Q, Pos);
scan_string([Char|Rest], Acc, Q, Pos) ->
    scan_string(Rest, [Char|Acc], Q, Pos).

%% scan_char(Chars, Pos) -> {Char,RestChars,NewPos}.
%%  Read a single character from a character constant. The pre-scan
%%  phase has checked for errors here.

scan_char([$\\|Rest], Pos) ->
    scan_escape(Rest, Pos);
scan_char([Char|Rest], Pos) ->
    NewPos = case Char of
                 $\n -> Pos + 1;        %Newline
                 _ -> Pos
             end,
    {Char,Rest,NewPos}.

%% scan_escape(Chars, Pos) -> {Char,RestChars,NewPos}.
%%  Translate the escape sequence whose backslash has just been
%%  removed from the input:
%%      one to three octal digits   the character with that code
%%      ^X                          X band 31, i.e. CTL-X
%%      a newline                   the newline itself
%%      any other character         translated by escape_char/1

scan_escape([D|Rest], Pos) when D >= $0, D =< $7 ->
    scan_octal(Rest, D - $0, 2, Pos);
scan_escape([$^,Char|Rest], Pos) ->             %\^X -> CTL-X
    {Char band 31,Rest,Pos};
scan_escape([$\n|Rest], Pos) ->
    {$\n,Rest,Pos + 1};
scan_escape([Char|Rest], Pos) ->
    {escape_char(Char),Rest,Pos}.

%% scan_octal(Chars, Value, MaxDigits, Pos) -> {Char,RestChars,Pos}.
%%  Add at most MaxDigits further octal digits to Value.

scan_octal([D|Rest], Value, Left, Pos) when Left > 0, D >= $0, D =< $7 ->
    scan_octal(Rest, Value * 8 + (D - $0), Left - 1, Pos);
scan_octal(Rest, Value, _Left, Pos) ->
    {Value,Rest,Pos}.

%% escape_char(Char) -> Char1
%%  The character denoted by \Char. Characters without a special
%%  meaning denote themselves.

escape_char(Char) ->
    case Char of
        $n -> $\n;                      %\n = LF
        $r -> $\r;                      %\r = CR
        $t -> $\t;                      %\t = TAB
        $v -> $\v;                      %\v = VT
        $b -> $\b;                      %\b = BS
        $f -> $\f;                      %\f = FF
        $e -> $\e;                      %\e = ESC
        $s -> $\s;                      %\s = SPC
        $d -> $\d;                      %\d = DEL
        _ -> Char
    end.
%% scan_number(Char, CharList, TokenStack, Pos)
%% scan_signed_number(Sign, Char, CharList, TokenStack, Pos)
%%  We can handle simple radix notation:
%%    <digit>#<digits>          - the digits read in that base
%%    <digits>                  - the digits in base 10
%%    <digits>.<digits>
%%    <digits>.<digits>E+-<digits>
%%
%%  Except for explicitly based integers, all the characters of the
%%  number are collected on a stack which is then reversed and
%%  converted with list_to_integer/1 or list_to_float/1.

%% SPos == Start position
%% CPos == Current position

scan_number(First, Chars, Toks, Pos) ->
    {Digits,Rest,Pos1} = scan_integer(Chars, [First], Pos),
    scan_after_int(Rest, Digits, Toks, Pos, Pos1).

scan_signed_number(Sign, First, Chars, Toks, Pos) ->
    %% The sign goes at the bottom of the character stack, so that it
    %% ends up first when the stack is reversed.
    {Digits,Rest,Pos1} = scan_integer(Chars, [First,Sign], Pos),
    scan_after_int(Rest, Digits, Toks, Pos, Pos1).

%% scan_integer(CharList, CharStack, Pos) -> {CharStack1,RestChars,Pos}.
%%  Push the leading decimal digits of CharList onto CharStack.

scan_integer([D|Rest], Stack, Pos) when D >= $0, D =< $9 ->
    scan_integer(Rest, [D|Stack], Pos);
scan_integer(Rest, Stack, Pos) ->
    {Stack,Rest,Pos}.

%% scan_after_int(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Decide what the digits collected so far are: the integer part of a
%%  float, the base of a based integer, or a complete decimal integer.

scan_after_int([$.,D|Rest0], Stack0, Toks, SPos, CPos) when D >= $0, D =< $9 ->
    {Stack,Rest,CPos1} = scan_integer(Rest0, [D,$.|Stack0], CPos),
    scan_after_fraction(Rest, Stack, Toks, SPos, CPos1);
scan_after_int([$#|Rest], Stack, Toks, SPos, CPos) ->
    Base = list_to_integer(reverse(Stack)),
    if
        Base >= 2, Base =< 16 ->
            scan_based_int(Rest, 0, Base, Toks, SPos, CPos);
        true ->
            scan_error({base,Base}, CPos)
    end;
scan_after_int(Rest, Stack, Toks, SPos, CPos) ->
    Token = {integer,SPos,list_to_integer(reverse(Stack))},
    scan1(Rest, [Token|Toks], CPos).
%% scan_based_int(CharList, SoFar, Base, TokenStack, StartPos, CurPos)
%%  Scan the digits that follow "Base#" and push an integer token.
%%  SoFar is the value accumulated before the first digit of CharList.
%%
%%  At least one digit that is valid in Base has to be present. If
%%  there is none, an {illegal,integer} error is returned rather than
%%  an integer token with the value SoFar being produced from no
%%  digits at all.

scan_based_int(Cs0, SoFar, Base, Toks, SPos, CPos) ->
    case based_digit(Cs0, Base) of
        {Digit,Cs} ->
            scan_based_digits(Cs, SoFar * Base + Digit, Base, Toks, SPos, CPos);
        none ->
            scan_error({illegal,integer}, CPos)
    end.

%% scan_based_digits(CharList, SoFar, Base, TokenStack, StartPos, CurPos)
%%  Accumulate the remaining digits of a based integer, then push the
%%  token and continue scanning.

scan_based_digits(Cs0, SoFar, Base, Toks, SPos, CPos) ->
    case based_digit(Cs0, Base) of
        {Digit,Cs} ->
            scan_based_digits(Cs, SoFar * Base + Digit, Base, Toks, SPos, CPos);
        none ->
            scan1(Cs0, [{integer,SPos,SoFar}|Toks], CPos)
    end.

%% based_digit(CharList, Base) -> {DigitValue,RestChars} | none
%%  Take one digit valid in Base (0-9, a-f or A-F) from CharList.

based_digit([C|Cs], Base) when C >= $0, C =< $9, C < Base + $0 ->
    {C - $0,Cs};
based_digit([C|Cs], Base) when C >= $a, C =< $f, C < Base + $a - 10 ->
    {C - $a + 10,Cs};
based_digit([C|Cs], Base) when C >= $A, C =< $F, C < Base + $A - 10 ->
    {C - $A + 10,Cs};
based_digit(_, _) ->
    none.

%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  The fraction digits have been collected. Continue with an exponent
%%  if there is one, otherwise build the float.

scan_after_fraction([E|Cs], Ncs, Toks, SPos, CPos) when E =:= $E; E =:= $e ->
    %% The exponent marker is always stored as an upper case E.
    scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
    scan_float(Cs, Ncs, Toks, SPos, CPos).

%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Generate an error here if E{+|-} not followed by any digits.

scan_exponent([Sign|Cs], Ncs, Toks, SPos, CPos) when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Cs, [Sign|Ncs], Toks, SPos, CPos);
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
    scan_exponent1(Cs, Ncs, Toks, SPos, CPos).

scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
    {Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
    scan_float(Cs, Ncs, Toks, SPos, CPos1);
scan_exponent1(_, _, _, _, CPos) ->
    scan_error(float, CPos).

%% scan_float(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Convert the collected characters to a float and push the token. A
%%  failing conversion gives an {illegal,float} error at StartPos.
%%  The continued scanning is in the 'of' part of the 'try' and is
%%  therefore not covered by the 'catch'.

scan_float(Cs, Ncs, Toks, SPos, CPos) ->
    try list_to_float(reverse(Ncs)) of
        N ->
            scan1(Cs, [{float,SPos,N}|Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end.
%% scan_error(Description, Pos) -> {error,{Pos,core_scan,Description}}
%%  Build the error return value of the scanner.

scan_error(In, Pos) ->
    ErrorInfo = {Pos,core_scan,In},
    {error,ErrorInfo}.