%% ``Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% The Initial Developer of the Original Code is Ericsson Utvecklings
%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
%% Utvecklings AB. All Rights Reserved.''
%%
%% @private
%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
%% @author Richard Carlsson <carlsson.richard@gmail.com>
%% @see edoc
%% @end

%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
%% {@link //stdlib/erl_scan}.

-module(edoc_scanner).

%% NOTE: the interface to this module is ancient and should be updated.
%% Please do not regard these exported functions as stable. Their
%% behaviour is described in the documentation of the module `erl_scan'.
%%
%% Since there are no `full stop' tokens in EDoc specifications, the
%% `tokens' function *always* returns `{more, Continuation}' unless an
%% error occurs.

-export([string/1,string/2,format_error/1]).

-import(lists, [reverse/1]).

%% string(Chars) -> {ok,Tokens,1} | {error,ErrorInfo,1}
%%  Tokenise Chars, counting lines from 1.

string(Cs) ->
    string(Cs, 1).

%% string(Chars, StartPos) -> {ok,Tokens,StartPos} | {error,ErrorInfo,StartPos}
%%  Tokenise Chars, counting lines from StartPos. The last element of
%%  the result is the start position that was passed in, not the
%%  position at which scanning stopped.

string(Cs, StartPos) ->
    tag_with_pos(scan(Cs, StartPos), StartPos).

%% Extend a two-element result of scan/2 with the given position.
tag_with_pos({ok,Toks}, Pos) -> {ok,Toks,Pos};
tag_with_pos({error,E}, Pos) -> {error,E,Pos}.

%% format_error(Error)
%%  Return a string describing the error.
format_error({string,Quote,Head}) ->
    %% Head is rendered as a literal delimited by the offending quote char.
    Quoted = io_lib:write_string(Head, Quote),
    ["unterminated string starting with " ++ Quoted];
format_error({illegal,Type}) ->
    io_lib:fwrite("illegal ~w", [Type]);
format_error({base,Base}) ->
    io_lib:fwrite("illegal base '~w'", [Base]);
format_error(char) ->
    "unterminated character";
format_error(scan) ->
    "premature end";
format_error(float) ->
    "bad float";
format_error(Other) ->
    %% Anything unrecognised is printed as a plain Erlang term.
    io_lib:write(Other).

%% Reserved words are emitted as {Word,Pos} tokens instead of atom tokens.
%% `where' is the only one.
reserved(Word) ->
    Word =:= 'where'.

%% scan(CharList, StartPos)
%%  Tokenise a list of characters, starting with an empty token stack.
%%
%%  Tokens are pushed onto the stack as they are recognised, so the stack
%%  holds them in reverse order; it is reversed once when the input is
%%  exhausted, which avoids appending to the end of a list.
%%
%%  Returns:
%%      {ok,[Tok]}
%%      {error,{ErrorPos,edoc_scanner,What}}

scan(Chars, StartPos) ->
    EmptyStack = [],
    scan1(Chars, EmptyStack, StartPos).

%% scan1(Characters, TokenStack, Position)
%%  Scan a list of characters into tokens.
%% Main tokeniser loop. Dispatches on the first character(s) of the input,
%% pushes the recognised token onto Toks and continues with the rest.
%% Returns {ok,Tokens} at end of input, or {error,{Pos,edoc_scanner,What}}.
%%
%% Every failure reason produced while reading a character constant, a
%% quoted atom or a string (including bad escape sequences) is turned into
%% an error tuple here, and an error found further along the input after a
%% character constant is passed through unchanged.

scan1([$\n|Cs], Toks, Pos) ->                           % Newline
    scan1(Cs, Toks, Pos+1);
scan1([C|Cs], Toks, Pos) when C >= 0, C =< $\s ->       % Skip blanks
    scan1(Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z ->       % Unquoted atom
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\337, C =< $\377, C /= $\367 ->
    scan_atom(C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 ->       % Numbers
    scan_number(C, Cs, Toks, Pos);
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    % Signed numbers
    scan_signed_number($-, C, Cs, Toks, Pos);
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 ->    % Signed numbers
    scan_signed_number($+, C, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z ->       % Variables
    scan_variable(C, Cs, Toks, Pos);
scan1([$_|Cs], Toks, Pos) ->                            % Variables
    scan_variable($_, Cs, Toks, Pos);
scan1([C|Cs], Toks, Pos) when C >= $\300, C =< $\336, C /= $\327 ->
    scan_variable(C, Cs, Toks, Pos);
scan1([$$|Cs], Toks, Pos) ->                            % Character constant
    %% scan_char_const/3 continues scanning the rest of the input itself,
    %% so its result is either the final result of the whole scan or a
    %% bare failure reason for the character constant.
    case scan_char_const(Cs, Toks, Pos) of
        {ok,_Tokens} = Ok ->
            Ok;
        {error,{_ErrPos,_Mod,_What}} = ScanError ->
            %% Complete error from later in the input: pass it on as is.
            ScanError;
        {error,truncated_char} ->
            scan_error(char, Pos);
        {error,_BadChar} ->
            %% illegal_character, undefined_escape_sequence,
            %% illegal_control_character
            scan_error({illegal,char}, Pos)
    end;
scan1([$'|Cs0], Toks, Pos) ->                           % Quoted atom
    case scan_string(Cs0, $', Pos) of
        {S,Cs1,Pos1} ->
            case chars_to_atom(S) of
                {ok,A} ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos1);
                error ->
                    scan_error({illegal,atom}, Pos)
            end;
        {error,Reason} ->
            scan_error(quoted_error(Reason, $', Cs0, atom), Pos)
    end;
scan1([$"|Cs0], Toks, Pos) ->                           % String
    case scan_string(Cs0, $", Pos) of
        {S,Cs1,Pos1} ->
            case Toks of
                [{string,Pos0,S0}|Toks1] ->
                    %% Adjacent string literals are merged into one token.
                    scan1(Cs1, [{string,Pos0,S0 ++ S}|Toks1], Pos1);
                _ ->
                    scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
            end;
        {error,Reason} ->
            scan_error(quoted_error(Reason, $", Cs0, string), Pos)
    end;
%% Punctuation characters and operators, first recognise multiples.
scan1([$=,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'=>',Pos}|Toks], Pos);
scan1([$<,$<|Cs], Toks, Pos) ->
    scan1(Cs, [{'<<',Pos}|Toks], Pos);
scan1([$>,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'>>',Pos}|Toks], Pos);
scan1([$-,$>|Cs], Toks, Pos) ->
    scan1(Cs, [{'->',Pos}|Toks], Pos);
scan1([$:,$=|Cs], Toks, Pos) ->
    scan1(Cs, [{':=',Pos}|Toks], Pos);
scan1([$:,$:|Cs], Toks, Pos) ->
    scan1(Cs, [{'::',Pos}|Toks], Pos);
scan1([$/,$/|Cs], Toks, Pos) ->
    scan1(Cs, [{'//',Pos}|Toks], Pos);
scan1([$.,$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'...',Pos}|Toks], Pos);
scan1([$.,$.|Cs], Toks, Pos) ->
    scan1(Cs, [{'..',Pos}|Toks], Pos);
scan1([C|Cs], Toks, Pos) ->                             % Punctuation character
    P = list_to_atom([C]),
    scan1(Cs, [{P,Pos}|Toks], Pos);
scan1([], Toks0, _Pos) ->
    Toks = lists:reverse(Toks0),
    {ok,Toks}.

%% Translate a failure reason from scan_string/3 into the error descriptor
%% reported to the caller. Quote and Head identify the unterminated text;
%% Type (atom | string) names the kind of token being read.
quoted_error(premature_end, Quote, Head, _Type) -> {string,Quote,Head};
quoted_error(truncated_char, _Quote, _Head, _Type) -> char;
quoted_error(_Other, _Quote, _Head, Type) -> {illegal,Type}.

%% Convert a character list to an atom, returning `error' instead of
%% raising if the list cannot be turned into an atom.
chars_to_atom(Chars) ->
    try list_to_atom(Chars) of
        Atom -> {ok,Atom}
    catch
        error:_ -> error
    end.

%% Scan a name starting with the (already consumed) character C and push
%% a variable token. A lone `_' is not accepted as a variable token: it
%% yields {an_var,Pos,'_'} instead of {var,Pos,Name}.
scan_variable(C, Cs, Toks, Pos) ->
    {Wcs,Cs1} = scan_name(Cs, []),
    case [C|lists:reverse(Wcs)] of
        "_" ->
            scan1(Cs1, [{an_var,Pos,'_'}|Toks], Pos);
        W ->
            case chars_to_atom(W) of
                {ok,A} ->
                    scan1(Cs1, [{var,Pos,A}|Toks], Pos);
                error ->
                    scan_error({illegal,variable}, Pos)
            end
    end.

%% Scan a name starting with the (already consumed) character C and push
%% either a reserved-word token {Word,Pos} or an {atom,Pos,Name} token.
scan_atom(C, Cs, Toks, Pos) ->
    {Wcs,Cs1} = scan_name(Cs, []),
    case chars_to_atom([C|lists:reverse(Wcs)]) of
        {ok,A} ->
            case reserved(A) of
                true ->
                    scan1(Cs1, [{A,Pos}|Toks], Pos);
                false ->
                    scan1(Cs1, [{atom,Pos,A}|Toks], Pos)
            end;
        error ->
            scan_error({illegal,token}, Pos)
    end.

%% scan_name(Cs) -> lists:splitwith(fun (C) -> name_char(C) end, Cs).
%% scan_name(CharList, Stack) -> {Stack1,RestChars}
%%  Move the leading run of name characters from CharList onto Stack,
%%  so Stack1 holds them in reverse order on top of what was already there.

scan_name(Cs, Ncs) ->
    {Name,Rest} = lists:splitwith(fun name_char/1, Cs),
    {lists:reverse(Name, Ncs),Rest}.

%% True for characters that may appear inside an atom or variable name:
%% ASCII letters and digits, `_', `@', and the character ranges
%% 8#300-8#377 except 8#327 and 8#367.
name_char($_) -> true;
name_char($@) -> true;
name_char(C) when C >= $0, C =< $9;
                  C >= $A, C =< $Z;
                  C >= $a, C =< $z;
                  C >= $\300, C =< $\336, C /= $\327;
                  C >= $\337, C =< $\377, C /= $\367 -> true;
name_char(_) -> false.

%% scan_string(CharList, QuoteChar, Pos) ->
%%      {StringChars,RestChars,NewPos} | {error,Reason}
%%  Read characters up to the closing QuoteChar. The quote itself is
%%  consumed but not included in StringChars.

scan_string(Cs, Quote, Pos) ->
    scan_string(Cs, [], Quote, Pos).

scan_string([], _Acc, _Quote, _Pos) ->
    %% Ran out of input before the closing quote.
    {error, premature_end};
scan_string([Quote|Rest], Acc, Quote, Pos) ->
    {lists:reverse(Acc),Rest,Pos};
scan_string(Chars, Acc, Quote, Pos0) ->
    case scan_char(Chars, Pos0) of
        {Ch,Rest,Pos1} ->
            %% Only build the string here
            scan_string(Rest, [Ch|Acc], Quote, Pos1);
        Failure ->
            Failure
    end.

%% Read the character following a `$' and carry on scanning with a
%% {char,Pos,C} token pushed. Note that space characters are not allowed.
scan_char_const([$\s|_Rest], _Toks, _Pos) ->
    {error, illegal_character};
scan_char_const(Chars, Toks, Pos) ->
    case scan_char(Chars, Pos) of
        {Ch,Rest,Pos1} ->
            scan1(Rest, [{char,Pos,Ch}|Toks], Pos1);
        Failure ->
            Failure
    end.

%% {Character,RestChars,NewPos} = scan_char(Chars, Pos)
%%  Read a single character from a string or character constant,
%%  decoding a backslash escape if there is one. Returns {error,Reason}
%%  on failure. Note that control characters are not allowed.

scan_char([], _Pos) ->
    {error, truncated_char};
scan_char([$\\|Rest], Pos) ->
    scan_escape(Rest, Pos);
scan_char([C|_Rest], _Pos) when C =< 16#1f ->
    {error, illegal_character};
scan_char([C|Rest], Pos) ->
    {C,Rest,Pos}.

%% The following conforms to Standard Erlang escape sequences.
%% Guard test: C is an ASCII hexadecimal digit. Fully parenthesised so the
%% expansion cannot regroup with neighbouring operators.
-define(HEX(C),
        (C >= $0 andalso C =< $9 orelse
         C >= $A andalso C =< $F orelse
         C >= $a andalso C =< $f)).

%% Guard test: C is in one of the code point ranges accepted for \x{...}.
-define(UNICODE(C),
        (C >= 0 andalso C < 16#D800 orelse
         C > 16#DFFF andalso C < 16#FFFE orelse
         C > 16#FFFF andalso C =< 16#10FFFF)).

%% scan_escape(CharsAfterBackslash, Pos) ->
%%      {Character,RestChars,Pos} | {error,Reason}
%%  Decode one escape sequence. Reason is one of truncated_char,
%%  illegal_control_character or undefined_escape_sequence.

scan_escape([O1, O2, O3 | Cs], Pos) when        % \<1-3> octal digits
      O1 >= $0, O1 =< $3, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
    %% 73*$0 = (64 + 8 + 1)*$0 removes the ASCII offset of all three digits.
    Val = (O1*8 + O2)*8 + O3 - 73*$0,
    {Val,Cs,Pos};
scan_escape([O1, O2 | Cs], Pos) when
      O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7 ->
    Val = (O1*8 + O2) - 9*$0,
    {Val,Cs,Pos};
scan_escape([O1 | Cs], Pos) when
      O1 >= $0, O1 =< $7 ->
    {O1 - $0,Cs,Pos};
scan_escape([$x, ${ | Cs], Pos) ->              % \x{H...}
    scan_hex(Cs, Pos, []);
scan_escape([$x, H1, H2 | Cs], Pos) when ?HEX(H1), ?HEX(H2) ->  % \xHH
    %% Convert via list_to_integer/2 so that the letter digits A-F/a-f
    %% are valued correctly; subtracting a fixed $0 offset is only right
    %% for 0-9.
    {list_to_integer([H1, H2], 16),Cs,Pos};
scan_escape([$^, C | Cs], Pos) when C >= $\100, C =< $\137 ->   % \^X -> CTL-X
    {C - $\100,Cs,Pos};
scan_escape([$^, _C | _Cs], _Pos) ->
    {error, illegal_control_character};
scan_escape([C | Cs], Pos) ->
    %% escape_char/1 yields $\000 for characters that are not escapes.
    case escape_char(C) of
        C1 when C1 > $\000 -> {C1,Cs,Pos};
        _ -> {error, undefined_escape_sequence}
    end;
scan_escape([], _Pos) ->
    {error, truncated_char}.

%% scan_hex(Chars, Pos, HexDigitStack)
%%  Collect the hex digits of a \x{...} escape up to the closing brace and
%%  convert them. Fails with undefined_escape_sequence if there are no
%%  digits, the brace is missing, a non-hex character appears, or the
%%  value is outside the ranges accepted by ?UNICODE.

scan_hex([C | Cs], Pos, HCs) when ?HEX(C) ->
    scan_hex(Cs, Pos, [C | HCs]);
scan_hex([$} | Cs], Pos, [_ | _] = HCs) ->
    try list_to_integer(lists:reverse(HCs), 16) of
        Val when ?UNICODE(Val) ->
            {Val,Cs,Pos};
        _ ->
            {error, undefined_escape_sequence}
    catch
        error:_ ->
            {error, undefined_escape_sequence}
    end;
scan_hex(_Cs, _Pos, _HCs) ->
    {error, undefined_escape_sequence}.

%% Note that we return $\000 for undefined escapes.
%% Map the character following a backslash to the character it denotes;
%% anything that is not a known escape maps to $\000.
escape_char($b) -> $\b;                         % \b = BS
escape_char($d) -> $\d;                         % \d = DEL
escape_char($e) -> $\e;                         % \e = ESC
escape_char($f) -> $\f;                         % \f = FF
escape_char($n) -> $\n;                         % \n = LF
escape_char($r) -> $\r;                         % \r = CR
escape_char($s) -> $\s;                         % \s = SPC
escape_char($t) -> $\t;                         % \t = HT
escape_char($v) -> $\v;                         % \v = VT
escape_char($\\) -> $\\;                        % \\ = \
escape_char($') -> $\';                         % \' = '
escape_char($") -> $\";                         % \" = "
escape_char(_Other) -> $\000.

%% scan_number(Char, CharList, TokenStack, Pos)
%%  Numbers are read in these forms:
%%      [+-]<digits>
%%      [+-]<digits>.<digits>
%%      [+-]<digits>.<digits>E+-<digits>
%%
%%  All characters of the number, sign included, are collected on a stack
%%  and converted with list_to_integer/1 or list_to_float/1.
%%
%%  NOTE(review): radix notation (<digits>#<digits>) is not recognised by
%%  the clauses below; a `#' after the digits simply ends the integer.

%% SPos == Start position
%% CPos == Current position

scan_number(Digit, Rest0, Toks, Pos) ->
    {Stack,Rest,Pos1} = scan_integer(Rest0, [Digit], Pos),
    scan_after_int(Rest, Stack, Toks, Pos, Pos1).

%% As scan_number/4, with the sign character S placed in front of the digits.
scan_signed_number(S, Digit, Rest0, Toks, Pos) ->
    {Stack,Rest,Pos1} = scan_integer(Rest0, [Digit, S], Pos),
    scan_after_int(Rest, Stack, Toks, Pos, Pos1).

%% Push the leading run of decimal digits onto Stack; returns
%% {Stack1,RestChars,Pos} with Pos unchanged.
scan_integer(Cs, Stack, Pos) ->
    case Cs of
        [D|Rest] when D >= $0, D =< $9 ->
            scan_integer(Rest, [D|Stack], Pos);
        _ ->
            {Stack,Cs,Pos}
    end.

%% A `.' followed by a digit turns the number into a float; anything else
%% ends the integer and the token is pushed.
scan_after_int([$.,D|Rest0], IntStack, Toks, SPos, CPos) when D >= $0, D =< $9 ->
    {FracStack,Rest,CPos1} = scan_integer(Rest0, [D,$.|IntStack], CPos),
    scan_after_fraction(Rest, FracStack, Toks, SPos, CPos1);
scan_after_int(Rest, IntStack, Toks, SPos, CPos) ->
    Token = {integer,SPos,list_to_integer(lists:reverse(IntStack))},
    scan1(Rest, [Token|Toks], CPos).
%% scan_after_fraction(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  The fraction digits have been read. An `E' or `e' introduces an
%%  exponent; anything else ends the float.

scan_after_fraction([E|Rest], Stack, Toks, SPos, CPos) when E =:= $E; E =:= $e ->
    scan_exponent(Rest, [E|Stack], Toks, SPos, CPos);
scan_after_fraction(Rest, Stack, Toks, SPos, CPos) ->
    push_float(Stack, Rest, Toks, SPos, CPos).

%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
%%  Accept an optional sign after the exponent marker. An error is
%%  generated if no digit follows.

scan_exponent([Sign|Rest], Stack, Toks, SPos, CPos) when Sign =:= $+; Sign =:= $- ->
    scan_exponent1(Rest, [Sign|Stack], Toks, SPos, CPos);
scan_exponent(Rest, Stack, Toks, SPos, CPos) ->
    scan_exponent1(Rest, Stack, Toks, SPos, CPos).

%% Read the exponent digits and push the float token; at least one digit
%% is required, otherwise a `float' error is reported at the current
%% position.
scan_exponent1([D|Rest0], Stack0, Toks, SPos, CPos) when D >= $0, D =< $9 ->
    {Stack,Rest,CPos1} = scan_integer(Rest0, [D|Stack0], CPos),
    push_float(Stack, Rest, Toks, SPos, CPos1);
scan_exponent1(_Rest, _Stack, _Toks, _SPos, CPos) ->
    scan_error(float, CPos).

%% Convert the reversed character stack to a float and continue scanning
%% with a {float,SPos,F} token pushed; characters that list_to_float/1
%% rejects give an {illegal,float} error at the start position.
push_float(Stack, Rest, Toks, SPos, CPos) ->
    try list_to_float(lists:reverse(Stack)) of
        F ->
            scan1(Rest, [{float,SPos,F}|Toks], CPos)
    catch
        error:_ ->
            scan_error({illegal,float}, SPos)
    end.

%% Build the error result returned by the scanner: the position, this
%% module's name, and the error descriptor.
scan_error(In, Pos) ->
    ErrorInfo = {Pos,edoc_scanner,In},
    {error,ErrorInfo}.