1%% Copyright (c) 2008-2013 Robert Virding
2%%
3%% Licensed under the Apache License, Version 2.0 (the "License");
4%% you may not use this file except in compliance with the License.
5%% You may obtain a copy of the License at
6%%
7%%     http://www.apache.org/licenses/LICENSE-2.0
8%%
9%% Unless required by applicable law or agreed to in writing, software
10%% distributed under the License is distributed on an "AS IS" BASIS,
11%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12%% See the License for the specific language governing permissions and
13%% limitations under the License.
14
15%% File    : lfe_scan.xrl
16%% Author  : Robert Virding
17%% Purpose : Token definitions for Lisp Flavoured Erlang.
18
19Definitions.
%% Regular-expression macros; the rules refer to them as {Name}.
%% Of these, only D, H, SYM, SSYM and WS are referenced by the rules
%% in this file section; the others are defined but not used there.
%%
%% Digit classes: binary, octal, decimal, hexadecimal and base-36.
20B    = [01]
21O    = [0-7]
22D    = [0-9]
23H    = [0-9a-fA-F]
24B36  = [0-9a-zA-Z]
%% Letter classes: upper case, lower case, and either of the two.
25U    = [A-Z]
26L    = [a-z]
27A    = ({U}|{L})
%% DEL: brackets, parentheses, braces, double quote, semicolon and
%% every character from NUL up to and including space.
28DEL  = [][()}{";\000-\s]
%% SYM: any character that is not in DEL and not in the octal range
%% \177-\237.
29SYM  = [^][()}{";\000-\s\177-\237]
%% SSYM: like SYM but additionally excluding | # ` ' and comma; the
%% rules use it for the first character of an unquoted symbol.
30SSYM = [^][()}{"|;#`',\000-\s\177-\237]
%% WS: one character from NUL up to and including space, or a semicolon
%% followed by everything up to (not including) the next newline.
31WS   = ([\000-\s]|;[^\n]*)
32
33Rules.
34%% Bracketed Comments using #| foo |#
%%  Matches a # (optionally followed by decimal digits), a |, and then
%%  everything up to the first |# sequence. The matched text from the
%%  third character onwards is passed to block_comment/1, whose result
%%  (skip_token or an error tuple) becomes the result of the rule.
35#{D}*\|[^\|]*\|+([^#\|][^\|]*\|+)*# :
36        block_comment(string:substr(TokenChars, 3)).
37
38%% Separators
%%  Each rule below emits a two-element token {Type,TokenLine} where
%%  Type is a fixed atom; the matched text itself is not kept.
%%
%%  Quote, backquote, comma, comma-at and dot.
39'               :    {token,{'\'',TokenLine}}.
40`               :    {token,{'`',TokenLine}}.
41,               :    {token,{',',TokenLine}}.
42,@              :    {token,{',@',TokenLine}}.
43\.              :    {token,{'.',TokenLine}}.
%%  A single bracket, parenthesis or brace; the token type is the atom
%%  made from that one character.
44[][()}{]        :    {token,{list_to_atom(TokenChars),TokenLine}}.
45
%%  Sharpsign openers. Any decimal digits between the # and the
%%  following character are matched but do not appear in the token.
%%  NOTE(review): presumably #B( opens a binary, #M( a map and #( a
%%  tuple, with #. being an eval-style prefix - confirm against the parser.
46#{D}*[bB]\(     :    {token,{'#B(',TokenLine}}.
47#{D}*[mM]\(     :    {token,{'#M(',TokenLine}}.
48#{D}*\(         :    {token,{'#(',TokenLine}}.
49#{D}*\.         :    {token,{'#.',TokenLine}}.
50
%%  Sharpsign followed by backquote, semicolon, comma or comma-at,
%%  again with optional (discarded) digits after the #.
51#{D}*`          :    {token,{'#`',TokenLine}}.
52#{D}*;          :    {token,{'#;',TokenLine}}.
53#{D}*,          :    {token,{'#,',TokenLine}}.
54#{D}*,@         :    {token,{'#,@',TokenLine}}.
55
56%% Characters
%%  Matches # (with optional digits), a backslash, and then either x
%%  followed by one or more hex digits or any single character matched
%%  by the regex dot. skip_past/3 drops everything up to and including
%%  the backslash and char_token/2 turns the rest into a number token
%%  or an error.
57#{D}*\\(x{H}+|.) :   char_token(skip_past(TokenChars, $\\, $\\), TokenLine).
58
59%% Based numbers
%%  Fixed-base forms: #* and #b use base 2, #o base 8, #d base 10 and
%%  #x base 16. The base letter may be upper or lower case. skip_past/3
%%  removes the prefix up to and including the base character, so any
%%  digits between the # and that character are discarded, and
%%  base_token/3 converts what is left.
60#{D}*\*{SYM}+   :    base_token(skip_past(TokenChars, $*, $*), 2, TokenLine).
61#{D}*[bB]{SYM}+ :    base_token(skip_past(TokenChars, $b, $B), 2, TokenLine).
62#{D}*[oO]{SYM}+ :    base_token(skip_past(TokenChars, $o, $O), 8, TokenLine).
63#{D}*[dD]{SYM}+ :    base_token(skip_past(TokenChars, $d, $D), 10, TokenLine).
64#{D}*[xX]{SYM}+ :    base_token(skip_past(TokenChars, $x, $X), 16, TokenLine).
%%  Radix form: the decimal digits between the # and the r/R give the
%%  base. base1/3 reads them in base 10 and stops at the r/R, which is
%%  then dropped by the [_|Ds] match. With no digits the base is 0,
%%  which base_token/3 rejects as an illegal number base.
65#{D}*[rR]{SYM}+ :
66        %% Scan over digit chars to get base.
67        {Base,[_|Ds]} = base1(tl(TokenChars), 10, 0),
68        base_token(Ds, Base, TokenLine).
69
70%% String
%%  A double-quoted string whose body is any sequence of: backslash, x,
%%  hex digits and a terminating semicolon; backslash followed by one
%%  character matched by the regex dot; or any character other than a
%%  double quote or backslash. The action removes the first and last
%%  character of the match, expands escapes with chars/1 and emits
%%  {string,TokenLine,Chars}.
71"(\\x{H}+;|\\.|[^"\\])*" :
72        %% Strip quotes.
73        S = string:substr(TokenChars, 2, TokenLen - 2),
74        {token,{string,TokenLine,chars(S)}}.
%% Binary string
%%  A double-quoted string prefixed by a sharpsign. The body follows the
%%  same grammar as an ordinary string. Escapes are expanded with chars/1
%%  and the resulting code points are encoded as a UTF-8 binary, giving
%%  the token {binary,TokenLine,Binary}. When the code points cannot be
%%  encoded (for example a hex escape outside the Unicode range) the
%%  rule returns a scan error instead of a token.
#"(\\x{H}+;|\\.|[^"\\])*" :
        %% Strip the two leading characters and the closing quote.
        S = string:substr(TokenChars, 3, TokenLen - 3),
        %% characters_to_binary/3 returns an error or incomplete tuple
        %% rather than raising when the data cannot be encoded, so only
        %% a real binary may be placed in the token.
        case unicode:characters_to_binary(chars(S), utf8, utf8) of
            Bin when is_binary(Bin) -> {token,{binary,TokenLine,Bin}};
            _ -> {error,"illegal binary string"}
        end.
81%% Symbols
%%  A symbol quoted with vertical bars. The body uses the same escape
%%  forms as a string, with | taking the place of the double quote as
%%  the terminator. The action removes the surrounding bars, expands
%%  escapes with chars/1 and builds the token with symbol_token/2.
82\|(\\x{H}+;|\\.|[^|\\])*\| :
83        %% Strip quotes.
84        S = string:substr(TokenChars, 2, TokenLen - 2),
85        symbol_token(chars(S), TokenLine).
86%% Funs
%%  Matches #' followed by a symbol start character, further symbol
%%  characters, a slash and one or more decimal digits. The token is
%%  {'#\'',TokenLine,String} where String is the matched text without
%%  the leading two characters; it is kept as a string, not parsed here.
%%  NOTE(review): presumably of the form name/arity or module:name/arity
%%  - confirm against the parser that consumes this token.
87#'{SSYM}{SYM}*/{D}+ :
88        %% Strip sharpsign single-quote.
89        FunStr = string:substr(TokenChars,3),
90        {token,{'#\'',TokenLine,FunStr}}.
91%% Atoms
%%  Integers: an optional sign followed by decimal digits. The text is
%%  also matched by the symbol rule further down; this rule is listed
%%  first, and leex prefers the earlier rule among equally long matches.
92[+-]?{D}+       :
93        case catch {ok,list_to_integer(TokenChars)} of
94            {ok,I} -> {token,{number,TokenLine,I}};
95            _ -> {error,"illegal integer"}
96        end.
%%  Floats: optional sign, digits, a decimal point, digits, and an
%%  optional exponent (e or E, optional sign, digits). Digits are
%%  required on both sides of the point.
97[+-]?{D}+\.{D}+([eE][+-]?{D}+)? :
98        case catch {ok,list_to_float(TokenChars)} of
99            {ok,F} -> {token,{number,TokenLine,F}};
100            _ -> {error,"illegal float"}
101        end.
%%  Unquoted symbols: one symbol start character followed by any number
%%  of symbol characters; the token is built by symbol_token/2.
102{SSYM}{SYM}*    :
103        symbol_token(TokenChars, TokenLine).
%%  Whitespace, control characters and ;-to-end-of-line text produce no
%%  token.
104{WS}+           :    skip_token.
105
106Erlang code.
107%% Copyright (c) 2008-2013 Robert Virding
108%%
109%% Licensed under the Apache License, Version 2.0 (the "License");
110%% you may not use this file except in compliance with the License.
111%% You may obtain a copy of the License at
112%%
113%%     http://www.apache.org/licenses/LICENSE-2.0
114%%
115%% Unless required by applicable law or agreed to in writing, software
116%% distributed under the License is distributed on an "AS IS" BASIS,
117%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
118%% See the License for the specific language governing permissions and
119%% limitations under the License.
120
121%% File    : lfe_scan.erl
122%% Author  : Robert Virding
123%% Purpose : Token definitions for Lisp Flavoured Erlang.
124
125-export([start_symbol_char/1,symbol_char/1]).
126
127-import(string, [substr/2,substr/3]).
128
129%% start_symbol_char(Char) -> true | false.
130%% symbol_char(Char) -> true | false.
131%%  Define start symbol chars and symbol chars.
132
%% Sharpsign, backquote, quote, comma and the symbol quote character |
%% can never start a symbol; all other characters are judged by
%% symbol_char/1.
start_symbol_char(C) when C =:= $# ;
                          C =:= $` ;
                          C =:= $' ;            %'
                          C =:= $, ;
                          C =:= $| ->
    false;
start_symbol_char(C) ->
    symbol_char(C).
139
%% A character may appear in a symbol when it is not one of the
%% delimiters ( ) [ ] { } " ; and is either above space and at most ~
%% or strictly above octal 240.
symbol_char(C) ->
    case lists:member(C, "()[]{}\";") of
        true -> false;
        false -> (C > $\s andalso C =< $~) orelse C > $\240
    end.
149
150%% symbol_token(Chars, Line) -> {token,{symbol,Line,Symbol}} | {error,E}.
151%%  Build a symbol from list of legal characters, else error.
152
%% Turn the character list Cs into an atom and wrap it in a symbol
%% token for line L. Any error raised by list_to_atom/1 (for example
%% when the name is too long to be an atom) is reported as
%% {error,"illegal symbol"}.
%% NOTE: atoms are never garbage collected, so scanning untrusted input
%% can grow the atom table without bound.
symbol_token(Cs, L) ->
    %% Only the conversion itself is protected; building the token
    %% happens in the unprotected 'of' section.
    try list_to_atom(Cs) of
        S -> {token,{symbol,L,S}}
    catch
        error:_ -> {error,"illegal symbol"}
    end.
158
%% base_token(Chars, Base, Line) -> {token,{number,Line,Integer}} | {error,E}.
%%  Convert a string of Base characters into a number. We only allow
%%  base between 2 and 36, and an optional sign character first.
162
%% Reject bases outside 2..36, then split off an optional leading + or -
%% and hand the remaining characters, together with the sign as +1 or
%% -1, to base_token/4.
base_token(_Chars, B, _Line) when B < 2; B > 36 ->
    {error,"illegal number base"};
base_token(Chars, B, Line) ->
    {Sign,Digits} =
        case Chars of
            [$+|Ds] -> {+1,Ds};
            [$-|Ds] -> {-1,Ds};
            _ -> {+1,Chars}
        end,
    base_token(Digits, B, Sign, Line).
168
%% base_token(Chars, Base, Sign, Line) ->
%%     {token,{number,Line,Integer}} | {error,E}.
%%  Convert the digit characters Chars in base Base and multiply the
%%  result by Sign (+1 or -1). Every character must be a valid digit in
%%  the base and there must be at least one of them; otherwise the
%%  result is {error,"illegal based number"}.

base_token([], _B, _S, _L) ->
    %% No digits at all (input was just a sign); without this clause
    %% base1/3 would return {0,[]} and the input would scan as 0.
    {error,"illegal based number"};
base_token(Cs, B, S, L) ->
    case base1(Cs, B, 0) of
        {N,[]} -> {token,{number,L,S*N}};
        %% base1/3 stopped early on a character that is not a digit.
        {_,_} -> {error,"illegal based number"}
    end.
174
%% base1(Chars, Base, SoFar) -> {Number,RestChars}.
%%  Accumulate leading digits of Chars in base Base on top of SoFar.
%%  Stops at the first character that is not a digit in that base and
%%  returns the value reached together with the unconsumed characters.

base1([C|Rest]=Chars, Base, SoFar) ->
    case base_digit(C) of
        none -> {SoFar,Chars};
        Val when Val < Base -> base1(Rest, Base, SoFar * Base + Val);
        _TooBig -> {SoFar,Chars}
    end;
base1([], _Base, SoFar) ->
    {SoFar,[]}.

%% Numeric value of a digit character: 0-9 map to 0..9 and letters of
%% either case map to 10..35. Anything else gives none.
base_digit(C) when C >= $0, C =< $9 -> C - $0;
base_digit(C) when C >= $a, C =< $z -> C - $a + 10;
base_digit(C) when C >= $A, C =< $Z -> C - $A + 10;
base_digit(_) -> none.
186
-define(IS_UNICODE(C), ((C >= 0) and (C =< 16#10FFFF))).

%% char_token(InputChars, Line) -> {token,{number,Line,Char}} | {error,E}.
%%  Convert the text following the backslash of a character literal into
%%  a number token. A lone character stands for itself. An x followed by
%%  at least one more character is read as a hex number, which must use
%%  only hex digits and lie in the Unicode range 0..16#10FFFF.

char_token([C], L) ->
    {token,{number,L,C}};
char_token([$x|[_|_]=HexChars], L) ->
    {Code,Rest} = base1(HexChars, 16, 0),
    case Rest =:= [] andalso ?IS_UNICODE(Code) of
        true -> {token,{number,L,Code}};
        false -> {error,"illegal character"}
    end.
200
201%% chars(InputChars) -> Chars.
202%%  Convert an input string into the corresponding string characters.
203%%  We know that the input string is correct.
204
%% Expand the escape sequences in a string body. A backslash, x, hex
%% digits and a closing semicolon become the single code point given by
%% the hex digits. A backslash-x that does not complete that form is
%% treated like any other backslash escape and goes through
%% escape_char/1. Every other character is copied unchanged.
chars([$\\,$x,H|Rest]) ->
    case hex_char(H) andalso base1([H|Rest], 16, 0) of
        {Code,[$;|After]} -> [Code|chars(After)];
        _NoHexEscape -> [escape_char($x)|chars([H|Rest])]
    end;
chars([$\\,Esc|Rest]) ->
    [escape_char(Esc)|chars(Rest)];
chars([Char|Rest]) ->
    [Char|chars(Rest)];
chars([]) ->
    [].
217
%% True when C is a hexadecimal digit: 0-9, a-f or A-F.
hex_char(C) when C >= $0, C =< $9;
                 C >= $a, C =< $f;
                 C >= $A, C =< $F ->
    true;
hex_char(_) ->
    false.
222
%% Map the character following a backslash to the character it denotes.
%% Characters without a special meaning stand for themselves.
escape_char(C) ->
    case C of
        $b -> $\b;                      %\b = BS
        $t -> $\t;                      %\t = TAB
        $n -> $\n;                      %\n = LF
        $v -> $\v;                      %\v = VT
        $f -> $\f;                      %\f = FF
        $r -> $\r;                      %\r = CR
        $e -> $\e;                      %\e = ESC
        $s -> $\s;                      %\s = SPC
        $d -> $\d;                      %\d = DEL
        _ -> C
    end.
233
234%% Block Comment:
235%%  Provide a sensible error when people attempt to include nested
236%%  comments because currently the parser cannot process them without
237%%  a rebuild. But simply exploding on a '#|' is not going to be that
238%%  helpful.
239
%% Walk through the comment text looking for the two-character opener
%% of another block comment. Finding one gives an error; reaching the
%% end of the text without one means the comment is simply skipped.
block_comment([$#, $| | _]) ->
    {error, "illegal nested block comment"};
block_comment([_ | Rest]) ->
    block_comment(Rest);
block_comment([]) ->
    skip_token. %% No nesting found
246
247%% skip_until(String, Char1, Char2) -> String.
248%% skip_past(String, Char1, Char2) -> String.
249
250%% skip_until([C|_]=Cs, C1, C2) when C =:= C1 ; C =:= C2 -> Cs;
251%% skip_until([_|Cs], C1, C2) -> skip_until(Cs, C1, C2);
252%% skip_until([], _, _) -> [].
253
%% Drop characters from the front of the string up to and including the
%% first occurrence of either C1 or C2. Gives the empty list when
%% neither character occurs.
skip_past([C1|Rest], C1, _C2) -> Rest;
skip_past([C2|Rest], _C1, C2) -> Rest;
skip_past([_Other|Rest], C1, C2) -> skip_past(Rest, C1, C2);
skip_past([], _C1, _C2) -> [].
257