1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 1997-2016. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20%%
21-module(asn1ct_tok).
22
23%% Tokenize ASN.1 code (input to parser generated with yecc)
24
25-export([file/1,format_error/1]).
26
27file(File0) ->
28    case file:open(File0, [read])  of
29	{error, Reason} ->
30	    {error,{File0,file:format_error(Reason)}};
31	{ok,Stream} ->
32	    try
33		process(Stream, 1, [])
34	    catch
35		throw:{error,Line,Reason} ->
36		    File = filename:basename(File0),
37		    Error = {structured_error,{File,Line},?MODULE,Reason},
38		    {error,[Error]}
39	    end
40    end.
41
42process(Stream, Lno, R) ->
43    process(io:get_line(Stream, ''), Stream, Lno, R).
44
45process(eof, Stream, Lno, Acc) ->
46    ok = file:close(Stream),
47    lists:reverse([{'$end',Lno}|Acc]);
48process(L, Stream, Lno0, Acc) when is_list(L) ->
49    try tokenise(Stream, L, Lno0, []) of
50	{Lno,[]} ->
51	    process(Stream, Lno, Acc);
52	{Lno,Ts} ->
53	    process(Stream, Lno, Ts++Acc)
54    catch
55	throw:{error,Reason} ->
56	    throw({error,Lno0,Reason})
57    end.
58
59format_error(eof_in_comment) ->
60    "premature end of file in multi-line comment";
61format_error(eol_in_token) ->
62    "end of line in token";
63format_error({invalid_binary_number,Str}) ->
64    io_lib:format("invalid binary number: '~s'", [Str]);
65format_error({invalid_hex_number,Str}) ->
66    io_lib:format("invalid hex number: '~s'", [Str]);
67format_error(Other) ->
68    io_lib:format("~p", [Other]).
69
70tokenise(Stream, [$&,H|T], Lno, R) when $A =< H , H =< $Z ->
71    {X,T1} = get_name(T, [H]),
72    tokenise(Stream, T1, Lno, [{typefieldreference,Lno,X}|R]);
73tokenise(Stream, [$&,H|T], Lno, R) when $a =< H , H =< $z ->
74    {X,T1} = get_name(T, [H]),
75    tokenise(Stream, T1, Lno, [{valuefieldreference,Lno,X}|R]);
76
77tokenise(Stream, "--"++T, Lno, R) ->
78    tokenise(Stream, skip_comment(T), Lno, R);
79
80tokenise(Stream, [$-,H|T], Lno, R) when $0 =< H , H =< $9 ->
81    {X, T1} = get_number(T, [H]),
82    tokenise(Stream, T1, Lno, [{number,Lno,-list_to_integer(X)}|R]);
83
84tokenise(Stream, "/*"++T, Lno0, R) ->
85    {Lno,T1} = skip_multiline_comment(Stream, T, Lno0, 0),
86    tokenise(Stream, T1, Lno, R);
87
88tokenise(Stream, "::="++T, Lno, R) ->
89    tokenise(Stream, T, Lno, [{'::=',Lno}|R]);
90tokenise(Stream, ":"++T, Lno, R) ->
91    tokenise(Stream, T, Lno, [{':',Lno}|R]);
92
93tokenise(Stream, "'"++T0, Lno, R) ->
94    {Thing, T1} = collect_quoted(T0, Lno, []),
95    tokenise(Stream, T1, Lno, [Thing|R]);
96tokenise(Stream,[$"|T],Lno,R) ->
97    {Str,T1} = collect_string(T,Lno),
98    tokenise(Stream,T1,Lno,[Str|R]);
99
100tokenise(Stream, "{"++T, Lno, R) ->
101    tokenise(Stream, T, Lno, [{'{',Lno}|R]);
102tokenise(Stream, "}"++T, Lno, R) ->
103    tokenise(Stream, T, Lno, [{'}',Lno}|R]);
104
105%% Even though x.680 specify '[[' and ']]' as lexical items
106%% it does not work to have them as such since the single '[' and ']' can
107%% be used beside each other in 'WITH SYNTAX' in x.681.
108%% The solution chosen here, i.e. to have them as separate lexical items
109%% will not detect the cases where there is white space between them
110%% which would be an error in the use in ExtensionAdditionGroups.
111
112tokenise(Stream, "]"++T, Lno, R) ->
113    tokenise(Stream, T, Lno, [{']',Lno}|R]);
114tokenise(Stream, "["++T,Lno,R) ->
115    tokenise(Stream, T, Lno, [{'[',Lno}|R]);
116
117tokenise(Stream, ","++T,Lno,R) ->
118    tokenise(Stream, T, Lno, [{',',Lno}|R]);
119
120tokenise(Stream, "("++T, Lno, R) ->
121    tokenise(Stream, T, Lno, [{'(',Lno}|R]);
122tokenise(Stream, ")"++T, Lno, R) ->
123    tokenise(Stream, T, Lno, [{')',Lno}|R]);
124
125tokenise(Stream, "..."++T,Lno,R) ->
126    tokenise(Stream, T, Lno, [{'...',Lno}|R]);
127tokenise(Stream, ".."++T, Lno, R) ->
128    tokenise(Stream, T, Lno, [{'..',Lno}|R]);
129tokenise(Stream, "."++T, Lno, R) ->
130    tokenise(Stream, T, Lno, [{'.',Lno}|R]);
131
132tokenise(Stream, "|"++T, Lno, R) ->
133    tokenise(Stream, T, Lno, [{'|',Lno}|R]);
134
135tokenise(Stream, [H|T], Lno, R) when $A =< H , H =< $Z ->
136    {X,T1} = get_name(T, [H]),
137    case reserved_word(X) of
138	true ->
139	    tokenise(Stream, T1, Lno, [{X,Lno}|R]);
140	false ->
141	    tokenise(Stream, T1, Lno, [{typereference,Lno,X}|R]);
142	rstrtype ->
143	    tokenise(Stream, T1, Lno, [{restrictedcharacterstringtype,Lno,X}|R])
144    end;
145
146tokenise(Stream, [H|T], Lno, R) when $a =< H , H =< $z ->
147    {X, T1} = get_name(T, [H]),
148    tokenise(Stream, T1, Lno, [{identifier,Lno,X}|R]);
149
150tokenise(Stream, [H|T], Lno, R) when $0 =< H , H =< $9 ->
151    {X, T1} = get_number(T, [H]),
152    tokenise(Stream, T1, Lno, [{number,Lno,list_to_integer(X)}|R]);
153
154tokenise(Stream, [H|T], Lno, R) when H =< $\s ->
155    tokenise(Stream, T, Lno, R);
156
157tokenise(Stream, [H|T], Lno, R) ->
158    tokenise(Stream, T, Lno, [{list_to_atom([H]),Lno}|R]);
159tokenise(_Stream, [], Lno, R) ->
160    {Lno+1,R}.
161
162
163collect_string(L, Lno) ->
164    collect_string(L, Lno, []).
165
166collect_string([$"|T], _Lno, Str) ->
167    {{cstring,1,lists:reverse(Str)},T};
168collect_string([H|T], Lno, Str) ->
169    collect_string(T, Lno, [H|Str]);
170collect_string([], _, _) ->
171    throw({error,missing_quote_at_eof}).
172
173%% <name> is letters digits hyphens.
174%% Hypen is not the last character. Hypen hyphen is NOT allowed.
175%%
176%% <identifier> ::= <lowercase> <name>
177
178get_name([$-,Char|T]=T0, Acc) ->
179    case isalnum(Char) of
180	true ->
181	    get_name(T, [Char,$-|Acc]);
182	false ->
183	    {list_to_atom(lists:reverse(Acc)),T0}
184    end;
185get_name([$-|_]=T, Acc) ->
186    {list_to_atom(lists:reverse(Acc)),T};
187get_name([Char|T]=T0, Acc) ->
188    case isalnum(Char) of
189	true ->
190	    get_name(T, [Char|Acc]);
191	false ->
192	    {list_to_atom(lists:reverse(Acc)),T0}
193    end;
194get_name([], Acc) ->
195    {list_to_atom(lists:reverse(Acc)),[]}.
196
197isalnum(H) when $A =< H , H =< $Z ->
198    true;
199isalnum(H) when $a =< H , H =< $z ->
200    true;
201isalnum(H) when $0 =< H , H =< $9 ->
202    true;
203isalnum(_) ->
204    false.
205
206isdigit(H) when $0 =< H , H =< $9 ->
207    true;
208isdigit(_) ->
209    false.
210
211get_number([H|T]=T0, L) ->
212    case isdigit(H) of
213	true ->
214	    get_number(T, [H|L]);
215	false ->
216	    {lists:reverse(L), T0}
217    end;
218get_number([], L) ->
219    {lists:reverse(L), []}.
220
221skip_comment([]) -> [];
222skip_comment("--"++T) -> T;
223skip_comment([_|T]) -> skip_comment(T).
224
225skip_multiline_comment(Stream, [], Lno, Level) ->
226    case io:get_line(Stream, '') of
227	eof ->
228	    throw({error,eof_in_comment});
229	Line ->
230	    skip_multiline_comment(Stream, Line, Lno+1, Level)
231    end;
232skip_multiline_comment(_Stream, "*/"++T, Lno, 0) ->
233    {Lno,T};
234skip_multiline_comment(Stream, "*/"++T, Lno, Level) ->
235    skip_multiline_comment(Stream, T, Lno, Level - 1);
236skip_multiline_comment(Stream, "/*"++T, Lno, Level) ->
237    skip_multiline_comment(Stream, T, Lno, Level + 1);
238skip_multiline_comment(Stream, [_|T], Lno, Level) ->
239    skip_multiline_comment(Stream, T, Lno, Level).
240
241collect_quoted("'B"++T, Lno, L) ->
242    case validate_bin(L) of
243        {ok, Bin} ->
244            {{bstring,Lno,Bin}, T};
245        false ->
246            throw({error,{invalid_binary_number,lists:reverse(L)}})
247    end;
248collect_quoted("'H"++T, Lno, L) ->
249    case validate_hex(L) of
250        {ok, Hex} ->
251            {{hstring,Lno,Hex}, T};
252        false ->
253            throw({error,{invalid_hex_number,lists:reverse(L)}})
254    end;
255collect_quoted([H|T], Lno, L) ->
256    collect_quoted(T, Lno,[H|L]);
257collect_quoted([], _, _) ->        % This should be allowed FIX later
258    throw({error,eol_in_token}).
259
260validate_bin(L) ->
261    validate_bin(L,[]).
262
263validate_bin([H|T], A) when H =:= $0; H =:= $1 ->
264    validate_bin(T, [H|A]);
265validate_bin([$\s|T], A) ->
266    validate_bin(T, A);
267validate_bin([_|_], _) ->
268    false;
269validate_bin([], A) ->
270    {ok, A}.
271
272validate_hex(L) ->
273    validate_hex(L,[]).
274
275validate_hex([H|T], A) when $0 =< H , H =< $9 ->
276    validate_hex(T, [H|A]);
277validate_hex([H|T], A) when $A =< H , H =< $F ->
278    validate_hex(T, [H|A]);
279validate_hex([$\s|T], A) ->
280    validate_hex(T, A);
281validate_hex([_|_], _) ->
282    false;
283validate_hex([], A) ->
284    {ok, A}.
285
286%% reserved_word(A) -> true|false|rstrtype
287%% A = atom()
288%% returns true if A is a reserved ASN.1 word
289%% returns false if A is not a reserved word
290%% returns rstrtype if A is a reserved word in the group
291%% 	RestrictedCharacterStringType
292reserved_word('ABSENT') -> true;
293reserved_word('ALL') -> true;
294reserved_word('ANY') -> true;
295reserved_word('APPLICATION') -> true;
296reserved_word('AUTOMATIC') -> true;
297reserved_word('BEGIN') -> true;
298reserved_word('BIT') -> true;
299reserved_word('BMPString') -> rstrtype;
300reserved_word('BOOLEAN') -> true;
301reserved_word('BY') -> true;
302reserved_word('CHARACTER') -> true;
303reserved_word('CHOICE') -> true;
304reserved_word('CLASS') -> true;
305reserved_word('COMPONENT') -> true;
306reserved_word('COMPONENTS') -> true;
307reserved_word('CONSTRAINED') -> true;
308reserved_word('CONTAINING') -> true;
309reserved_word('DEFAULT') -> true;
310reserved_word('DEFINED') -> true; % not present in X.680 07/2002
311reserved_word('DEFINITIONS') -> true;
312reserved_word('EMBEDDED') -> true;
313reserved_word('ENCODED') -> true;
314reserved_word('END') -> true;
315reserved_word('ENUMERATED') -> true;
316reserved_word('EXCEPT') -> true;
317reserved_word('EXPLICIT') -> true;
318reserved_word('EXPORTS') -> true;
319reserved_word('EXTENSIBILITY') -> true;
320reserved_word('EXTERNAL') -> true;
321reserved_word('FALSE') -> true;
322reserved_word('FROM') -> true;
323reserved_word('GeneralizedTime') -> true;
324reserved_word('GeneralString') -> rstrtype;
325reserved_word('GraphicString') -> rstrtype;
326reserved_word('IA5String') -> rstrtype;
327reserved_word('IDENTIFIER') -> true;
328reserved_word('IMPLICIT') -> true;
329reserved_word('IMPLIED') -> true;
330reserved_word('IMPORTS') -> true;
331reserved_word('INCLUDES') -> true;
332reserved_word('INSTANCE') -> true;
333reserved_word('INTEGER') -> true;
334reserved_word('INTERSECTION') -> true;
335reserved_word('MAX') -> true;
336reserved_word('MIN') -> true;
337reserved_word('MINUS-INFINITY') -> true;
338reserved_word('NULL') -> true;
339reserved_word('NumericString') -> rstrtype;
340reserved_word('OBJECT') -> true;
341reserved_word('ObjectDescriptor') -> true;
342reserved_word('OCTET') -> true;
343reserved_word('OF') -> true;
344reserved_word('OPTIONAL') -> true;
345reserved_word('PATTERN') -> true;
346reserved_word('PDV') -> true;
347reserved_word('PLUS-INFINITY') -> true;
348reserved_word('PRESENT') -> true;
349reserved_word('PrintableString') -> rstrtype;
350reserved_word('PRIVATE') -> true;
351reserved_word('REAL') -> true;
352reserved_word('RELATIVE-OID') -> true;
353reserved_word('SEQUENCE') -> true;
354reserved_word('SET') -> true;
355reserved_word('SIZE') -> true;
356reserved_word('STRING') -> true;
357reserved_word('SYNTAX') -> true;
358reserved_word('T61String') -> rstrtype;
359reserved_word('TAGS') -> true;
360reserved_word('TeletexString') -> rstrtype;
361reserved_word('TRUE') -> true;
362reserved_word('UNION') -> true;
363reserved_word('UNIQUE') -> true;
364reserved_word('UNIVERSAL') -> true;
365reserved_word('UniversalString') -> rstrtype;
366reserved_word('UTCTime') -> true;
367reserved_word('UTF8String') -> rstrtype;
368reserved_word('VideotexString') -> rstrtype;
369reserved_word('VisibleString') -> rstrtype;
370reserved_word('WITH') -> true;
371reserved_word(_) -> false.
372