1// This file is part of PyANTLR. See LICENSE.txt for license
2// details..........Copyright (C) Wolfgang Haefelinger, 2004.
3//
4// $Id$
5
6header {
7
8    // import language specific stuff
9    // need to import my local module defining super classes etc.
10    import asn1
11}
12options {
13	language="Python";
14}
15
16class asn1_l extends Lexer("asn1.CharScanner");
17
18options {
19    k = 3;
20    charVocabulary = '\3'..'\377';
21    caseSensitive=true;
22    testLiterals = true;
23    codeGenMakeSwitchThreshold = 2;
24    codeGenBitsetTestThreshold = 2;
25    importVocab=ASN1;
26}
27
28tokens {
29    DOTDOT;
30    ELLIPSIS;
31}
32
33ASSIGN_OP           :   "::="   ;
34BAR                 :   '|'     ;
35COLON               :   ':'     ;
36COMMA               :   ','     ;
37DOT                 :   '.'     ;
38DOTDOT              :   ".."    ;
39ELLIPSIS            :   "..."   ;
40EXCLAMATION         :   '!'     ;
41INTERSECTION        :   '^'     ;
42LESS                :   '<'     ;
43L_BRACE             :   '{'     ;
44L_BRACKET           :   '['     ;
45LL_BRACKET          :  { self.state_with_syntax==False }?  "[["    ;
46L_PAREN             :   '('     ;
47MINUS               :   '-'     ;
48PLUS                :   '+'     ;
49R_BRACE             :   '}'     ;
50R_BRACKET           :   ']'     ;
51RR_BRACKET          :  { self.state_with_syntax==False }?  "]]"    ;
52R_PAREN             :   ')'     ;
53SEMI                :   ';'     ;
54AT                  :   '@'     ;
55
56
57/* These are whitespace (without newline) characters according to X.680:2002 */
58protected
59WSchr
60    : '\t'   // horizontal tab (HT)  '\t' 0x09  9
61    | ' '   // space          (SP)  ' '  0x20 32
62    ;
63
64/* Same as WSign - just ignore consumed character */
65protected
66WSign
67    : WSchr { $setText("") }
68    ;
69
70/* the end of line */
71protected
72EOLchr
73    :   (
74            options {
75                generateAmbigWarnings = false;
76            }
77        : '\r''\n'
78        | '\r'
79        | '\n'
80//        | '\v'   // vertical   tab (VT)       0x0b 11
81//        | '\f'   // form feed      (FF)  '\f' 0x0c 12
82        )
83        {
84            $newline
85        }
86    ;
87
88/* like EOL but we ignore the consumed symbol */
89protected
90EOLign
91    : EOLchr {
92            $setText("")
93        }
94    ;
95
96/* like EOL but we normalize consumed symbol */
97protected
98EOLnrm
99    : EOLchr { $setText("\n")  }
100    ;
101
102/* upper (ASCII) case characters */
103protected
104UPCHR
105    : 'A' .. 'Z'
106    ;
107
108/* lower (ASCII) case characters */
109protected
110LOCHR
111    : 'a' .. 'z'
112    ;
113
114/* what's a (arabic) digit */
115protected
116DIGIT
117    : '0' .. '9'
118    ;
119
120/* whats a (roman) letter - yes, the name sucks a bit */
121protected
122CHR
123    : UPCHR | LOCHR
124    ;
125
126/* what's allowed in an identifier */
127protected
128IDCHR
129    : CHR | '-' | DIGIT
130    ;
131
132
133/* a binary digit */
134protected
135BINCHR
136    :   ('0'|'1')
137    ;
138
139/* a hex digit */
140protected
141HEXCHR
142    :   ('0'..'9')
143    |   ('A'..'F')
144    |   ('a'..'f')
145    ;
146
147/* a binary string */
148protected
149BINSTR
150    : "'" (BINCHR|WSign|EOLign)+ "'B" ;
151
152/* a hex string */
153protected
154HEXSTR
155    : "'" (HEXCHR|WSign|EOLign)+ "'H"  ;
156
157/* escape character in character strings */
158protected
159CHResc
160    : '"' '"' { $setText("\"") }
161    ;
162
163
164/* define which input symbols we can skip (so called whitespace) */
165WS
166    : ( WSchr | EOLchr )+   { $skip }
167    ;
168
169
170/* A number is a sequence of digits - note that deliberatly we allow
171** here for tokens like '001' etc.
172*/
173TOKEN_NUMBER
174    : (DIGIT)+
175    ;
176
177
178/* what's an idenifier */
179ID
180{ lowchrseen=False}
181    : ("BIT" WS "STRING") => "BIT" WS "STRING"        {
182            $setType(TOKEN_BIT_STRING)
183        }
184    | ("OCTET" WS "STRING") => "OCTET" WS "STRING"    {
185            $setType(TOKEN_OCTET_STRING)
186        }
187    | ("OBJECT" WS "IDENTIFIER") => "OBJECT" WS "IDENTIFIER" {
188            $setType(TOKEN_OBJECT_IDENTIFIER)
189        }
190    | ("ENCODED" WS "BY") => "ENCODED" WS "BY" {
191            $setType(TOKEN_ENCODED_BY)
192        }
193    | ("CONSTRAINED" WS "BY") => "CONSTRAINED" WS "BY" {
194            $setType(TOKEN_CONSTRAINED_BY)
195        }
196    | ("DEFINED" WS "BY") => "DEFINED" WS "BY" {
197            $setType(TOKEN_DEFINED_BY)
198        }
199    | UPCHR ( LOCHR{lowchrseen=True}|UPCHR|DIGIT|'-')* {
200            $setType(TOKEN_Word)
201            if lowchrseen: pass
202            else: $setType(TOKEN_WORD)
203        }
204    |  LOCHR ( IDCHR )* {
205            $setType(TOKEN_word)
206        }
207    ;
208
209/* what's a field */
210FIELD
211{ lowchrseen=False }
212    :  '&' UPCHR ( LOCHR{lowchrseen=True}|UPCHR|DIGIT|'-')* {
213            $setType(TOKEN_Field)
214            if lowchrseen:
215              pass
216            else:
217              $setType(TOKEN_FIELD)
218        }
219    |  '&' LOCHR ( IDCHR )* { $setType(TOKEN_field) }
220    ;
221
222
223
224/* an octet string is either a bit string or a hex string */
225OCTSTR
226    : (BINSTR)=>BINSTR { $setType(TOKEN_BSTRING)  }
227    | HEXSTR           { $setType(TOKEN_HSTRING)  }
228    ;
229
230
231/* A character string: this rule is not 1oo% correct as it will not
232** ignore ws before  and  after eol. This needs  best to be handled
233** via a language specific function. Note  that  rule  EOLnrm  will
234** replace any eol character by \n to simplify text processing.
235** Contrary, ws is not normalized as ws can't be ignored in general.
236*/
237TOKEN_CSTRING
238    :   '"' (CHResc | EOLnrm | ~('"'|'\r'|'\n'))* '"' {
239            s = self.chr_ws_erase($getText,"\n","\t ")
240            $setText(s)
241        }
242    ;
243
244
245
246/* ASN.1 has kind of tricky comment rule: A comment starts with "--"
247** and ends either with a "--" or with a eol character. Nesting of
248** comments is therefore not possible, ie.
249**  -- not visible -- visible -- not visible
250** The real ugly thing about this is that you can't just uncomment
251** a line (regardless of it's content) by prefixing the liene with
252** "--". For example assume you have this line:
253**  one INTEGER ::= 1  -- sample integer
254** Then have this:
255**  -- one INTEGER ::= 1  -- sample integer
256** This will hide ASN.1 and just makes the comment visible!
257*/
258
259COMMENT
260    :
261        "--"
262        (
263            ~('-'|'\n'|'\r') | {self.LA(2) != '-'}? '-'
264        )*
265        {
266            if self.LA(1) == '-': self.match("--");
267            $skip
268        }
269    ;
270
271ALTCOMMENT
272    :   { altcomment == true }?
273        ( ALTCOMMENT1
274        | ALTCOMMENT2
275        | ALTCOMMENT3
276        )
277        {
278            $skip
279        }
280    ;
281
282/* Due to problematic ASN.1  commentaries we have an alternative -
283** "//" starts a comment that eat's up everything till end of line
284** (as in C++ and Java).
285*/
286
287protected
288ALTCOMMENT1
289    :
290        { altcomment == true }? "//" (~('\n'|'\r'))*
291        {
292            pass
293        }
294    ;
295
296/* We also also for typical C comments albeit not nested ones */
297protected
298ALTCOMMENT2
299    : "/*"
300        (
301            options {
302                greedy=false;
303            }
304        : '\r' ( options { warnWhenFollowAmbig=false; } : '\n')? { $newline }
305        | '\n' { $newline }
306        | .
307        )*
308        "*/"
309        {
310            pass
311        }
312    ;
313
314/* And as homage to the master of style, Niklaus Wirth, we also also
315** comments ala PASCAL */
316protected
317ALTCOMMENT3
318    : "{*"
319        (
320            options {
321                greedy=false;
322            }
323        : '\r' ( options { warnWhenFollowAmbig=false; } : '\n')? { $nl }
324        | '\n' { $nl; }
325        | .
326        )*
327        "*}"
328        {
329            pass
330        }
331    ;
332
333