1
2package hclsyntax
3
4import (
5    "bytes"
6
7    "github.com/hashicorp/hcl2/hcl"
8)
9
10// This file is generated from scan_tokens.rl. DO NOT EDIT.
11%%{
12  # (except when you are actually in scan_tokens.rl here, so edit away!)
13
14  machine hcltok;
15  write data;
16}%%
// In scan_tokens.rl, the Ragel section above names the state machine "hcltok",
// and its "write data" statement has Ragel emit the machine's static data at
// this point in the output.
17
// scanTokens runs the Ragel-generated "hcltok" scanner over data and returns
// the tokens collected in a tokenAccum (f.Tokens).
//
// data is first passed through stripUTF8BOM, and start.Byte is advanced by
// the number of bytes that call removed. filename and start seed the
// tokenAccum used to record each token. mode selects the scanner entry
// point: scanNormal starts in "main", scanTemplate in "bareTemplate" and
// scanIdentOnly in "identOnly"; any other mode panics.
//
// If the scanner stops outside a final state, the input from ts to the end
// of data is emitted as one extra token: TokenStringLit when scanning a bare
// template with an empty call stack, TokenInvalid otherwise. A zero-length
// TokenEOF positioned at len(data) is always emitted last.
18func scanTokens(data []byte, filename string, start hcl.Pos, mode scanMode) []Token {
    // Continue with whatever stripUTF8BOM returns, moving start.Byte forward
    // by the number of bytes it removed.
19    stripData := stripUTF8BOM(data)
20    start.Byte += len(data) - len(stripData)
21    data = stripData
22
    // f collects the tokens emitted below and is seeded with the starting
    // position.
23    f := &tokenAccum{
24        Filename:  filename,
25        Bytes:     data,
26        Pos:       start,
27        StartByte: start.Byte,
28    }
29
30    %%{
31        include UnicodeDerived "unicode_derived.rl";
32
33        UTF8Cont = 0x80 .. 0xBF;
34        AnyUTF8 = (
35            0x00..0x7F |
36            0xC0..0xDF . UTF8Cont |
37            0xE0..0xEF . UTF8Cont . UTF8Cont |
38            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
39        );
40        BrokenUTF8 = any - AnyUTF8;
41
42        NumberLitContinue = (digit|'.'|('e'|'E') ('+'|'-')? digit);
43        NumberLit = digit ("" | (NumberLitContinue - '.') | (NumberLitContinue* (NumberLitContinue - '.')));
44        Ident = (ID_Start | '_') (ID_Continue | '-')*;
45
46        # Symbols that just represent themselves are handled as a single rule.
47        SelfToken = "[" | "]" | "(" | ")" | "." | "," | "*" | "/" | "%" | "+" | "-" | "=" | "<" | ">" | "!" | "?" | ":" | "\n" | "&" | "|" | "~" | "^" | ";" | "`" | "'";
48
49        EqualOp = "==";
50        NotEqual = "!=";
51        GreaterThanEqual = ">=";
52        LessThanEqual = "<=";
53        LogicalAnd = "&&";
54        LogicalOr = "||";
55
56        Ellipsis = "...";
57        FatArrow = "=>";
58
59        Newline = '\r' ? '\n';
60        EndOfLine = Newline;
61
62        BeginStringTmpl = '"';
63        BeginHeredocTmpl = '<<' ('-')? Ident Newline;
64
65        Comment = (
66            # The :>> operator in these is a "finish-guarded concatenation",
67            # which terminates the sequence on its left when it completes
68            # the sequence on its right.
69            # In the single-line comment cases this is allowing us to make
70            # the trailing EndOfLine optional while still having the overall
71            # pattern terminate. In the multi-line case it ensures that
72            # the first comment in the file ends at the first */, rather than
73            # gobbling up all of the "any*" until the _final_ */ in the file.
74            ("#" (any - EndOfLine)* :>> EndOfLine?) |
75            ("//" (any - EndOfLine)* :>> EndOfLine?) |
76            ("/*" any* :>> "*/")
77        );
78
79        # Note: hclwrite assumes that only ASCII spaces appear between tokens,
80        # and uses this assumption to recreate the spaces between tokens by
81        # looking at byte offset differences. This means it will produce
82        # incorrect results in the presence of tabs, but that's acceptable
83        # because the canonical style (which hclwrite itself can impose
84        # automatically is to never use tabs).
85        Spaces = (' ' | 0x09)+;
86
87        action beginStringTemplate {
            // Emit the opening quote, then scan the string body with the
            // stringTemplate scanner until it returns.
88            token(TokenOQuote);
89            fcall stringTemplate;
90        }
91
92        action endStringTemplate {
            // Emit the closing quote and return to whichever scanner called
            // stringTemplate.
93            token(TokenCQuote);
94            fret;
95        }
96
97        action beginHeredocTemplate {
98            token(TokenOHeredoc);
99            // the token is currently the whole heredoc introducer, like
100            // <<EOT or <<-EOT, followed by a newline. We want to extract
101            // just the "EOT" portion that we'll use as the closing marker.
102
            // ts+2 skips the leading "<<" and te-1 drops the final '\n'.
103            marker := data[ts+2:te-1]
            // Drop the optional '-' of the "<<-" form.
104            if marker[0] == '-' {
105                marker = marker[1:]
106            }
            // The introducer's newline may have been "\r\n", in which case
            // the '\r' is still attached to the marker at this point.
107            if marker[len(marker)-1] == '\r' {
108                marker = marker[:len(marker)-1]
109            }
110
            // Push this heredoc onto the stack of heredocs in progress; its
            // body begins at the start of a line.
111            heredocs = append(heredocs, heredocInProgress{
112                Marker:      marker,
113                StartOfLine: true,
114            })
115
116            fcall heredocTemplate;
117        }
118
119        action heredocLiteralEOL {
120            // This action is called specifically when a heredoc literal
121            // ends with a newline character.
122
123            // This might actually be our end marker.
124            topdoc := &heredocs[len(heredocs)-1]
            // The marker is only looked for when this literal began at the
            // start of a line; surrounding whitespace is trimmed before the
            // comparison.
125            if topdoc.StartOfLine {
126                maybeMarker := bytes.TrimSpace(data[ts:te])
127                if bytes.Equal(maybeMarker, topdoc.Marker) {
128                    // We actually emit two tokens here: the end-of-heredoc
129                    // marker first, and then separately the newline that
130                    // follows it. This then avoids issues with the closing
131                    // marker consuming a newline that would normally be used
132                    // to mark the end of an attribute definition.
133                    // We might have either a \n sequence or an \r\n sequence
134                    // here, so we must handle both.
                    // nls and nle bound the newline token; te is pulled back
                    // so that the marker token stops before the newline.
135                    nls := te-1
136                    nle := te
137                    te--
138                    if data[te-1] == '\r' {
139                        // back up one more byte
140                        nls--
141                        te--
142                    }
143                    token(TokenCHeredoc);
                    // Re-point the match at the newline bytes so the second
                    // token covers exactly those.
144                    ts = nls
145                    te = nle
146                    token(TokenNewline);
                    // This heredoc is finished: pop it and return to the
                    // scanner that called heredocTemplate.
147                    heredocs = heredocs[:len(heredocs)-1]
148                    fret;
149                }
150            }
151
            // Not the end marker: this literal ended with a newline, so the
            // next literal begins at the start of a line.
152            topdoc.StartOfLine = true;
153            token(TokenStringLit);
154        }
155
156        action heredocLiteralMidline {
157            // This action is called when a heredoc literal _doesn't_ end
158            // with a newline character, e.g. because we're about to enter
159            // an interpolation sequence.
160            heredocs[len(heredocs)-1].StartOfLine = false;
161            token(TokenStringLit);
162        }
163
164        action bareTemplateLiteral {
165            token(TokenStringLit);
166        }
167
168        action beginTemplateInterp {
169            token(TokenTemplateInterp);
            // Record the brace depth of this sequence so that closeBrace can
            // recognize the matching closing brace and return from main.
170            braces++;
171            retBraces = append(retBraces, braces);
            // If a heredoc is in progress, whatever follows this sequence is
            // no longer at the start of a line.
172            if len(heredocs) > 0 {
173                heredocs[len(heredocs)-1].StartOfLine = false;
174            }
175            fcall main;
176        }
177
178        action beginTemplateControl {
179            token(TokenTemplateControl);
            // Same bookkeeping as beginTemplateInterp: remember the brace
            // depth at which this sequence must be closed.
180            braces++;
181            retBraces = append(retBraces, braces);
182            if len(heredocs) > 0 {
183                heredocs[len(heredocs)-1].StartOfLine = false;
184            }
185            fcall main;
186        }
187
188        action openBrace {
189            token(TokenOBrace);
190            braces++;
191        }
192
193        action closeBrace {
            // A closing brace at the depth recorded by the innermost template
            // sequence ends that sequence and returns to the calling scanner;
            // any other closing brace is an ordinary TokenCBrace.
194            if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces {
195                token(TokenTemplateSeqEnd);
196                braces--;
197                retBraces = retBraces[0:len(retBraces)-1]
198                fret;
199            } else {
200                token(TokenCBrace);
201                braces--;
202            }
203        }
204
205        action closeTemplateSeqEatWhitespace {
206            // Only consume from the retBraces stack and return if we are at
207            // a suitable brace nesting level, otherwise things will get
208            // confused. (Not entering this branch indicates a syntax error,
209            // which we will catch in the parser.)
210            if len(retBraces) > 0 && retBraces[len(retBraces)-1] == braces {
211                token(TokenTemplateSeqEnd);
212                braces--;
213                retBraces = retBraces[0:len(retBraces)-1]
214                fret;
215            } else {
216                // We intentionally generate a TokenTemplateSeqEnd here,
217                // even though the user apparently wanted a brace, because
218                // we want to allow the parser to catch the incorrect use
219                // of a ~} to balance a generic opening brace, rather than
220                // a template sequence.
221                token(TokenTemplateSeqEnd);
222                braces--;
223            }
224        }
225
226        TemplateInterp = "${" ("~")?;
227        TemplateControl = "%{" ("~")?;
228        EndStringTmpl = '"';
229        NewlineChars = ("\r"|"\n");
230        NewlineCharsSeq = NewlineChars+;
231        StringLiteralChars = (AnyUTF8 - NewlineChars);
232        TemplateIgnoredNonBrace = (^'{' %{ fhold; });
233        TemplateNotInterp = '$' (TemplateIgnoredNonBrace | TemplateInterp);
234        TemplateNotControl = '%' (TemplateIgnoredNonBrace | TemplateControl);
235        QuotedStringLiteralWithEsc = ('\\' StringLiteralChars) | (StringLiteralChars - ("$" | '%' | '"' | "\\"));
236        TemplateStringLiteral = (
237            (TemplateNotInterp) |
238            (TemplateNotControl) |
239            (QuotedStringLiteralWithEsc)+
240        );
241        HeredocStringLiteral = (
242            (TemplateNotInterp) |
243            (TemplateNotControl) |
244            (StringLiteralChars - ("$" | '%'))*
245        );
246        BareStringLiteral = (
247            (TemplateNotInterp) |
248            (TemplateNotControl) |
249            (StringLiteralChars - ("$" | '%'))*
250        ) Newline?;
251
252        stringTemplate := |*
253            TemplateInterp        => beginTemplateInterp;
254            TemplateControl       => beginTemplateControl;
255            EndStringTmpl         => endStringTemplate;
256            TemplateStringLiteral => { token(TokenQuotedLit); };
257            NewlineCharsSeq       => { token(TokenQuotedNewline); };
258            AnyUTF8               => { token(TokenInvalid); };
259            BrokenUTF8            => { token(TokenBadUTF8); };
260        *|;
261
262        heredocTemplate := |*
263            TemplateInterp        => beginTemplateInterp;
264            TemplateControl       => beginTemplateControl;
265            HeredocStringLiteral EndOfLine => heredocLiteralEOL;
266            HeredocStringLiteral  => heredocLiteralMidline;
267            BrokenUTF8            => { token(TokenBadUTF8); };
268        *|;
269
270        bareTemplate := |*
271            TemplateInterp        => beginTemplateInterp;
272            TemplateControl       => beginTemplateControl;
273            BareStringLiteral     => bareTemplateLiteral;
274            BrokenUTF8            => { token(TokenBadUTF8); };
275        *|;
276
277        identOnly := |*
278            Ident            => { token(TokenIdent) };
279            BrokenUTF8       => { token(TokenBadUTF8) };
280            AnyUTF8          => { token(TokenInvalid) };
281        *|;
282
283        main := |*
284            Spaces           => {};
285            NumberLit        => { token(TokenNumberLit) };
286            Ident            => { token(TokenIdent) };
287
288            Comment          => { token(TokenComment) };
289            Newline          => { token(TokenNewline) };
290
291            EqualOp          => { token(TokenEqualOp); };
292            NotEqual         => { token(TokenNotEqual); };
293            GreaterThanEqual => { token(TokenGreaterThanEq); };
294            LessThanEqual    => { token(TokenLessThanEq); };
295            LogicalAnd       => { token(TokenAnd); };
296            LogicalOr        => { token(TokenOr); };
297            Ellipsis         => { token(TokenEllipsis); };
298            FatArrow         => { token(TokenFatArrow); };
299            SelfToken        => { selfToken() };
300
301            "{"              => openBrace;
302            "}"              => closeBrace;
303
304            "~}"             => closeTemplateSeqEatWhitespace;
305
306            BeginStringTmpl  => beginStringTemplate;
307            BeginHeredocTmpl => beginHeredocTemplate;
308
309            BrokenUTF8       => { token(TokenBadUTF8) };
310            AnyUTF8          => { token(TokenInvalid) };
311        *|;
312
313    }%%
314
315    // Ragel state
316	p := 0  // "Pointer" into data
317	pe := len(data) // End-of-data "pointer"
    // ts and te are the byte offsets passed to emitToken as the bounds of the
    // current match; act, stack and top are further variables declared for
    // the generated scanner code (stack is grown and shrunk by the
    // prepush/postpop blocks below).
318    ts := 0
319    te := 0
320    act := 0
321    eof := pe
322    var stack []int
323    var top int
324
    // Choose the scanner entry state that corresponds to the requested mode.
325    var cs int // current state
326    switch mode {
327    case scanNormal:
328        cs = hcltok_en_main
329    case scanTemplate:
330        cs = hcltok_en_bareTemplate
331    case scanIdentOnly:
332        cs = hcltok_en_identOnly
333    default:
334        panic("invalid scanMode")
335    }
336
337    braces := 0
338    var retBraces []int // stack of brace levels that cause us to use fret
339    var heredocs []heredocInProgress // stack of heredocs we're currently processing
340
    // Make room on the call stack before each fcall and release the slot
    // again after each fret.
341    %%{
342        prepush {
343            stack = append(stack, 0);
344        }
345        postpop {
346            stack = stack[:len(stack)-1];
347        }
348    }%%
349
350    // Make Go compiler happy
    // (blank assignments count as uses, so these variables are never
    // reported as declared-and-unused)
351    _ = ts
352    _ = te
353    _ = act
354    _ = eof
355
    // token emits a token of type ty covering the current match, ts to te.
356    token := func (ty TokenType) {
357        f.emitToken(ty, ts, te)
358    }
    // selfToken emits the single byte just matched, using the byte's own
    // value as the token type.
359    selfToken := func () {
360        b := data[ts:te]
361        if len(b) != 1 {
362            // should never happen
363            panic("selfToken only works for single-character tokens")
364        }
365        f.emitToken(TokenType(b[0]), ts, te)
366    }
367
    // Emit the generated initialization and execution code. "nocs" leaves cs
    // alone, since the entry state was already chosen from mode above.
368    %%{
369        write init nocs;
370        write exec;
371    }%%
372
373    // If we fall out here without being in a final state then we've
374    // encountered something that the scanner can't match, which we'll
375    // deal with as an invalid.
376    if cs < hcltok_first_final {
377        if mode == scanTemplate && len(stack) == 0 {
378            // If we're scanning a bare template then any straggling
379            // top-level stuff is actually literal string, rather than
380            // invalid. This handles the case where the template ends
381            // with a single "$" or "%", which trips us up because we
382            // want to see another character to decide if it's a sequence
383            // or an escape.
384            f.emitToken(TokenStringLit, ts, len(data))
385        } else {
386            f.emitToken(TokenInvalid, ts, len(data))
387        }
388    }
389
390    // We always emit a synthetic EOF token at the end, since it gives the
391    // parser position information for an "unexpected EOF" diagnostic.
392    f.emitToken(TokenEOF, len(data), len(data))
393
394    return f.Tokens
395}
396