package hclsyntax

import (
	"bytes"
	"fmt"

	"github.com/apparentlymart/go-textseg/textseg"
	"github.com/hashicorp/hcl2/hcl"
)

// Token represents a sequence of bytes from some HCL code that has been
// tagged with a type and its range within the source file.
type Token struct {
	Type  TokenType
	Bytes []byte
	Range hcl.Range
}

// Tokens is a slice of Token.
type Tokens []Token

// TokenType is an enumeration used for the Type field on Token.
type TokenType rune

const (
	// Single-character tokens are represented by their own character, for
	// convenience in producing these within the scanner. However, the values
	// are otherwise arbitrary and just intended to be mnemonic for humans
	// who might see them in debug output.

	TokenOBrace   TokenType = '{'
	TokenCBrace   TokenType = '}'
	TokenOBrack   TokenType = '['
	TokenCBrack   TokenType = ']'
	TokenOParen   TokenType = '('
	TokenCParen   TokenType = ')'
	TokenOQuote   TokenType = '«'
	TokenCQuote   TokenType = '»'
	TokenOHeredoc TokenType = 'H'
	TokenCHeredoc TokenType = 'h'

	TokenStar    TokenType = '*'
	TokenSlash   TokenType = '/'
	TokenPlus    TokenType = '+'
	TokenMinus   TokenType = '-'
	TokenPercent TokenType = '%'

	TokenEqual         TokenType = '='
	TokenEqualOp       TokenType = '≔'
	TokenNotEqual      TokenType = '≠'
	TokenLessThan      TokenType = '<'
	TokenLessThanEq    TokenType = '≤'
	TokenGreaterThan   TokenType = '>'
	TokenGreaterThanEq TokenType = '≥'

	TokenAnd  TokenType = '∧'
	TokenOr   TokenType = '∨'
	TokenBang TokenType = '!'

	TokenDot   TokenType = '.'
	TokenComma TokenType = ','

	TokenEllipsis TokenType = '…'
	TokenFatArrow TokenType = '⇒'

	TokenQuestion TokenType = '?'
	TokenColon    TokenType = ':'

	TokenTemplateInterp  TokenType = '∫'
	TokenTemplateControl TokenType = 'λ'
	TokenTemplateSeqEnd  TokenType = '∎'

	TokenQuotedLit TokenType = 'Q' // might contain backslash escapes
	TokenStringLit TokenType = 'S' // cannot contain backslash escapes
	TokenNumberLit TokenType = 'N'
	TokenIdent     TokenType = 'I'

	TokenComment TokenType = 'C'

	TokenNewline TokenType = '\n'
	TokenEOF     TokenType = '␄'

	// The rest are not used in the language but recognized by the scanner so
	// we can generate good diagnostics in the parser when users try to write
	// things that might work in other languages they are familiar with, or
	// simply make incorrect assumptions about the HCL language.

	TokenBitwiseAnd    TokenType = '&'
	TokenBitwiseOr     TokenType = '|'
	TokenBitwiseNot    TokenType = '~'
	TokenBitwiseXor    TokenType = '^'
	TokenStarStar      TokenType = '➚'
	TokenApostrophe    TokenType = '\''
	TokenBacktick      TokenType = '`'
	TokenSemicolon     TokenType = ';'
	TokenTabs          TokenType = '␉'
	TokenInvalid       TokenType = '�'
	TokenBadUTF8       TokenType = '💩'
	TokenQuotedNewline TokenType = '␤'

	// TokenNil is a placeholder for when a token is required but none is
	// available, e.g. when reporting errors. The scanner will never produce
	// this as part of a token stream.
	TokenNil TokenType = '\x00'
)

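// GoString implements fmt.GoStringer, producing output such as
// "hclsyntax.TokenOBrace". It relies on TokenType's String method, which is
// defined elsewhere in the package (typically generated by stringer).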
func (t TokenType) GoString() string {
	return fmt.Sprintf("hclsyntax.%s", t.String())
}

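// scanMode selects the scanner's mode of operation: normal HCL syntax,
// template (quoted string or heredoc) content, or identifier-only scanning.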
type scanMode int

const (
	scanNormal scanMode = iota
	scanTemplate
	scanIdentOnly
)

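// tokenAccum accumulates the tokens produced by the scanner, tracking the
// current source position so each emitted token can carry its range.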
type tokenAccum struct {
	Filename  string
	Bytes     []byte
	Pos       hcl.Pos
	Tokens    []Token
	StartByte int
}

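// emitToken appends a token of the given type to the accumulator, covering
// the bytes between startOfs and endOfs (offsets into f.Bytes). The end
// position is computed by walking the covered bytes one grapheme cluster at
// a time: a "\n" or "\r\n" cluster advances the line and resets the column,
// while any other cluster advances the column by one.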
func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) {
	// Walk through our buffer to figure out how much we need to adjust
	// the start pos to get our end pos.

	start := f.Pos
	start.Column += startOfs + f.StartByte - f.Pos.Byte // Safe because only ASCII spaces can be in the offset
	start.Byte = startOfs + f.StartByte

	end := start
	end.Byte = endOfs + f.StartByte
	b := f.Bytes[startOfs:endOfs]
	for len(b) > 0 {
		advance, seq, _ := textseg.ScanGraphemeClusters(b, true)
		if (len(seq) == 1 && seq[0] == '\n') || (len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n') {
			end.Line++
			end.Column = 1
		} else {
			end.Column++
		}
		b = b[advance:]
	}

	f.Pos = end

	f.Tokens = append(f.Tokens, Token{
		Type:  ty,
		Bytes: f.Bytes[startOfs:endOfs],
		Range: hcl.Range{
			Filename: f.Filename,
			Start:    start,
			End:      end,
		},
	})
}

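// heredocInProgress tracks a heredoc that the scanner has opened but not yet
// closed: the marker that will terminate it, and whether the scanner is
// currently at the start of a line, where that closing marker may appear.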
type heredocInProgress struct {
	Marker      []byte
	StartOfLine bool
}

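// tokenOpensFlushHeredoc returns true if the given token is a heredoc opener
// using the "<<-" introducer, which asks for leading whitespace to be
// trimmed ("flushed") from the heredoc's body lines.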
func tokenOpensFlushHeredoc(tok Token) bool {
	if tok.Type != TokenOHeredoc {
		return false
	}
	return bytes.HasPrefix(tok.Bytes, []byte{'<', '<', '-'})
}

// checkInvalidTokens does a simple pass across the given tokens and generates
// diagnostics for tokens that should _never_ appear in HCL source. This
// is intended to avoid the need for the parser to have special support
// for them all over.
//
// Returns diagnostics with no errors if everything seems acceptable.
// Otherwise, returns one or more error diagnostics, though it tries to limit
// repetition of the same information.
func checkInvalidTokens(tokens Tokens) hcl.Diagnostics {
	var diags hcl.Diagnostics

	toldBitwise := 0
	toldExponent := 0
	toldBacktick := 0
	toldApostrophe := 0
	toldSemicolon := 0
	toldTabs := 0
	toldBadUTF8 := 0

	for _, tok := range tokens {
		// copy token so it's safe to point to it
		tok := tok

		switch tok.Type {
		case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot:
			if toldBitwise < 4 {
				var suggestion string
				switch tok.Type {
				case TokenBitwiseAnd:
					suggestion = " Did you mean boolean AND (\"&&\")?"
				case TokenBitwiseOr:
					suggestion = " Did you mean boolean OR (\"||\")?"
				case TokenBitwiseNot:
					suggestion = " Did you mean boolean NOT (\"!\")?"
				}

				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Unsupported operator",
					Detail:   fmt.Sprintf("Bitwise operators are not supported.%s", suggestion),
					Subject:  &tok.Range,
				})
				toldBitwise++
			}
		case TokenStarStar:
			if toldExponent < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Unsupported operator",
					Detail:   "\"**\" is not a supported operator. Exponentiation is not supported as an operator.",
					Subject:  &tok.Range,
				})

				toldExponent++
			}
		case TokenBacktick:
			// Only report for alternating (even) backticks, so we won't report both
			// the start and the end of the same backtick-quoted string.
			if (toldBacktick % 2) == 0 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".",
					Subject:  &tok.Range,
				})
			}
			if toldBacktick <= 2 {
				toldBacktick++
			}
		case TokenApostrophe:
			if (toldApostrophe % 2) == 0 {
				newDiag := &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "Single quotes are not valid. Use double quotes (\") to enclose strings.",
					Subject:  &tok.Range,
				}
				diags = append(diags, newDiag)
			}
			if toldApostrophe <= 2 {
				toldApostrophe++
			}
		case TokenSemicolon:
			if toldSemicolon < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values.",
					Subject:  &tok.Range,
				})

				toldSemicolon++
			}
		case TokenTabs:
			if toldTabs < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character",
					Detail:   "Tab characters may not be used. The recommended indentation style is two spaces per indent.",
					Subject:  &tok.Range,
				})

				toldTabs++
			}
		case TokenBadUTF8:
			if toldBadUTF8 < 1 {
				diags = append(diags, &hcl.Diagnostic{
					Severity: hcl.DiagError,
					Summary:  "Invalid character encoding",
					Detail:   "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.",
					Subject:  &tok.Range,
				})

				toldBadUTF8++
			}
		case TokenQuotedNewline:
			diags = append(diags, &hcl.Diagnostic{
				Severity: hcl.DiagError,
				Summary:  "Invalid multi-line string",
				Detail:   "Quoted strings may not be split over multiple lines. To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax.",
				Subject:  &tok.Range,
			})
		case TokenInvalid:
			diags = append(diags, &hcl.Diagnostic{
				Severity: hcl.DiagError,
				Summary:  "Invalid character",
				Detail:   "This character is not used within the language.",
				Subject:  &tok.Range,
			})
		}
	}
	return diags
}
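
// The sketch below is illustrative only and not part of the package API: it
// shows how checkInvalidTokens reports a diagnostic for a hand-built token
// stream containing a semicolon, one of the token types that never appears
// in valid HCL. The function name, filename and positions are made up for
// the example.
func exampleCheckInvalidTokens() hcl.Diagnostics {
	toks := Tokens{
		{
			Type:  TokenSemicolon,
			Bytes: []byte(";"),
			Range: hcl.Range{
				Filename: "example.hcl",
				Start:    hcl.Pos{Line: 1, Column: 8, Byte: 7},
				End:      hcl.Pos{Line: 1, Column: 9, Byte: 8},
			},
		},
	}
	// Expect a single error diagnostic with summary "Invalid character",
	// whose Subject points at the semicolon's range.
	return checkInvalidTokens(toks)
}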

var utf8BOM = []byte{0xef, 0xbb, 0xbf}

// stripUTF8BOM checks whether the given buffer begins with a UTF-8 byte order
// mark (0xEF 0xBB 0xBF) and, if so, returns a truncated slice with the same
// backing array but with the BOM skipped.
//
// If there is no BOM present, the given slice is returned verbatim.
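//
// For example, given []byte("\xef\xbb\xbfa = 1\n") the result is a slice
// viewing just "a = 1\n" within the same backing array, while a buffer with
// no BOM is returned unchanged.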
func stripUTF8BOM(src []byte) []byte {
	if bytes.HasPrefix(src, utf8BOM) {
		return src[3:]
	}
	return src
}