1package hclsyntax 2 3import ( 4 "bytes" 5 "fmt" 6 7 "github.com/apparentlymart/go-textseg/textseg" 8 "github.com/hashicorp/hcl2/hcl" 9) 10 11// Token represents a sequence of bytes from some HCL code that has been 12// tagged with a type and its range within the source file. 13type Token struct { 14 Type TokenType 15 Bytes []byte 16 Range hcl.Range 17} 18 19// Tokens is a slice of Token. 20type Tokens []Token 21 22// TokenType is an enumeration used for the Type field on Token. 23type TokenType rune 24 25const ( 26 // Single-character tokens are represented by their own character, for 27 // convenience in producing these within the scanner. However, the values 28 // are otherwise arbitrary and just intended to be mnemonic for humans 29 // who might see them in debug output. 30 31 TokenOBrace TokenType = '{' 32 TokenCBrace TokenType = '}' 33 TokenOBrack TokenType = '[' 34 TokenCBrack TokenType = ']' 35 TokenOParen TokenType = '(' 36 TokenCParen TokenType = ')' 37 TokenOQuote TokenType = '«' 38 TokenCQuote TokenType = '»' 39 TokenOHeredoc TokenType = 'H' 40 TokenCHeredoc TokenType = 'h' 41 42 TokenStar TokenType = '*' 43 TokenSlash TokenType = '/' 44 TokenPlus TokenType = '+' 45 TokenMinus TokenType = '-' 46 TokenPercent TokenType = '%' 47 48 TokenEqual TokenType = '=' 49 TokenEqualOp TokenType = '≔' 50 TokenNotEqual TokenType = '≠' 51 TokenLessThan TokenType = '<' 52 TokenLessThanEq TokenType = '≤' 53 TokenGreaterThan TokenType = '>' 54 TokenGreaterThanEq TokenType = '≥' 55 56 TokenAnd TokenType = '∧' 57 TokenOr TokenType = '∨' 58 TokenBang TokenType = '!' 59 60 TokenDot TokenType = '.' 61 TokenComma TokenType = ',' 62 63 TokenEllipsis TokenType = '…' 64 TokenFatArrow TokenType = '⇒' 65 66 TokenQuestion TokenType = '?' 67 TokenColon TokenType = ':' 68 69 TokenTemplateInterp TokenType = '∫' 70 TokenTemplateControl TokenType = 'λ' 71 TokenTemplateSeqEnd TokenType = '∎' 72 73 TokenQuotedLit TokenType = 'Q' // might contain backslash escapes 74 TokenStringLit TokenType = 'S' // cannot contain backslash escapes 75 TokenNumberLit TokenType = 'N' 76 TokenIdent TokenType = 'I' 77 78 TokenComment TokenType = 'C' 79 80 TokenNewline TokenType = '\n' 81 TokenEOF TokenType = '␄' 82 83 // The rest are not used in the language but recognized by the scanner so 84 // we can generate good diagnostics in the parser when users try to write 85 // things that might work in other languages they are familiar with, or 86 // simply make incorrect assumptions about the HCL language. 87 88 TokenBitwiseAnd TokenType = '&' 89 TokenBitwiseOr TokenType = '|' 90 TokenBitwiseNot TokenType = '~' 91 TokenBitwiseXor TokenType = '^' 92 TokenStarStar TokenType = '➚' 93 TokenApostrophe TokenType = '\'' 94 TokenBacktick TokenType = '`' 95 TokenSemicolon TokenType = ';' 96 TokenTabs TokenType = '␉' 97 TokenInvalid TokenType = '�' 98 TokenBadUTF8 TokenType = '' 99 TokenQuotedNewline TokenType = '' 100 101 // TokenNil is a placeholder for when a token is required but none is 102 // available, e.g. when reporting errors. The scanner will never produce 103 // this as part of a token stream. 104 TokenNil TokenType = '\x00' 105) 106 107func (t TokenType) GoString() string { 108 return fmt.Sprintf("hclsyntax.%s", t.String()) 109} 110 111type scanMode int 112 113const ( 114 scanNormal scanMode = iota 115 scanTemplate 116 scanIdentOnly 117) 118 119type tokenAccum struct { 120 Filename string 121 Bytes []byte 122 Pos hcl.Pos 123 Tokens []Token 124 StartByte int 125} 126 127func (f *tokenAccum) emitToken(ty TokenType, startOfs, endOfs int) { 128 // Walk through our buffer to figure out how much we need to adjust 129 // the start pos to get our end pos. 130 131 start := f.Pos 132 start.Column += startOfs + f.StartByte - f.Pos.Byte // Safe because only ASCII spaces can be in the offset 133 start.Byte = startOfs + f.StartByte 134 135 end := start 136 end.Byte = endOfs + f.StartByte 137 b := f.Bytes[startOfs:endOfs] 138 for len(b) > 0 { 139 advance, seq, _ := textseg.ScanGraphemeClusters(b, true) 140 if (len(seq) == 1 && seq[0] == '\n') || (len(seq) == 2 && seq[0] == '\r' && seq[1] == '\n') { 141 end.Line++ 142 end.Column = 1 143 } else { 144 end.Column++ 145 } 146 b = b[advance:] 147 } 148 149 f.Pos = end 150 151 f.Tokens = append(f.Tokens, Token{ 152 Type: ty, 153 Bytes: f.Bytes[startOfs:endOfs], 154 Range: hcl.Range{ 155 Filename: f.Filename, 156 Start: start, 157 End: end, 158 }, 159 }) 160} 161 162type heredocInProgress struct { 163 Marker []byte 164 StartOfLine bool 165} 166 167func tokenOpensFlushHeredoc(tok Token) bool { 168 if tok.Type != TokenOHeredoc { 169 return false 170 } 171 return bytes.HasPrefix(tok.Bytes, []byte{'<', '<', '-'}) 172} 173 174// checkInvalidTokens does a simple pass across the given tokens and generates 175// diagnostics for tokens that should _never_ appear in HCL source. This 176// is intended to avoid the need for the parser to have special support 177// for them all over. 178// 179// Returns a diagnostics with no errors if everything seems acceptable. 180// Otherwise, returns zero or more error diagnostics, though tries to limit 181// repetition of the same information. 182func checkInvalidTokens(tokens Tokens) hcl.Diagnostics { 183 var diags hcl.Diagnostics 184 185 toldBitwise := 0 186 toldExponent := 0 187 toldBacktick := 0 188 toldApostrophe := 0 189 toldSemicolon := 0 190 toldTabs := 0 191 toldBadUTF8 := 0 192 193 for _, tok := range tokens { 194 // copy token so it's safe to point to it 195 tok := tok 196 197 switch tok.Type { 198 case TokenBitwiseAnd, TokenBitwiseOr, TokenBitwiseXor, TokenBitwiseNot: 199 if toldBitwise < 4 { 200 var suggestion string 201 switch tok.Type { 202 case TokenBitwiseAnd: 203 suggestion = " Did you mean boolean AND (\"&&\")?" 204 case TokenBitwiseOr: 205 suggestion = " Did you mean boolean OR (\"&&\")?" 206 case TokenBitwiseNot: 207 suggestion = " Did you mean boolean NOT (\"!\")?" 208 } 209 210 diags = append(diags, &hcl.Diagnostic{ 211 Severity: hcl.DiagError, 212 Summary: "Unsupported operator", 213 Detail: fmt.Sprintf("Bitwise operators are not supported.%s", suggestion), 214 Subject: &tok.Range, 215 }) 216 toldBitwise++ 217 } 218 case TokenStarStar: 219 if toldExponent < 1 { 220 diags = append(diags, &hcl.Diagnostic{ 221 Severity: hcl.DiagError, 222 Summary: "Unsupported operator", 223 Detail: "\"**\" is not a supported operator. Exponentiation is not supported as an operator.", 224 Subject: &tok.Range, 225 }) 226 227 toldExponent++ 228 } 229 case TokenBacktick: 230 // Only report for alternating (even) backticks, so we won't report both start and ends of the same 231 // backtick-quoted string. 232 if (toldBacktick % 2) == 0 { 233 diags = append(diags, &hcl.Diagnostic{ 234 Severity: hcl.DiagError, 235 Summary: "Invalid character", 236 Detail: "The \"`\" character is not valid. To create a multi-line string, use the \"heredoc\" syntax, like \"<<EOT\".", 237 Subject: &tok.Range, 238 }) 239 } 240 if toldBacktick <= 2 { 241 toldBacktick++ 242 } 243 case TokenApostrophe: 244 if (toldApostrophe % 2) == 0 { 245 newDiag := &hcl.Diagnostic{ 246 Severity: hcl.DiagError, 247 Summary: "Invalid character", 248 Detail: "Single quotes are not valid. Use double quotes (\") to enclose strings.", 249 Subject: &tok.Range, 250 } 251 diags = append(diags, newDiag) 252 } 253 if toldApostrophe <= 2 { 254 toldApostrophe++ 255 } 256 case TokenSemicolon: 257 if toldSemicolon < 1 { 258 diags = append(diags, &hcl.Diagnostic{ 259 Severity: hcl.DiagError, 260 Summary: "Invalid character", 261 Detail: "The \";\" character is not valid. Use newlines to separate arguments and blocks, and commas to separate items in collection values.", 262 Subject: &tok.Range, 263 }) 264 265 toldSemicolon++ 266 } 267 case TokenTabs: 268 if toldTabs < 1 { 269 diags = append(diags, &hcl.Diagnostic{ 270 Severity: hcl.DiagError, 271 Summary: "Invalid character", 272 Detail: "Tab characters may not be used. The recommended indentation style is two spaces per indent.", 273 Subject: &tok.Range, 274 }) 275 276 toldTabs++ 277 } 278 case TokenBadUTF8: 279 if toldBadUTF8 < 1 { 280 diags = append(diags, &hcl.Diagnostic{ 281 Severity: hcl.DiagError, 282 Summary: "Invalid character encoding", 283 Detail: "All input files must be UTF-8 encoded. Ensure that UTF-8 encoding is selected in your editor.", 284 Subject: &tok.Range, 285 }) 286 287 toldBadUTF8++ 288 } 289 case TokenQuotedNewline: 290 diags = append(diags, &hcl.Diagnostic{ 291 Severity: hcl.DiagError, 292 Summary: "Invalid multi-line string", 293 Detail: "Quoted strings may not be split over multiple lines. To produce a multi-line string, either use the \\n escape to represent a newline character or use the \"heredoc\" multi-line template syntax.", 294 Subject: &tok.Range, 295 }) 296 case TokenInvalid: 297 diags = append(diags, &hcl.Diagnostic{ 298 Severity: hcl.DiagError, 299 Summary: "Invalid character", 300 Detail: "This character is not used within the language.", 301 Subject: &tok.Range, 302 }) 303 } 304 } 305 return diags 306} 307 308var utf8BOM = []byte{0xef, 0xbb, 0xbf} 309 310// stripUTF8BOM checks whether the given buffer begins with a UTF-8 byte order 311// mark (0xEF 0xBB 0xBF) and, if so, returns a truncated slice with the same 312// backing array but with the BOM skipped. 313// 314// If there is no BOM present, the given slice is returned verbatim. 315func stripUTF8BOM(src []byte) []byte { 316 if bytes.HasPrefix(src, utf8BOM) { 317 return src[3:] 318 } 319 return src 320} 321