package jmespath

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"unicode/utf8"
)

type token struct {
	tokenType tokType
	value     string
	position  int
	length    int
}

type tokType int

const eof = -1

// Lexer contains information about the expression being tokenized.
type Lexer struct {
	expression string       // The expression provided by the user.
	currentPos int          // The current position in the string.
	lastWidth  int          // The width of the last rune read, used by back().
	buf        bytes.Buffer // Internal buffer used for building up values.
}

// SyntaxError is the main error used whenever a lexing or parsing error occurs.
type SyntaxError struct {
	msg        string // Error message displayed to user
	Expression string // Expression that generated a SyntaxError
	Offset     int    // The location in the string where the error occurred
}

func (e SyntaxError) Error() string {
	// In the future, it would be good to underline the specific
	// location where the error occurred.
	return "SyntaxError: " + e.msg
}

// HighlightLocation will show where the syntax error occurred.
// It will place a "^" character on a line below the expression
// at the point where the syntax error occurred.
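//
// For example (illustrative values), given Expression "foo.1bar" and
// Offset 4, the result is:
//
//	foo.1bar
//	    ^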
func (e SyntaxError) HighlightLocation() string {
	return e.Expression + "\n" + strings.Repeat(" ", e.Offset) + "^"
}

//go:generate stringer -type=tokType
const (
	tUnknown tokType = iota
	tStar
	tDot
	tFilter
	tFlatten
	tLparen
	tRparen
	tLbracket
	tRbracket
	tLbrace
	tRbrace
	tOr
	tPipe
	tNumber
	tUnquotedIdentifier
	tQuotedIdentifier
	tComma
	tColon
	tLT
	tLTE
	tGT
	tGTE
	tEQ
	tNE
	tJSONLiteral
	tStringLiteral
	tCurrent
	tExpref
	tAnd
	tNot
	tEOF
)


var basicTokens = map[rune]tokType{
	'.': tDot,
	'*': tStar,
	',': tComma,
	':': tColon,
	'{': tLbrace,
	'}': tRbrace,
	']': tRbracket, // '[' is not included because it could start "[]" or "[?".
	'(': tLparen,
	')': tRparen,
	'@': tCurrent,
}

// Bit mask for [a-zA-Z_] shifted down 64 bits to fit in a single uint64.
// When using this bitmask just be sure to subtract 64 from the rune
// before checking against identifierStartBits.
const identifierStartBits uint64 = 576460745995190270

// Bit mask for [0-9a-zA-Z_], 128 bits -> 2 uint64s.
var identifierTrailingBits = [2]uint64{287948901175001088, 576460745995190270}
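
// As an illustrative sketch, membership for a rune r in [64, 128) is a
// single mask test (this mirrors the check in tokenize below; shift counts
// of 64 or more simply yield 0 in Go, so out-of-range runes test false):
//
//	isStart := identifierStartBits&(1<<(uint64(r)-64)) > 0
//
// For example, 'A' (65) maps to bit 1, which is set in the mask.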

var whiteSpace = map[rune]bool{
	' ': true, '\t': true, '\n': true, '\r': true,
}

func (t token) String() string {
	return fmt.Sprintf("Token{%+v, %s, %d, %d}",
		t.tokenType, t.value, t.position, t.length)
}

// NewLexer creates a new JMESPath lexer.
func NewLexer() *Lexer {
	lexer := Lexer{}
	return &lexer
}

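// next returns the next rune in the expression and advances currentPos,
// recording the rune's byte width in lastWidth so that back can undo
// exactly one read. At the end of the input it returns eof and sets
// lastWidth to 0.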
func (lexer *Lexer) next() rune {
	if lexer.currentPos >= len(lexer.expression) {
		lexer.lastWidth = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(lexer.expression[lexer.currentPos:])
	lexer.lastWidth = w
	lexer.currentPos += w
	return r
}

func (lexer *Lexer) back() {
	lexer.currentPos -= lexer.lastWidth
}

func (lexer *Lexer) peek() rune {
	t := lexer.next()
	lexer.back()
	return t
}

// tokenize takes an expression and returns corresponding tokens.
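//
// A minimal usage sketch (package-internal, since tokenize is unexported):
//
//	lexer := NewLexer()
//	tokens, err := lexer.tokenize("foo[0].bar")
//	// tokens: "foo", "[", "0", "]", ".", "bar", and a trailing tEOF token.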
func (lexer *Lexer) tokenize(expression string) ([]token, error) {
	var tokens []token
	lexer.expression = expression
	lexer.currentPos = 0
	lexer.lastWidth = 0
loop:
	for {
		r := lexer.next()
		if identifierStartBits&(1<<(uint64(r)-64)) > 0 {
			t := lexer.consumeUnquotedIdentifier()
			tokens = append(tokens, t)
		} else if val, ok := basicTokens[r]; ok {
			// Basic single char token.
			t := token{
				tokenType: val,
				value:     string(r),
				position:  lexer.currentPos - lexer.lastWidth,
				length:    1,
			}
			tokens = append(tokens, t)
		} else if r == '-' || (r >= '0' && r <= '9') {
			t := lexer.consumeNumber()
			tokens = append(tokens, t)
		} else if r == '[' {
			t := lexer.consumeLBracket()
			tokens = append(tokens, t)
		} else if r == '"' {
			t, err := lexer.consumeQuotedIdentifier()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '\'' {
			t, err := lexer.consumeRawStringLiteral()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '`' {
			t, err := lexer.consumeLiteral()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '|' {
			t := lexer.matchOrElse(r, '|', tOr, tPipe)
			tokens = append(tokens, t)
		} else if r == '<' {
			t := lexer.matchOrElse(r, '=', tLTE, tLT)
			tokens = append(tokens, t)
		} else if r == '>' {
			t := lexer.matchOrElse(r, '=', tGTE, tGT)
			tokens = append(tokens, t)
		} else if r == '!' {
			t := lexer.matchOrElse(r, '=', tNE, tNot)
			tokens = append(tokens, t)
		} else if r == '=' {
			t := lexer.matchOrElse(r, '=', tEQ, tUnknown)
			tokens = append(tokens, t)
		} else if r == '&' {
			t := lexer.matchOrElse(r, '&', tAnd, tExpref)
			tokens = append(tokens, t)
		} else if r == eof {
			break loop
		} else if _, ok := whiteSpace[r]; ok {
			// Ignore whitespace
		} else {
			return tokens, lexer.syntaxError(fmt.Sprintf("Unknown char: %s", strconv.QuoteRuneToASCII(r)))
		}
	}
	tokens = append(tokens, token{tEOF, "", len(lexer.expression), 0})
	return tokens, nil
}

// Consume characters until the ending rune "end" is reached.
// If the end of the expression is reached before seeing the
// terminating rune "end", then an error is returned.
// If no error occurs then the matching substring is returned.
// The returned string will not include the ending rune.
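//
// For example, when tokenizing `abc`, the opening '`' has already been
// consumed, and consumeUntil('`') returns "abc" with the lexer positioned
// just past the closing '`'.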
func (lexer *Lexer) consumeUntil(end rune) (string, error) {
	start := lexer.currentPos
	current := lexer.next()
	for current != end && current != eof {
		if current == '\\' && lexer.peek() != eof {
			lexer.next()
		}
		current = lexer.next()
	}
	if lexer.lastWidth == 0 {
		// Then we hit an EOF so we never reached the closing
		// delimiter.
		return "", SyntaxError{
			msg:        "Unclosed delimiter: " + string(end),
			Expression: lexer.expression,
			Offset:     len(lexer.expression),
		}
	}
	return lexer.expression[start : lexer.currentPos-lexer.lastWidth], nil
}

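// consumeLiteral scans a JSON literal (`...`), unescaping any \` sequences.
// For example (illustrative), the expression text `{"a": 1}` produces a
// tJSONLiteral token whose value is the string {"a": 1}.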
func (lexer *Lexer) consumeLiteral() (token, error) {
	start := lexer.currentPos
	value, err := lexer.consumeUntil('`')
	if err != nil {
		return token{}, err
	}
	value = strings.Replace(value, "\\`", "`", -1)
	return token{
		tokenType: tJSONLiteral,
		value:     value,
		position:  start,
		length:    len(value),
	}, nil
}

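// consumeRawStringLiteral scans a raw string literal ('...'), unescaping
// any \' sequences via the internal buffer. For example (illustrative),
// the expression text 'it\'s' produces a tStringLiteral token with the
// value "it's".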
func (lexer *Lexer) consumeRawStringLiteral() (token, error) {
	start := lexer.currentPos
	currentIndex := start
	current := lexer.next()
	for current != '\'' && lexer.peek() != eof {
		if current == '\\' && lexer.peek() == '\'' {
			chunk := lexer.expression[currentIndex : lexer.currentPos-1]
			lexer.buf.WriteString(chunk)
			lexer.buf.WriteString("'")
			lexer.next()
			currentIndex = lexer.currentPos
		}
		current = lexer.next()
	}
	if lexer.lastWidth == 0 {
		// Then we hit an EOF so we never reached the closing
		// delimiter.
		return token{}, SyntaxError{
			msg:        "Unclosed delimiter: '",
			Expression: lexer.expression,
			Offset:     len(lexer.expression),
		}
	}
	if currentIndex < lexer.currentPos {
		lexer.buf.WriteString(lexer.expression[currentIndex : lexer.currentPos-1])
	}
	value := lexer.buf.String()
	// Reset the buffer so it can be reused again.
	lexer.buf.Reset()
	return token{
		tokenType: tStringLiteral,
		value:     value,
		position:  start,
		length:    len(value),
	}, nil
}

func (lexer *Lexer) syntaxError(msg string) SyntaxError {
	return SyntaxError{
		msg:        msg,
		Expression: lexer.expression,
		Offset:     lexer.currentPos - 1,
	}
}

// Checks for a two char token, otherwise matches a single character
// token. This is used whenever a two char token overlaps a single
// char token, e.g. "||" -> tOr, "|" -> tPipe.
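//
// For example, when lexing "||" the first '|' has already been consumed,
// and matchOrElse('|', '|', tOr, tPipe) emits a two-character tOr token;
// for a lone '|' it backs up and emits a single-character tPipe token.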
func (lexer *Lexer) matchOrElse(first rune, second rune, matchedType tokType, singleCharType tokType) token {
	start := lexer.currentPos - lexer.lastWidth
	nextRune := lexer.next()
	var t token
	if nextRune == second {
		t = token{
			tokenType: matchedType,
			value:     string(first) + string(second),
			position:  start,
			length:    2,
		}
	} else {
		lexer.back()
		t = token{
			tokenType: singleCharType,
			value:     string(first),
			position:  start,
			length:    1,
		}
	}
	return t
}

func (lexer *Lexer) consumeLBracket() token {
	// There are three options here:
	// 1. A filter expression "[?"
	// 2. A flatten operator "[]"
	// 3. A bare lbracket "["
	start := lexer.currentPos - lexer.lastWidth
	nextRune := lexer.next()
	var t token
	if nextRune == '?' {
		t = token{
			tokenType: tFilter,
			value:     "[?",
			position:  start,
			length:    2,
		}
	} else if nextRune == ']' {
		t = token{
			tokenType: tFlatten,
			value:     "[]",
			position:  start,
			length:    2,
		}
	} else {
		t = token{
			tokenType: tLbracket,
			value:     "[",
			position:  start,
			length:    1,
		}
		lexer.back()
	}
	return t
}

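// consumeQuotedIdentifier scans a quoted identifier ("..."), decoding any
// JSON escape sequences by round-tripping the contents through
// json.Unmarshal. For example, the expression text "foo\nbar" decodes to
// an identifier value containing a real newline.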
func (lexer *Lexer) consumeQuotedIdentifier() (token, error) {
	start := lexer.currentPos
	value, err := lexer.consumeUntil('"')
	if err != nil {
		return token{}, err
	}
	var decoded string
	asJSON := []byte("\"" + value + "\"")
	if err := json.Unmarshal(asJSON, &decoded); err != nil {
		return token{}, err
	}
	return token{
		tokenType: tQuotedIdentifier,
		value:     decoded,
		position:  start - 1,
		length:    len(decoded),
	}, nil
}

func (lexer *Lexer) consumeUnquotedIdentifier() token {
	// Consume runes until we reach the end of an unquoted
	// identifier.
	start := lexer.currentPos - lexer.lastWidth
	for {
		r := lexer.next()
		// Note: r >= 128 (not r > 128) keeps the index into
		// identifierTrailingBits in bounds for non-ASCII runes.
		if r < 0 || r >= 128 || identifierTrailingBits[uint64(r)/64]&(1<<(uint64(r)%64)) == 0 {
			lexer.back()
			break
		}
	}
	value := lexer.expression[start:lexer.currentPos]
	return token{
		tokenType: tUnquotedIdentifier,
		value:     value,
		position:  start,
		length:    lexer.currentPos - start,
	}
}

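// consumeNumber scans the remaining digits of a number token. The leading
// '-' or first digit has already been consumed by tokenize, which is why
// the token's start is computed as currentPos - lastWidth.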
func (lexer *Lexer) consumeNumber() token {
	// Consume runes until we reach something that's not a number.
	start := lexer.currentPos - lexer.lastWidth
	for {
		r := lexer.next()
		if r < '0' || r > '9' {
			lexer.back()
			break
		}
	}
	value := lexer.expression[start:lexer.currentPos]
	return token{
		tokenType: tNumber,
		value:     value,
		position:  start,
		length:    lexer.currentPos - start,
	}
}