1package lexer
2
3import (
4	"fmt"
5	"strings"
6	"unicode"
7	"unicode/utf8"
8)
9
const (
	//XItemError is an error with the parser input. It is the only constant in
	//this group that is declared with the XItemType type; the remaining names
	//are untyped string constants that convert to XItemType where one is
	//expected (for example when passed to emit).
	XItemError XItemType = "Error"
	//XItemAbsLocPath is an absolute path
	XItemAbsLocPath = "Absolute path"
	//XItemAbbrAbsLocPath represents an abbreviated absolute path
	XItemAbbrAbsLocPath = "Abbreviated absolute path"
	//XItemAbbrRelLocPath represents an abbreviated relative path
	XItemAbbrRelLocPath = "Abbreviated relative path"
	//XItemRelLocPath represents a relative location path
	XItemRelLocPath = "Relative path"
	//XItemEndPath marks the end of a path
	XItemEndPath = "End path instruction"
	//XItemAxis marks an axis specifier of a path
	XItemAxis = "Axis"
	//XItemAbbrAxis marks an abbreviated axis specifier (just @ at this point)
	XItemAbbrAxis = "Abbreviated attribute axis"
	//XItemNCName marks a namespace name in a node test
	XItemNCName = "Namespace"
	//XItemQName marks the local name in a node test
	XItemQName = "Local name"
	//XItemNodeType marks a node type in a node test
	XItemNodeType = "Node type"
	//XItemProcLit marks a processing-instruction literal
	XItemProcLit = "processing-instruction"
	//XItemFunction marks a function call
	XItemFunction = "function"
	//XItemArgument marks a function argument
	XItemArgument = "function argument"
	//XItemEndFunction marks the end of a function
	XItemEndFunction = "end of function"
	//XItemPredicate marks the start of a predicate in an axis
	XItemPredicate = "predicate"
	//XItemEndPredicate marks the end of a predicate in an axis
	XItemEndPredicate = "end of predicate"
	//XItemStrLit marks a string literal
	XItemStrLit = "string literal"
	//XItemNumLit marks a numeric literal
	XItemNumLit = "numeric literal"
	//XItemOperator marks an operator
	XItemOperator = "operator"
	//XItemVariable marks a variable reference
	XItemVariable = "variable"
)
54
const (
	//eof is the sentinel value next returns once the whole input has been
	//consumed. With iota == 0 the expression evaluates to -1.
	eof = -(iota + 1)
)
58
//XItemType identifies the kind of token carried by an XItem
type XItemType string
61
//XItem is the token emitted by the lexer
type XItem struct {
	//Typ is the kind of token
	Typ XItemType
	//Val is the token text, or the error message when Typ is XItemError
	Val string
}
67
//stateFn is one lexer state: it consumes input from the Lexer and returns the
//state to run next, or nil to stop the state loop.
type stateFn func(*Lexer) stateFn
69
//Lexer lexes out XPath expressions
type Lexer struct {
	//input is the expression being scanned
	input string
	//start is the byte offset where the pending token begins
	start int
	//pos is the byte offset of the next rune to read
	pos int
	//width is the byte width of the rune most recently returned by next
	//(0 at end of input); backup subtracts it from pos
	width int
	//items is the channel on which tokens are delivered
	items chan XItem
}
78
79//Lex an XPath expresion on the io.Reader
80func Lex(xpath string) chan XItem {
81	l := &Lexer{
82		input: xpath,
83		items: make(chan XItem),
84	}
85	go l.run()
86	return l.items
87}
88
89func (l *Lexer) run() {
90	for state := startState; state != nil; {
91		state = state(l)
92	}
93
94	if l.peek() != eof {
95		l.errorf("Malformed XPath expression")
96	}
97
98	close(l.items)
99}
100
101func (l *Lexer) emit(t XItemType) {
102	l.items <- XItem{t, l.input[l.start:l.pos]}
103	l.start = l.pos
104}
105
106func (l *Lexer) emitVal(t XItemType, val string) {
107	l.items <- XItem{t, val}
108	l.start = l.pos
109}
110
111func (l *Lexer) next() (r rune) {
112	if l.pos >= len(l.input) {
113		l.width = 0
114		return eof
115	}
116
117	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
118
119	l.pos += l.width
120
121	return r
122}
123
124func (l *Lexer) ignore() {
125	l.start = l.pos
126}
127
128func (l *Lexer) backup() {
129	l.pos -= l.width
130}
131
132func (l *Lexer) peek() rune {
133	r := l.next()
134
135	l.backup()
136	return r
137}
138
139func (l *Lexer) peekAt(n int) rune {
140	if n <= 1 {
141		return l.peek()
142	}
143
144	width := 0
145	var ret rune
146
147	for count := 0; count < n; count++ {
148		r, s := utf8.DecodeRuneInString(l.input[l.pos+width:])
149		width += s
150
151		if l.pos+width > len(l.input) {
152			return eof
153		}
154
155		ret = r
156	}
157
158	return ret
159}
160
161func (l *Lexer) accept(valid string) bool {
162	if strings.ContainsRune(valid, l.next()) {
163		return true
164	}
165
166	l.backup()
167	return false
168}
169
170func (l *Lexer) acceptRun(valid string) {
171	for strings.ContainsRune(valid, l.next()) {
172	}
173	l.backup()
174}
175
176func (l *Lexer) skip(num int) {
177	for i := 0; i < num; i++ {
178		l.next()
179	}
180	l.ignore()
181}
182
183func (l *Lexer) skipWS(ig bool) {
184	for {
185		n := l.next()
186
187		if n == eof || !unicode.IsSpace(n) {
188			break
189		}
190	}
191
192	l.backup()
193
194	if ig {
195		l.ignore()
196	}
197}
198
199func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
200	l.items <- XItem{
201		XItemError,
202		fmt.Sprintf(format, args...),
203	}
204
205	return nil
206}
207
208func isElemChar(r rune) bool {
209	return string(r) != ":" && string(r) != "/" &&
210		(unicode.Is(first, r) || unicode.Is(second, r) || string(r) == "*") &&
211		r != eof
212}
213
214func startState(l *Lexer) stateFn {
215	l.skipWS(true)
216
217	if string(l.peek()) == "/" {
218		l.next()
219		l.ignore()
220
221		if string(l.next()) == "/" {
222			l.ignore()
223			return abbrAbsLocPathState
224		}
225
226		l.backup()
227		return absLocPathState
228	} else if string(l.peek()) == `'` || string(l.peek()) == `"` {
229		if err := getStrLit(l, XItemStrLit); err != nil {
230			return l.errorf(err.Error())
231		}
232
233		if l.peek() != eof {
234			return startState
235		}
236	} else if getNumLit(l) {
237		l.skipWS(true)
238		if l.peek() != eof {
239			return startState
240		}
241	} else if string(l.peek()) == "$" {
242		l.next()
243		l.ignore()
244		r := l.peek()
245		for unicode.Is(first, r) || unicode.Is(second, r) {
246			l.next()
247			r = l.peek()
248		}
249		tok := l.input[l.start:l.pos]
250		if len(tok) == 0 {
251			return l.errorf("Empty variable name")
252		}
253		l.emit(XItemVariable)
254		l.skipWS(true)
255		if l.peek() != eof {
256			return startState
257		}
258	} else if st := findOperatorState(l); st != nil {
259		return st
260	} else {
261		if isElemChar(l.peek()) {
262			colons := 0
263
264			for {
265				if isElemChar(l.peek()) {
266					l.next()
267				} else if string(l.peek()) == ":" {
268					l.next()
269					colons++
270				} else {
271					break
272				}
273			}
274
275			if string(l.peek()) == "(" && colons <= 1 {
276				tok := l.input[l.start:l.pos]
277				err := procFunc(l, tok)
278				if err != nil {
279					return l.errorf(err.Error())
280				}
281
282				l.skipWS(true)
283
284				if string(l.peek()) == "/" {
285					l.next()
286					l.ignore()
287
288					if string(l.next()) == "/" {
289						l.ignore()
290						return abbrRelLocPathState
291					}
292
293					l.backup()
294					return relLocPathState
295				}
296
297				return startState
298			}
299
300			l.pos = l.start
301			return relLocPathState
302		} else if string(l.peek()) == "@" {
303			return relLocPathState
304		}
305	}
306
307	return nil
308}
309
310func strPeek(str string, l *Lexer) bool {
311	for i := 0; i < len(str); i++ {
312		if string(l.peekAt(i+1)) != string(str[i]) {
313			return false
314		}
315	}
316	return true
317}
318
319func findOperatorState(l *Lexer) stateFn {
320	l.skipWS(true)
321
322	switch string(l.peek()) {
323	case ">", "<", "!":
324		l.next()
325		if string(l.peek()) == "=" {
326			l.next()
327		}
328		l.emit(XItemOperator)
329		return startState
330	case "|", "+", "-", "*", "=":
331		l.next()
332		l.emit(XItemOperator)
333		return startState
334	case "(":
335		l.next()
336		l.emit(XItemOperator)
337		for state := startState; state != nil; {
338			state = state(l)
339		}
340		l.skipWS(true)
341		if string(l.next()) != ")" {
342			return l.errorf("Missing end )")
343		}
344		l.emit(XItemOperator)
345		return startState
346	}
347
348	if strPeek("and", l) {
349		l.next()
350		l.next()
351		l.next()
352		l.emit(XItemOperator)
353		return startState
354	}
355
356	if strPeek("or", l) {
357		l.next()
358		l.next()
359		l.emit(XItemOperator)
360		return startState
361	}
362
363	if strPeek("mod", l) {
364		l.next()
365		l.next()
366		l.next()
367		l.emit(XItemOperator)
368		return startState
369	}
370
371	if strPeek("div", l) {
372		l.next()
373		l.next()
374		l.next()
375		l.emit(XItemOperator)
376		return startState
377	}
378
379	return nil
380}
381
382func getStrLit(l *Lexer, tok XItemType) error {
383	q := l.next()
384	var r rune
385
386	l.ignore()
387
388	for r != q {
389		r = l.next()
390		if r == eof {
391			return fmt.Errorf("Unexpected end of string literal.")
392		}
393	}
394
395	l.backup()
396	l.emit(tok)
397	l.next()
398	l.ignore()
399
400	return nil
401}
402
403func getNumLit(l *Lexer) bool {
404	const dig = "0123456789"
405	l.accept("-")
406	start := l.pos
407	l.acceptRun(dig)
408
409	if l.pos == start {
410		return false
411	}
412
413	if l.accept(".") {
414		l.acceptRun(dig)
415	}
416
417	l.emit(XItemNumLit)
418	return true
419}
420