1// The bulk of this lexer code is taken from http://golang.org/src/pkg/text/template/parse/lex.go
2// Described in a talk by Rob Pike: http://cuddle.googlecode.com/hg/talk/lex.html#title-slide and http://www.youtube.com/watch?v=HxaD_trXwRE
3//
4// Copyright 2011 The Go Authors. All rights reserved.
5// Use of this source code is governed by a BSD-style
6// license that can be found in the LICENSE file. (Available here: http://golang.org/LICENSE)
7//
8// For the remainder of the file:
9// Copyright 2014 Richard Lehane. All rights reserved.
10//
11// Licensed under the Apache License, Version 2.0 (the "License");
12// you may not use this file except in compliance with the License.
13// You may obtain a copy of the License at
14//
15//     http://www.apache.org/licenses/LICENSE-2.0
16//
17// Unless required by applicable law or agreed to in writing, software
18// distributed under the License is distributed on an "AS IS" BASIS,
19// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20// See the License for the specific language governing permissions and
21// limitations under the License.
22
23package pronom
24
25import (
26	"fmt"
27	"strings"
28	"unicode/utf8"
29)
30
// item is a single token produced by the lexer and delivered on lexer.items.
type item struct {
	typ itemType // kind of token
	pos int      // byte offset in the input at which the token starts
	val string   // token text; for itemError, the formatted error message
}
36
37func (i item) String() string {
38	switch {
39	case i.typ == itemEOF:
40		return "EOF"
41	case i.typ == itemError:
42		return i.val
43	}
44	return fmt.Sprintf("%q", i.val)
45}
46
// itemType identifies the kind of token carried by an item.
type itemType int

// Token kinds. itemHexText, itemQuoteText, itemQuote and itemSpace are emitted
// by the text lexer (lexText); itemError and itemEOF by both lexers; the rest
// by the PRONOM signature lexer (lexPRONOM).
const (
	// itemError signals a lexing failure; the item's val holds the message.
	itemError itemType = iota
	// itemEOF marks the end of the input.
	itemEOF
	// itemCurlyLeft is the '{' that opens a wildcard range.
	itemCurlyLeft
	// itemCurlyRight is the '}' that closes a wildcard range.
	itemCurlyRight
	// itemWildStart is the run of digits that follows '{'.
	itemWildStart
	// itemSlash is the '-' separating the two bounds of a wildcard range.
	itemSlash
	// itemWildEnd is the run of digits and/or '*' that follows the '-'.
	itemWildEnd
	// itemWildSingle is the two-character wildcard "??".
	itemWildSingle
	// itemWild is the wildcard '*'.
	itemWild
	// itemUnprocessedText is a run of hex characters and quoted strings, to be
	// split further by the text lexer.
	itemUnprocessedText
	// itemEnterGroup is an opening '[' or '('.
	itemEnterGroup
	// itemExitGroup is a closing ']' or ')'.
	itemExitGroup
	// itemChoiceMarker is '|', space or tab inside a group.
	itemChoiceMarker
	// itemNotMarker is '!' inside a group.
	itemNotMarker
	// itemRangeMarker is ':' or '-' inside a group.
	itemRangeMarker
	// itemMaskMarker is '&' inside a group.
	itemMaskMarker
	// itemAnyMaskMarker is '~' inside a group.
	itemAnyMaskMarker
	// itemHexText is a run of hexadecimal digits.
	itemHexText
	// itemQuoteText is the text found between two single quotes.
	itemQuoteText
	// itemQuote is a single quote character.
	itemQuote
	// itemSpace is a single space, tab, newline or carriage return.
	itemSpace
)
72
// Characters that the lexers treat specially.
const (
	leftBracket  = '['
	rightBracket = ']'
	leftParens   = '('
	rightParens  = ')'
	leftCurly    = '{'
	rightCurly   = '}'
	wildSingle   = '?'
	wild         = '*'
	not          = '!'
	colon        = ':'
	slash        = '-' // NOTE: despite its name this is the hyphen-minus character
	pipe         = '|'
	quot         = '\''
	space        = ' '
	tab          = '\t'
	amp          = '&'
	tilda        = '~'
	newline      = '\n'
	carriage     = '\r'
)
94
// digits is the set of decimal digit characters.
const digits = "0123456789"

// hexadecimal is the set of characters accepted as hex text (either case).
const hexadecimal = digits + "abcdefABCDEF"

// hexnonquote is hexadecimal plus space, newline and carriage return.
// acceptText uses it outside groups, where that whitespace is kept as part of
// the text run.
const hexnonquote = hexadecimal + " " + "\n" + "\r"

// digitswild is digits plus '*'; insideWild uses it for the part of a
// wildcard range that follows the '-'.
const digitswild = digits + "*"

// eof is the sentinel returned by next once the input is exhausted.
const eof = -1
104
// stateFn is one state of the lexer: it scans part of the input and returns
// the state to run next, or nil to stop the state machine.
type stateFn func(*lexer) stateFn
106
// lexer holds the state of the scanner.
type lexer struct {
	name    string    // label included in error messages
	input   string    // the string being scanned
	state   stateFn   // the state function currently being run
	pos     int       // current byte offset in input
	start   int       // byte offset at which the pending item starts
	width   int       // byte width of the last rune read by next (0 at end of input)
	lastPos int       // pos of the item most recently returned by nextItem
	items   chan item // channel on which scanned items are delivered
}
118
119// next returns the next rune in the input.
120func (l *lexer) next() rune {
121	if int(l.pos) >= len(l.input) {
122		l.width = 0
123		return eof
124	}
125	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
126	l.width = w
127	l.pos += l.width
128	return r
129}
130
131// peek returns but does not consume the next rune in the input.
132func (l *lexer) peek() rune {
133	r := l.next()
134	l.backup()
135	return r
136}
137
// backup steps back one rune by subtracting the width recorded by the last
// call to next. Can only be called once per call of next: width is not reset
// here, so a second call would step back by the same amount again.
func (l *lexer) backup() {
	l.pos -= l.width
}
142
143// emit passes an item back to the client.
144func (l *lexer) emit(t itemType) {
145	l.items <- item{t, l.start, l.input[l.start:l.pos]}
146	l.start = l.pos
147}
148
149// acceptRun consumes a run of runes from the valid set.
150func (l *lexer) acceptRun(valid string) {
151	for strings.IndexRune(valid, l.next()) >= 0 {
152	}
153	l.backup()
154}
155
156// acceptText consumes a run of runes that are deemed to be plain sequences (hex or quoted values)
157func (l *lexer) acceptText(group bool) error {
158	valid := hexnonquote
159	if group {
160		valid = hexadecimal
161	}
162	for {
163		l.acceptRun(valid)
164		switch l.peek() {
165		default:
166			return nil
167		case quot:
168			r := l.next()
169			for r = l.next(); r != eof && r != quot; r = l.next() {
170			}
171			if r != quot {
172				return fmt.Errorf("expected closing quote, got %v", r)
173			}
174		}
175	}
176}
177
178// errorf returns an error token and terminates the scan by passing
179// back a nil pointer that will be the next state, terminating l.nextItem.
180func (l *lexer) errorf(format string, args ...interface{}) stateFn {
181	l.items <- item{itemError, l.start, fmt.Sprintf("Lex error in "+l.name+": "+format, args...)}
182	return nil
183}
184
185// nextItem returns the next item from the input.
186func (l *lexer) nextItem() item {
187	item := <-l.items
188	l.lastPos = item.pos
189	return item
190}
191
192// lex creates a new scanner for the input string.
193func lex(name, input string, start stateFn) *lexer {
194	l := &lexer{
195		name:  name,
196		input: input,
197		items: make(chan item),
198	}
199	go l.run(start)
200	return l
201}
202
203// run runs the state machine for the lexer.
204func (l *lexer) run(start stateFn) {
205	for l.state = start; l.state != nil; {
206		l.state = l.state(l)
207	}
208}
209
// lexPRONOM starts a lexer for PRONOM signature files (reports, container and
// droid). name labels the input in error messages; scanning begins in the
// insideText state.
func lexPRONOM(name, input string) *lexer {
	return lex(name, input, insideText)
}
214
215func insideText(l *lexer) stateFn {
216	if err := l.acceptText(false); err != nil {
217		return l.errorf(err.Error())
218	}
219	if l.pos > l.start {
220		l.emit(itemUnprocessedText)
221	}
222	r := l.next()
223	switch r {
224	default:
225		return l.errorf("encountered invalid character %q", r)
226	case eof:
227		l.emit(itemEOF)
228		return nil
229	case leftBracket:
230		l.emit(itemEnterGroup)
231		return insideLeftBracket
232	case leftParens:
233		l.emit(itemEnterGroup)
234		return insideLeftParens
235	case leftCurly:
236		l.emit(itemCurlyLeft)
237		return insideWild
238	case wildSingle:
239		return insideWildSingle
240	case wild:
241		l.emit(itemWild)
242		return insideText
243	}
244}
245
246func (l *lexer) insideGroup(boundary itemType) stateFn {
247	depth := 1
248	for {
249		if err := l.acceptText(true); err != nil {
250			return l.errorf(err.Error())
251		}
252		if l.pos > l.start {
253			l.emit(itemUnprocessedText)
254		}
255		r := l.next()
256		switch r {
257		default:
258			return l.errorf("encountered invalid character %q", r)
259		case leftBracket:
260			l.emit(itemEnterGroup)
261			depth++
262		case rightBracket:
263			l.emit(itemExitGroup)
264			depth--
265			if depth == 0 {
266				if boundary != rightBracket {
267					return l.errorf("expected group to close with %q, got %q", boundary, r)
268				}
269				return insideText
270			}
271		case rightParens:
272			if boundary != rightParens {
273				return l.errorf("expected group to close with %q, got %q", boundary, r)
274			}
275			l.emit(itemExitGroup)
276			return insideText
277		case not:
278			l.emit(itemNotMarker)
279		case pipe, space, tab:
280			l.emit(itemChoiceMarker)
281		case colon, slash:
282			l.emit(itemRangeMarker)
283		case amp:
284			l.emit(itemMaskMarker)
285		case tilda:
286			l.emit(itemAnyMaskMarker)
287		}
288	}
289}
290
// insideLeftBracket is the state entered after a '[' at the top level: it
// lexes the group contents, expecting the group to be closed by ']'.
func insideLeftBracket(l *lexer) stateFn {
	return l.insideGroup(rightBracket)
}
294
// insideLeftParens is the state entered after a '(' at the top level: it
// lexes the group contents, expecting the group to be closed by ')'.
func insideLeftParens(l *lexer) stateFn {
	return l.insideGroup(rightParens)
}
298
299func insideWildSingle(l *lexer) stateFn {
300	r := l.next()
301	if r == wildSingle {
302		l.emit(itemWildSingle)
303		return insideText
304	}
305	return l.errorf("expecting a double '?', got %q", r)
306}
307
308func insideWild(l *lexer) stateFn {
309	l.acceptRun(digits) // don't accept a '*' as start of range
310	if l.pos > l.start {
311		l.emit(itemWildStart)
312	}
313	r := l.next()
314	if r == slash {
315		l.emit(itemSlash)
316		l.acceptRun(digitswild)
317		if l.pos > l.start {
318			l.emit(itemWildEnd)
319		}
320		r = l.next()
321	}
322	if r == rightCurly {
323		l.emit(itemCurlyRight)
324		return insideText
325	}
326	return l.errorf("expecting a closing bracket, got %q", r)
327}
328
// lexText starts the text lexer over input: it splits a plain sequence into
// runs of hex digits, single-quoted strings and whitespace. The lexer is
// named "textProcessor" in error messages and begins in the
// insideUnprocessedText state.
func lexText(input string) *lexer {
	return lex("textProcessor", input, insideUnprocessedText)
}
333
334func insideUnprocessedText(l *lexer) stateFn {
335	for {
336		l.acceptRun(hexadecimal)
337		if l.pos > l.start {
338			l.emit(itemHexText)
339		}
340		switch l.next() {
341		default:
342			l.backup()
343			return l.errorf("unexpected character in text: %q", l.next())
344		case eof:
345			l.emit(itemEOF)
346			return nil
347		case quot:
348			l.emit(itemQuote)
349			return insideQuoteText
350		case space, tab, newline, carriage:
351			l.emit(itemSpace)
352		}
353	}
354}
355
356func insideQuoteText(l *lexer) stateFn {
357	r := l.next()
358	for ; r != eof && r != quot; r = l.next() {
359	}
360	if r == quot {
361		l.backup()
362		l.emit(itemQuoteText)
363		l.next()
364		l.emit(itemQuote)
365		return insideUnprocessedText
366	}
367	return l.errorf("expected closing quote, reached end of string")
368}
369