1// The bulk of this lexer code is taken from http://golang.org/src/pkg/text/template/parse/lex.go 2// Described in a talk by Rob Pike: http://cuddle.googlecode.com/hg/talk/lex.html#title-slide and http://www.youtube.com/watch?v=HxaD_trXwRE 3// 4// Copyright 2011 The Go Authors. All rights reserved. 5// Use of this source code is governed by a BSD-style 6// license that can be found in the LICENSE file. (Available here: http://golang.org/LICENSE) 7// 8// For the remainder of the file: 9// Copyright 2014 Richard Lehane. All rights reserved. 10// 11// Licensed under the Apache License, Version 2.0 (the "License"); 12// you may not use this file except in compliance with the License. 13// You may obtain a copy of the License at 14// 15// http://www.apache.org/licenses/LICENSE-2.0 16// 17// Unless required by applicable law or agreed to in writing, software 18// distributed under the License is distributed on an "AS IS" BASIS, 19// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20// See the License for the specific language governing permissions and 21// limitations under the License. 22 23package pronom 24 25import ( 26 "fmt" 27 "strings" 28 "unicode/utf8" 29) 30 31type item struct { 32 typ itemType 33 pos int 34 val string 35} 36 37func (i item) String() string { 38 switch { 39 case i.typ == itemEOF: 40 return "EOF" 41 case i.typ == itemError: 42 return i.val 43 } 44 return fmt.Sprintf("%q", i.val) 45} 46 47type itemType int 48 49const ( 50 itemError itemType = iota 51 itemEOF 52 itemCurlyLeft 53 itemCurlyRight 54 itemWildStart 55 itemSlash 56 itemWildEnd 57 itemWildSingle //?? 58 itemWild //* 59 itemUnprocessedText 60 itemEnterGroup 61 itemExitGroup 62 itemChoiceMarker 63 itemNotMarker 64 itemRangeMarker 65 itemMaskMarker 66 itemAnyMaskMarker 67 itemHexText 68 itemQuoteText 69 itemQuote 70 itemSpace 71) 72 73const ( 74 leftBracket = '[' 75 rightBracket = ']' 76 leftParens = '(' 77 rightParens = ')' 78 leftCurly = '{' 79 rightCurly = '}' 80 wildSingle = '?' 81 wild = '*' 82 not = '!' 83 colon = ':' 84 slash = '-' 85 pipe = '|' 86 quot = '\'' 87 space = ' ' 88 tab = '\t' 89 amp = '&' 90 tilda = '~' 91 newline = '\n' 92 carriage = '\r' 93) 94 95const digits = "0123456789" 96 97const hexadecimal = digits + "abcdefABCDEF" 98 99const hexnonquote = hexadecimal + " " + "\n" + "\r" 100 101const digitswild = digits + "*" 102 103const eof = -1 104 105type stateFn func(*lexer) stateFn 106 107// lexer holds the state of the scanner. 108type lexer struct { 109 name string 110 input string 111 state stateFn 112 pos int 113 start int 114 width int 115 lastPos int 116 items chan item 117} 118 119// next returns the next rune in the input. 120func (l *lexer) next() rune { 121 if int(l.pos) >= len(l.input) { 122 l.width = 0 123 return eof 124 } 125 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) 126 l.width = w 127 l.pos += l.width 128 return r 129} 130 131// peek returns but does not consume the next rune in the input. 132func (l *lexer) peek() rune { 133 r := l.next() 134 l.backup() 135 return r 136} 137 138// backup steps back one rune. Can only be called once per call of next. 139func (l *lexer) backup() { 140 l.pos -= l.width 141} 142 143// emit passes an item back to the client. 144func (l *lexer) emit(t itemType) { 145 l.items <- item{t, l.start, l.input[l.start:l.pos]} 146 l.start = l.pos 147} 148 149// acceptRun consumes a run of runes from the valid set. 150func (l *lexer) acceptRun(valid string) { 151 for strings.IndexRune(valid, l.next()) >= 0 { 152 } 153 l.backup() 154} 155 156// acceptText consumes a run of runes that are deemed to be plain sequences (hex or quoted values) 157func (l *lexer) acceptText(group bool) error { 158 valid := hexnonquote 159 if group { 160 valid = hexadecimal 161 } 162 for { 163 l.acceptRun(valid) 164 switch l.peek() { 165 default: 166 return nil 167 case quot: 168 r := l.next() 169 for r = l.next(); r != eof && r != quot; r = l.next() { 170 } 171 if r != quot { 172 return fmt.Errorf("expected closing quote, got %v", r) 173 } 174 } 175 } 176} 177 178// errorf returns an error token and terminates the scan by passing 179// back a nil pointer that will be the next state, terminating l.nextItem. 180func (l *lexer) errorf(format string, args ...interface{}) stateFn { 181 l.items <- item{itemError, l.start, fmt.Sprintf("Lex error in "+l.name+": "+format, args...)} 182 return nil 183} 184 185// nextItem returns the next item from the input. 186func (l *lexer) nextItem() item { 187 item := <-l.items 188 l.lastPos = item.pos 189 return item 190} 191 192// lex creates a new scanner for the input string. 193func lex(name, input string, start stateFn) *lexer { 194 l := &lexer{ 195 name: name, 196 input: input, 197 items: make(chan item), 198 } 199 go l.run(start) 200 return l 201} 202 203// run runs the state machine for the lexer. 204func (l *lexer) run(start stateFn) { 205 for l.state = start; l.state != nil; { 206 l.state = l.state(l) 207 } 208} 209 210// lexer for PRONOM signature files - reports, container and droid 211func lexPRONOM(name, input string) *lexer { 212 return lex(name, input, insideText) 213} 214 215func insideText(l *lexer) stateFn { 216 if err := l.acceptText(false); err != nil { 217 return l.errorf(err.Error()) 218 } 219 if l.pos > l.start { 220 l.emit(itemUnprocessedText) 221 } 222 r := l.next() 223 switch r { 224 default: 225 return l.errorf("encountered invalid character %q", r) 226 case eof: 227 l.emit(itemEOF) 228 return nil 229 case leftBracket: 230 l.emit(itemEnterGroup) 231 return insideLeftBracket 232 case leftParens: 233 l.emit(itemEnterGroup) 234 return insideLeftParens 235 case leftCurly: 236 l.emit(itemCurlyLeft) 237 return insideWild 238 case wildSingle: 239 return insideWildSingle 240 case wild: 241 l.emit(itemWild) 242 return insideText 243 } 244} 245 246func (l *lexer) insideGroup(boundary itemType) stateFn { 247 depth := 1 248 for { 249 if err := l.acceptText(true); err != nil { 250 return l.errorf(err.Error()) 251 } 252 if l.pos > l.start { 253 l.emit(itemUnprocessedText) 254 } 255 r := l.next() 256 switch r { 257 default: 258 return l.errorf("encountered invalid character %q", r) 259 case leftBracket: 260 l.emit(itemEnterGroup) 261 depth++ 262 case rightBracket: 263 l.emit(itemExitGroup) 264 depth-- 265 if depth == 0 { 266 if boundary != rightBracket { 267 return l.errorf("expected group to close with %q, got %q", boundary, r) 268 } 269 return insideText 270 } 271 case rightParens: 272 if boundary != rightParens { 273 return l.errorf("expected group to close with %q, got %q", boundary, r) 274 } 275 l.emit(itemExitGroup) 276 return insideText 277 case not: 278 l.emit(itemNotMarker) 279 case pipe, space, tab: 280 l.emit(itemChoiceMarker) 281 case colon, slash: 282 l.emit(itemRangeMarker) 283 case amp: 284 l.emit(itemMaskMarker) 285 case tilda: 286 l.emit(itemAnyMaskMarker) 287 } 288 } 289} 290 291func insideLeftBracket(l *lexer) stateFn { 292 return l.insideGroup(rightBracket) 293} 294 295func insideLeftParens(l *lexer) stateFn { 296 return l.insideGroup(rightParens) 297} 298 299func insideWildSingle(l *lexer) stateFn { 300 r := l.next() 301 if r == wildSingle { 302 l.emit(itemWildSingle) 303 return insideText 304 } 305 return l.errorf("expecting a double '?', got %q", r) 306} 307 308func insideWild(l *lexer) stateFn { 309 l.acceptRun(digits) // don't accept a '*' as start of range 310 if l.pos > l.start { 311 l.emit(itemWildStart) 312 } 313 r := l.next() 314 if r == slash { 315 l.emit(itemSlash) 316 l.acceptRun(digitswild) 317 if l.pos > l.start { 318 l.emit(itemWildEnd) 319 } 320 r = l.next() 321 } 322 if r == rightCurly { 323 l.emit(itemCurlyRight) 324 return insideText 325 } 326 return l.errorf("expecting a closing bracket, got %q", r) 327} 328 329// text lexer 330func lexText(input string) *lexer { 331 return lex("textProcessor", input, insideUnprocessedText) 332} 333 334func insideUnprocessedText(l *lexer) stateFn { 335 for { 336 l.acceptRun(hexadecimal) 337 if l.pos > l.start { 338 l.emit(itemHexText) 339 } 340 switch l.next() { 341 default: 342 l.backup() 343 return l.errorf("unexpected character in text: %q", l.next()) 344 case eof: 345 l.emit(itemEOF) 346 return nil 347 case quot: 348 l.emit(itemQuote) 349 return insideQuoteText 350 case space, tab, newline, carriage: 351 l.emit(itemSpace) 352 } 353 } 354} 355 356func insideQuoteText(l *lexer) stateFn { 357 r := l.next() 358 for ; r != eof && r != quot; r = l.next() { 359 } 360 if r == quot { 361 l.backup() 362 l.emit(itemQuoteText) 363 l.next() 364 l.emit(itemQuote) 365 return insideUnprocessedText 366 } 367 return l.errorf("expected closing quote, reached end of string") 368} 369