// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package scanner implements a scanner for gcfg configuration text.
// It takes a []byte as source which can then be tokenized
// through repeated calls to the Scan method.
//
// Note that the API for the scanner package may change to accommodate new
// features or implementation changes in gcfg.
package scanner

import (
	"fmt"
	"path/filepath"
	"unicode"
	"unicode/utf8"
)

import (
	"github.com/go-git/gcfg/token"
)

// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
type ErrorHandler func(pos token.Position, msg string)

// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle; receives line information while scanning
	dir  string       // directory portion of file.Name()
	src  []byte       // source text being scanned
	err  ErrorHandler // error reporting callback; nil means errors are only counted
	mode Mode         // scanning mode flags

	// scanning state
	ch         rune // current character; a negative value means end of file
	offset     int  // byte offset of ch within src
	rdOffset   int  // reading offset (byte position just after ch)
	lineOffset int  // byte offset at which the current line starts
	nextVal    bool // next token is expected to be a value (set after an '=' token)

	// public state - ok to modify
	ErrorCount int // number of errors encountered; reset to zero by Init
}

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
57// 58func (s *Scanner) next() { 59 if s.rdOffset < len(s.src) { 60 s.offset = s.rdOffset 61 if s.ch == '\n' { 62 s.lineOffset = s.offset 63 s.file.AddLine(s.offset) 64 } 65 r, w := rune(s.src[s.rdOffset]), 1 66 switch { 67 case r == 0: 68 s.error(s.offset, "illegal character NUL") 69 case r >= 0x80: 70 // not ASCII 71 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 72 if r == utf8.RuneError && w == 1 { 73 s.error(s.offset, "illegal UTF-8 encoding") 74 } 75 } 76 s.rdOffset += w 77 s.ch = r 78 } else { 79 s.offset = len(s.src) 80 if s.ch == '\n' { 81 s.lineOffset = s.offset 82 s.file.AddLine(s.offset) 83 } 84 s.ch = -1 // eof 85 } 86} 87 88// A mode value is a set of flags (or 0). 89// They control scanner behavior. 90// 91type Mode uint 92 93const ( 94 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 95) 96 97// Init prepares the scanner s to tokenize the text src by setting the 98// scanner at the beginning of src. The scanner uses the file set file 99// for position information and it adds line information for each line. 100// It is ok to re-use the same file when re-scanning the same file as 101// line information which is already present is ignored. Init causes a 102// panic if the file size does not match the src size. 103// 104// Calls to Scan will invoke the error handler err if they encounter a 105// syntax error and err is not nil. Also, for each error encountered, 106// the Scanner field ErrorCount is incremented by one. The mode parameter 107// determines how comments are handled. 108// 109// Note that Init may call err if there is an error in the first character 110// of the file. 111// 112func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 113 // Explicitly initialize all fields since a scanner may be reused. 
114 if file.Size() != len(src) { 115 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 116 } 117 s.file = file 118 s.dir, _ = filepath.Split(file.Name()) 119 s.src = src 120 s.err = err 121 s.mode = mode 122 123 s.ch = ' ' 124 s.offset = 0 125 s.rdOffset = 0 126 s.lineOffset = 0 127 s.ErrorCount = 0 128 s.nextVal = false 129 130 s.next() 131} 132 133func (s *Scanner) error(offs int, msg string) { 134 if s.err != nil { 135 s.err(s.file.Position(s.file.Pos(offs)), msg) 136 } 137 s.ErrorCount++ 138} 139 140func (s *Scanner) scanComment() string { 141 // initial [;#] already consumed 142 offs := s.offset - 1 // position of initial [;#] 143 144 for s.ch != '\n' && s.ch >= 0 { 145 s.next() 146 } 147 return string(s.src[offs:s.offset]) 148} 149 150func isLetter(ch rune) bool { 151 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= 0x80 && unicode.IsLetter(ch) 152} 153 154func isDigit(ch rune) bool { 155 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch) 156} 157 158func (s *Scanner) scanIdentifier() string { 159 offs := s.offset 160 for isLetter(s.ch) || isDigit(s.ch) || s.ch == '-' { 161 s.next() 162 } 163 return string(s.src[offs:s.offset]) 164} 165 166func (s *Scanner) scanEscape(val bool) { 167 offs := s.offset 168 ch := s.ch 169 s.next() // always make progress 170 switch ch { 171 case '\\', '"': 172 // ok 173 case 'n', 't', 'b': 174 if val { 175 break // ok 176 } 177 fallthrough 178 default: 179 s.error(offs, "unknown escape sequence") 180 } 181} 182 183func (s *Scanner) scanString() string { 184 // '"' opening already consumed 185 offs := s.offset - 1 186 187 for s.ch != '"' { 188 ch := s.ch 189 s.next() 190 if ch == '\n' || ch < 0 { 191 s.error(offs, "string not terminated") 192 break 193 } 194 if ch == '\\' { 195 s.scanEscape(false) 196 } 197 } 198 199 s.next() 200 201 return string(s.src[offs:s.offset]) 202} 203 204func stripCR(b []byte) []byte { 205 c := make([]byte, len(b)) 206 i := 0 207 for 
_, ch := range b { 208 if ch != '\r' { 209 c[i] = ch 210 i++ 211 } 212 } 213 return c[:i] 214} 215 216func (s *Scanner) scanValString() string { 217 offs := s.offset 218 219 hasCR := false 220 end := offs 221 inQuote := false 222loop: 223 for inQuote || s.ch >= 0 && s.ch != '\n' && s.ch != ';' && s.ch != '#' { 224 ch := s.ch 225 s.next() 226 switch { 227 case inQuote && ch == '\\': 228 s.scanEscape(true) 229 case !inQuote && ch == '\\': 230 if s.ch == '\r' { 231 hasCR = true 232 s.next() 233 } 234 if s.ch != '\n' { 235 s.scanEscape(true) 236 } else { 237 s.next() 238 } 239 case ch == '"': 240 inQuote = !inQuote 241 case ch == '\r': 242 hasCR = true 243 case ch < 0 || inQuote && ch == '\n': 244 s.error(offs, "string not terminated") 245 break loop 246 } 247 if inQuote || !isWhiteSpace(ch) { 248 end = s.offset 249 } 250 } 251 252 lit := s.src[offs:end] 253 if hasCR { 254 lit = stripCR(lit) 255 } 256 257 return string(lit) 258} 259 260func isWhiteSpace(ch rune) bool { 261 return ch == ' ' || ch == '\t' || ch == '\r' 262} 263 264func (s *Scanner) skipWhitespace() { 265 for isWhiteSpace(s.ch) { 266 s.next() 267 } 268} 269 270// Scan scans the next token and returns the token position, the token, 271// and its literal string if applicable. The source end is indicated by 272// token.EOF. 273// 274// If the returned token is a literal (token.IDENT, token.STRING) or 275// token.COMMENT, the literal string has the corresponding value. 276// 277// If the returned token is token.ILLEGAL, the literal string is the 278// offending character. 279// 280// In all other cases, Scan returns an empty literal string. 281// 282// For more tolerant parsing, Scan will return a valid token if 283// possible even if a syntax error was encountered. Thus, even 284// if the resulting token sequence contains no illegal tokens, 285// a client may not assume that no error occurred. 
Instead it 286// must check the scanner's ErrorCount or the number of calls 287// of the error handler, if there was one installed. 288// 289// Scan adds line information to the file added to the file 290// set with Init. Token positions are relative to that file 291// and thus relative to the file set. 292// 293func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 294scanAgain: 295 s.skipWhitespace() 296 297 // current token start 298 pos = s.file.Pos(s.offset) 299 300 // determine token value 301 switch ch := s.ch; { 302 case s.nextVal: 303 lit = s.scanValString() 304 tok = token.STRING 305 s.nextVal = false 306 case isLetter(ch): 307 lit = s.scanIdentifier() 308 tok = token.IDENT 309 default: 310 s.next() // always make progress 311 switch ch { 312 case -1: 313 tok = token.EOF 314 case '\n': 315 tok = token.EOL 316 case '"': 317 tok = token.STRING 318 lit = s.scanString() 319 case '[': 320 tok = token.LBRACK 321 case ']': 322 tok = token.RBRACK 323 case ';', '#': 324 // comment 325 lit = s.scanComment() 326 if s.mode&ScanComments == 0 { 327 // skip comment 328 goto scanAgain 329 } 330 tok = token.COMMENT 331 case '=': 332 tok = token.ASSIGN 333 s.nextVal = true 334 default: 335 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) 336 tok = token.ILLEGAL 337 lit = string(ch) 338 } 339 } 340 341 return 342} 343