1/* 2Copyright 2012 Google Inc. All Rights Reserved. 3 4Licensed under the Apache License, Version 2.0 (the "License"); 5you may not use this file except in compliance with the License. 6You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10Unless required by applicable law or agreed to in writing, software 11distributed under the License is distributed on an "AS IS" BASIS, 12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13See the License for the specific language governing permissions and 14limitations under the License. 15*/ 16 17package shlex 18 19/* 20Package shlex implements a simple lexer which splits input in to tokens using 21shell-style rules for quoting and commenting. 22*/ 23import ( 24 "bufio" 25 "errors" 26 "fmt" 27 "io" 28 "strings" 29) 30 31/* 32A TokenType is a top-level token; a word, space, comment, unknown. 33*/ 34type TokenType int 35 36/* 37A RuneTokenType is the type of a UTF-8 character; a character, quote, space, escape. 38*/ 39type RuneTokenType int 40 41type lexerState int 42 43type Token struct { 44 tokenType TokenType 45 value string 46} 47 48/* 49Two tokens are equal if both their types and values are equal. A nil token can 50never equal another token. 51*/ 52func (a *Token) Equal(b *Token) bool { 53 if a == nil || b == nil { 54 return false 55 } 56 if a.tokenType != b.tokenType { 57 return false 58 } 59 return a.value == b.value 60} 61 62const ( 63 RUNE_CHAR string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-,/@$*()+=><:;&^%~|!?[]{}" 64 RUNE_SPACE string = " \t\r\n" 65 RUNE_ESCAPING_QUOTE string = "\"" 66 RUNE_NONESCAPING_QUOTE string = "'" 67 RUNE_ESCAPE = "\\" 68 RUNE_COMMENT = "#" 69 70 RUNETOKEN_UNKNOWN RuneTokenType = 0 71 RUNETOKEN_CHAR RuneTokenType = 1 72 RUNETOKEN_SPACE RuneTokenType = 2 73 RUNETOKEN_ESCAPING_QUOTE RuneTokenType = 3 74 RUNETOKEN_NONESCAPING_QUOTE RuneTokenType = 4 75 RUNETOKEN_ESCAPE RuneTokenType = 5 76 RUNETOKEN_COMMENT RuneTokenType = 6 77 RUNETOKEN_EOF RuneTokenType = 7 78 79 TOKEN_UNKNOWN TokenType = 0 80 TOKEN_WORD TokenType = 1 81 TOKEN_SPACE TokenType = 2 82 TOKEN_COMMENT TokenType = 3 83 84 STATE_START lexerState = 0 85 STATE_INWORD lexerState = 1 86 STATE_ESCAPING lexerState = 2 87 STATE_ESCAPING_QUOTED lexerState = 3 88 STATE_QUOTED_ESCAPING lexerState = 4 89 STATE_QUOTED lexerState = 5 90 STATE_COMMENT lexerState = 6 91 92 INITIAL_TOKEN_CAPACITY int = 100 93) 94 95/* 96A type for classifying characters. This allows for different sorts of 97classifiers - those accepting extended non-ascii chars, or strict posix 98compatibility, for example. 99*/ 100type TokenClassifier struct { 101 typeMap map[int32]RuneTokenType 102} 103 104func addRuneClass(typeMap *map[int32]RuneTokenType, runes string, tokenType RuneTokenType) { 105 for _, rune := range runes { 106 (*typeMap)[int32(rune)] = tokenType 107 } 108} 109 110/* 111Create a new classifier for basic ASCII characters. 112*/ 113func NewDefaultClassifier() *TokenClassifier { 114 typeMap := map[int32]RuneTokenType{} 115 addRuneClass(&typeMap, RUNE_CHAR, RUNETOKEN_CHAR) 116 addRuneClass(&typeMap, RUNE_SPACE, RUNETOKEN_SPACE) 117 addRuneClass(&typeMap, RUNE_ESCAPING_QUOTE, RUNETOKEN_ESCAPING_QUOTE) 118 addRuneClass(&typeMap, RUNE_NONESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE) 119 addRuneClass(&typeMap, RUNE_ESCAPE, RUNETOKEN_ESCAPE) 120 addRuneClass(&typeMap, RUNE_COMMENT, RUNETOKEN_COMMENT) 121 return &TokenClassifier{ 122 typeMap: typeMap} 123} 124 125func (classifier *TokenClassifier) ClassifyRune(rune int32) RuneTokenType { 126 return classifier.typeMap[rune] 127} 128 129/* 130A type for turning an input stream in to a sequence of strings. Whitespace and 131comments are skipped. 132*/ 133type Lexer struct { 134 tokenizer *Tokenizer 135} 136 137/* 138Create a new lexer. 139*/ 140func NewLexer(r io.Reader) (*Lexer, error) { 141 142 tokenizer, err := NewTokenizer(r) 143 if err != nil { 144 return nil, err 145 } 146 lexer := &Lexer{tokenizer: tokenizer} 147 return lexer, nil 148} 149 150/* 151Return the next word, and an error value. If there are no more words, the error 152will be io.EOF. 153*/ 154func (l *Lexer) NextWord() (string, error) { 155 var token *Token 156 var err error 157 for { 158 token, err = l.tokenizer.NextToken() 159 if err != nil { 160 return "", err 161 } 162 switch token.tokenType { 163 case TOKEN_WORD: 164 { 165 return token.value, nil 166 } 167 case TOKEN_COMMENT: 168 { 169 // skip comments 170 } 171 default: 172 { 173 panic(fmt.Sprintf("Unknown token type: %v", token.tokenType)) 174 } 175 } 176 } 177 return "", io.EOF 178} 179 180/* 181A type for turning an input stream in to a sequence of typed tokens. 182*/ 183type Tokenizer struct { 184 input *bufio.Reader 185 classifier *TokenClassifier 186} 187 188/* 189Create a new tokenizer. 190*/ 191func NewTokenizer(r io.Reader) (*Tokenizer, error) { 192 input := bufio.NewReader(r) 193 classifier := NewDefaultClassifier() 194 tokenizer := &Tokenizer{ 195 input: input, 196 classifier: classifier} 197 return tokenizer, nil 198} 199 200/* 201Scan the stream for the next token. 202 203This uses an internal state machine. It will panic if it encounters a character 204which it does not know how to handle. 205*/ 206func (t *Tokenizer) scanStream() (*Token, error) { 207 state := STATE_START 208 var tokenType TokenType 209 value := make([]int32, 0, INITIAL_TOKEN_CAPACITY) 210 var ( 211 nextRune int32 212 nextRuneType RuneTokenType 213 err error 214 ) 215SCAN: 216 for { 217 nextRune, _, err = t.input.ReadRune() 218 nextRuneType = t.classifier.ClassifyRune(nextRune) 219 if err != nil { 220 if err == io.EOF { 221 nextRuneType = RUNETOKEN_EOF 222 err = nil 223 } else { 224 return nil, err 225 } 226 } 227 switch state { 228 case STATE_START: // no runes read yet 229 { 230 switch nextRuneType { 231 case RUNETOKEN_EOF: 232 { 233 return nil, io.EOF 234 } 235 case RUNETOKEN_CHAR: 236 { 237 tokenType = TOKEN_WORD 238 value = append(value, nextRune) 239 state = STATE_INWORD 240 } 241 case RUNETOKEN_SPACE: 242 { 243 } 244 case RUNETOKEN_ESCAPING_QUOTE: 245 { 246 tokenType = TOKEN_WORD 247 state = STATE_QUOTED_ESCAPING 248 } 249 case RUNETOKEN_NONESCAPING_QUOTE: 250 { 251 tokenType = TOKEN_WORD 252 state = STATE_QUOTED 253 } 254 case RUNETOKEN_ESCAPE: 255 { 256 tokenType = TOKEN_WORD 257 state = STATE_ESCAPING 258 } 259 case RUNETOKEN_COMMENT: 260 { 261 tokenType = TOKEN_COMMENT 262 state = STATE_COMMENT 263 } 264 default: 265 { 266 return nil, errors.New(fmt.Sprintf("Unknown rune: %v", nextRune)) 267 } 268 } 269 } 270 case STATE_INWORD: // in a regular word 271 { 272 switch nextRuneType { 273 case RUNETOKEN_EOF: 274 { 275 break SCAN 276 } 277 case RUNETOKEN_CHAR, RUNETOKEN_COMMENT: 278 { 279 value = append(value, nextRune) 280 } 281 case RUNETOKEN_SPACE: 282 { 283 t.input.UnreadRune() 284 break SCAN 285 } 286 case RUNETOKEN_ESCAPING_QUOTE: 287 { 288 state = STATE_QUOTED_ESCAPING 289 } 290 case RUNETOKEN_NONESCAPING_QUOTE: 291 { 292 state = STATE_QUOTED 293 } 294 case RUNETOKEN_ESCAPE: 295 { 296 state = STATE_ESCAPING 297 } 298 default: 299 { 300 return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune)) 301 } 302 } 303 } 304 case STATE_ESCAPING: // the next rune after an escape character 305 { 306 switch nextRuneType { 307 case RUNETOKEN_EOF: 308 { 309 err = errors.New("EOF found after escape character") 310 break SCAN 311 } 312 case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT: 313 { 314 state = STATE_INWORD 315 value = append(value, nextRune) 316 } 317 default: 318 { 319 return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune)) 320 } 321 } 322 } 323 case STATE_ESCAPING_QUOTED: // the next rune after an escape character, in double quotes 324 { 325 switch nextRuneType { 326 case RUNETOKEN_EOF: 327 { 328 err = errors.New("EOF found after escape character") 329 break SCAN 330 } 331 case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT: 332 { 333 state = STATE_QUOTED_ESCAPING 334 value = append(value, nextRune) 335 } 336 default: 337 { 338 return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune)) 339 } 340 } 341 } 342 case STATE_QUOTED_ESCAPING: // in escaping double quotes 343 { 344 switch nextRuneType { 345 case RUNETOKEN_EOF: 346 { 347 err = errors.New("EOF found when expecting closing quote.") 348 break SCAN 349 } 350 case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_COMMENT: 351 { 352 value = append(value, nextRune) 353 } 354 case RUNETOKEN_ESCAPING_QUOTE: 355 { 356 state = STATE_INWORD 357 } 358 case RUNETOKEN_ESCAPE: 359 { 360 state = STATE_ESCAPING_QUOTED 361 } 362 default: 363 { 364 return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune)) 365 } 366 } 367 } 368 case STATE_QUOTED: // in non-escaping single quotes 369 { 370 switch nextRuneType { 371 case RUNETOKEN_EOF: 372 { 373 err = errors.New("EOF found when expecting closing quote.") 374 break SCAN 375 } 376 case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT: 377 { 378 value = append(value, nextRune) 379 } 380 case RUNETOKEN_NONESCAPING_QUOTE: 381 { 382 state = STATE_INWORD 383 } 384 default: 385 { 386 return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune)) 387 } 388 } 389 } 390 case STATE_COMMENT: 391 { 392 switch nextRuneType { 393 case RUNETOKEN_EOF: 394 { 395 break SCAN 396 } 397 case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT, RUNETOKEN_NONESCAPING_QUOTE: 398 { 399 value = append(value, nextRune) 400 } 401 case RUNETOKEN_SPACE: 402 { 403 if nextRune == '\n' { 404 state = STATE_START 405 break SCAN 406 } else { 407 value = append(value, nextRune) 408 } 409 } 410 default: 411 { 412 return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune)) 413 } 414 } 415 } 416 default: 417 { 418 panic(fmt.Sprintf("Unexpected state: %v", state)) 419 } 420 } 421 } 422 token := &Token{ 423 tokenType: tokenType, 424 value: string(value)} 425 return token, err 426} 427 428/* 429Return the next token in the stream, and an error value. If there are no more 430tokens available, the error value will be io.EOF. 431*/ 432func (t *Tokenizer) NextToken() (*Token, error) { 433 return t.scanStream() 434} 435 436/* 437Split a string in to a slice of strings, based upon shell-style rules for 438quoting, escaping, and spaces. 439*/ 440func Split(s string) ([]string, error) { 441 l, err := NewLexer(strings.NewReader(s)) 442 if err != nil { 443 return nil, err 444 } 445 subStrings := []string{} 446 for { 447 word, err := l.NextWord() 448 if err != nil { 449 if err == io.EOF { 450 return subStrings, nil 451 } 452 return subStrings, err 453 } 454 subStrings = append(subStrings, word) 455 } 456 return subStrings, nil 457} 458