/*
Copyright 2012 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.

The basic use case uses the default ASCII lexer to split a string into sub-strings:

	shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

	l := NewLexer(os.Stdin)
	for {
		token, err := l.Next()
		if err != nil {
			break // err is io.EOF when the input is exhausted
		}
		// process token
	}

To access the raw token stream (which includes tokens for comments):

	t := NewTokenizer(os.Stdin)
	for {
		token, err := t.Next()
		if err != nil {
			break // err is io.EOF when the input is exhausted
		}
		// process token
	}
*/
package shlex

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// TokenType is a top-level token classification: A word, space, comment, unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: A quote, space, escape.
type runeTokenClass int

// the internal state used by the lexer state machine
type lexerState int

// Token is a (type, value) pair representing a lexical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a and b are equal.
// Two tokens are equal if both their types and values are equal. A nil token can
// never be equal to another token.
67func (a *Token) Equal(b *Token) bool { 68 if a == nil || b == nil { 69 return false 70 } 71 if a.tokenType != b.tokenType { 72 return false 73 } 74 return a.value == b.value 75} 76 77// Named classes of UTF-8 runes 78const ( 79 spaceRunes = " \t\r\n" 80 escapingQuoteRunes = `"` 81 nonEscapingQuoteRunes = "'" 82 escapeRunes = `\` 83 commentRunes = "#" 84) 85 86// Classes of rune token 87const ( 88 unknownRuneClass runeTokenClass = iota 89 spaceRuneClass 90 escapingQuoteRuneClass 91 nonEscapingQuoteRuneClass 92 escapeRuneClass 93 commentRuneClass 94 eofRuneClass 95) 96 97// Classes of lexographic token 98const ( 99 UnknownToken TokenType = iota 100 WordToken 101 SpaceToken 102 CommentToken 103) 104 105// Lexer state machine states 106const ( 107 startState lexerState = iota // no runes have been seen 108 inWordState // processing regular runes in a word 109 escapingState // we have just consumed an escape rune; the next rune is literal 110 escapingQuotedState // we have just consumed an escape rune within a quoted string 111 quotingEscapingState // we are within a quoted string that supports escaping ("...") 112 quotingState // we are within a string that does not support escaping ('...') 113 commentState // we are within a comment (everything following an unquoted or unescaped # 114) 115 116// tokenClassifier is used for classifying rune characters. 117type tokenClassifier map[rune]runeTokenClass 118 119func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) { 120 for _, runeChar := range runes { 121 typeMap[runeChar] = tokenType 122 } 123} 124 125// newDefaultClassifier creates a new classifier for ASCII characters. 
126func newDefaultClassifier() tokenClassifier { 127 t := tokenClassifier{} 128 t.addRuneClass(spaceRunes, spaceRuneClass) 129 t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass) 130 t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass) 131 t.addRuneClass(escapeRunes, escapeRuneClass) 132 t.addRuneClass(commentRunes, commentRuneClass) 133 return t 134} 135 136// ClassifyRune classifiees a rune 137func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass { 138 return t[runeVal] 139} 140 141// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped. 142type Lexer Tokenizer 143 144// NewLexer creates a new lexer from an input stream. 145func NewLexer(r io.Reader) *Lexer { 146 147 return (*Lexer)(NewTokenizer(r)) 148} 149 150// Next returns the next word, or an error. If there are no more words, 151// the error will be io.EOF. 152func (l *Lexer) Next() (string, error) { 153 for { 154 token, err := (*Tokenizer)(l).Next() 155 if err != nil { 156 return "", err 157 } 158 switch token.tokenType { 159 case WordToken: 160 return token.value, nil 161 case CommentToken: 162 // skip comments 163 default: 164 return "", fmt.Errorf("Unknown token type: %v", token.tokenType) 165 } 166 } 167} 168 169// Tokenizer turns an input stream into a sequence of typed tokens 170type Tokenizer struct { 171 input bufio.Reader 172 classifier tokenClassifier 173} 174 175// NewTokenizer creates a new tokenizer from an input stream. 176func NewTokenizer(r io.Reader) *Tokenizer { 177 input := bufio.NewReader(r) 178 classifier := newDefaultClassifier() 179 return &Tokenizer{ 180 input: *input, 181 classifier: classifier} 182} 183 184// scanStream scans the stream for the next token using the internal state machine. 185// It will panic if it encounters a rune which it does not know how to handle. 
186func (t *Tokenizer) scanStream() (*Token, error) { 187 state := startState 188 var tokenType TokenType 189 var value []rune 190 var nextRune rune 191 var nextRuneType runeTokenClass 192 var err error 193 194 for { 195 nextRune, _, err = t.input.ReadRune() 196 nextRuneType = t.classifier.ClassifyRune(nextRune) 197 198 if err == io.EOF { 199 nextRuneType = eofRuneClass 200 err = nil 201 } else if err != nil { 202 return nil, err 203 } 204 205 switch state { 206 case startState: // no runes read yet 207 { 208 switch nextRuneType { 209 case eofRuneClass: 210 { 211 return nil, io.EOF 212 } 213 case spaceRuneClass: 214 { 215 } 216 case escapingQuoteRuneClass: 217 { 218 tokenType = WordToken 219 state = quotingEscapingState 220 } 221 case nonEscapingQuoteRuneClass: 222 { 223 tokenType = WordToken 224 state = quotingState 225 } 226 case escapeRuneClass: 227 { 228 tokenType = WordToken 229 state = escapingState 230 } 231 case commentRuneClass: 232 { 233 tokenType = CommentToken 234 state = commentState 235 } 236 default: 237 { 238 tokenType = WordToken 239 value = append(value, nextRune) 240 state = inWordState 241 } 242 } 243 } 244 case inWordState: // in a regular word 245 { 246 switch nextRuneType { 247 case eofRuneClass: 248 { 249 token := &Token{ 250 tokenType: tokenType, 251 value: string(value)} 252 return token, err 253 } 254 case spaceRuneClass: 255 { 256 token := &Token{ 257 tokenType: tokenType, 258 value: string(value)} 259 return token, err 260 } 261 case escapingQuoteRuneClass: 262 { 263 state = quotingEscapingState 264 } 265 case nonEscapingQuoteRuneClass: 266 { 267 state = quotingState 268 } 269 case escapeRuneClass: 270 { 271 state = escapingState 272 } 273 default: 274 { 275 value = append(value, nextRune) 276 } 277 } 278 } 279 case escapingState: // the rune after an escape character 280 { 281 switch nextRuneType { 282 case eofRuneClass: 283 { 284 err = fmt.Errorf("EOF found after escape character") 285 token := &Token{ 286 tokenType: tokenType, 287 
value: string(value)} 288 return token, err 289 } 290 default: 291 { 292 state = inWordState 293 value = append(value, nextRune) 294 } 295 } 296 } 297 case escapingQuotedState: // the next rune after an escape character, in double quotes 298 { 299 switch nextRuneType { 300 case eofRuneClass: 301 { 302 err = fmt.Errorf("EOF found after escape character") 303 token := &Token{ 304 tokenType: tokenType, 305 value: string(value)} 306 return token, err 307 } 308 default: 309 { 310 state = quotingEscapingState 311 value = append(value, nextRune) 312 } 313 } 314 } 315 case quotingEscapingState: // in escaping double quotes 316 { 317 switch nextRuneType { 318 case eofRuneClass: 319 { 320 err = fmt.Errorf("EOF found when expecting closing quote") 321 token := &Token{ 322 tokenType: tokenType, 323 value: string(value)} 324 return token, err 325 } 326 case escapingQuoteRuneClass: 327 { 328 state = inWordState 329 } 330 case escapeRuneClass: 331 { 332 state = escapingQuotedState 333 } 334 default: 335 { 336 value = append(value, nextRune) 337 } 338 } 339 } 340 case quotingState: // in non-escaping single quotes 341 { 342 switch nextRuneType { 343 case eofRuneClass: 344 { 345 err = fmt.Errorf("EOF found when expecting closing quote") 346 token := &Token{ 347 tokenType: tokenType, 348 value: string(value)} 349 return token, err 350 } 351 case nonEscapingQuoteRuneClass: 352 { 353 state = inWordState 354 } 355 default: 356 { 357 value = append(value, nextRune) 358 } 359 } 360 } 361 case commentState: // in a comment 362 { 363 switch nextRuneType { 364 case eofRuneClass: 365 { 366 token := &Token{ 367 tokenType: tokenType, 368 value: string(value)} 369 return token, err 370 } 371 case spaceRuneClass: 372 { 373 if nextRune == '\n' { 374 state = startState 375 token := &Token{ 376 tokenType: tokenType, 377 value: string(value)} 378 return token, err 379 } else { 380 value = append(value, nextRune) 381 } 382 } 383 default: 384 { 385 value = append(value, nextRune) 386 } 387 } 388 } 389 
default: 390 { 391 return nil, fmt.Errorf("Unexpected state: %v", state) 392 } 393 } 394 } 395} 396 397// Next returns the next token in the stream. 398func (t *Tokenizer) Next() (*Token, error) { 399 return t.scanStream() 400} 401 402// Split partitions a string into a slice of strings. 403func Split(s string) ([]string, error) { 404 l := NewLexer(strings.NewReader(s)) 405 subStrings := make([]string, 0) 406 for { 407 word, err := l.Next() 408 if err != nil { 409 if err == io.EOF { 410 return subStrings, nil 411 } 412 return subStrings, err 413 } 414 subStrings = append(subStrings, word) 415 } 416} 417