/*
Copyright 2012 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.

The basic use case uses the default ASCII lexer to split a string into
sub-strings:

	shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

	l := NewLexer(os.Stdin)
	for {
		token, err := l.Next()
		if err != nil {
			break // err is io.EOF once the input is exhausted
		}
		// process token
	}

To access the raw token stream (which includes tokens for comments):

	t := NewTokenizer(os.Stdin)
	for {
		token, err := t.Next()
		if err != nil {
			break // err is io.EOF once the input is exhausted
		}
		// process token
	}
*/
package shlex

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// TokenType is a top-level token classification: a word, space, comment, or unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: a quote, space, or escape.
type runeTokenClass int

// lexerState is the internal state used by the lexer state machine.
type lexerState int

// Token is a (type, value) pair representing a lexical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a and b are equal.
// Two tokens are equal if both their types and values are equal. A nil token can
// never be equal to another token.
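//
// For example (a sketch; Token's fields are unexported, so this only applies
// within this package):
//
//	a := &Token{tokenType: WordToken, value: "x"}
//	b := &Token{tokenType: WordToken, value: "x"}
//	a.Equal(b)   // true
//	a.Equal(nil) // false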
func (a *Token) Equal(b *Token) bool {
	if a == nil || b == nil {
		return false
	}
	if a.tokenType != b.tokenType {
		return false
	}
	return a.value == b.value
}

// Named classes of UTF-8 runes
const (
	spaceRunes            = " \t\r\n"
	escapingQuoteRunes    = `"`
	nonEscapingQuoteRunes = "'"
	escapeRunes           = `\`
	commentRunes          = "#"
)

// Classes of rune token
const (
	unknownRuneClass runeTokenClass = iota
	spaceRuneClass
	escapingQuoteRuneClass
	nonEscapingQuoteRuneClass
	escapeRuneClass
	commentRuneClass
	eofRuneClass
)

// Classes of lexical token
const (
	UnknownToken TokenType = iota
	WordToken
	SpaceToken
	CommentToken
)

// Lexer state machine states
const (
	startState           lexerState = iota // no runes have been seen
	inWordState                            // processing regular runes in a word
	escapingState                          // we have just consumed an escape rune; the next rune is literal
	escapingQuotedState                    // we have just consumed an escape rune within a quoted string
	quotingEscapingState                   // we are within a quoted string that supports escaping ("...")
	quotingState                           // we are within a string that does not support escaping ('...')
	commentState                           // we are within a comment (everything following an unquoted or unescaped #)
)
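
// As an illustration of the transitions (a hedged sketch, not verified
// output): lexing the input `a\ "b c"` moves startState -> inWordState ('a')
// -> escapingState ('\') -> inWordState (the escaped space is kept literal)
// -> quotingEscapingState ('"') -> ... -> inWordState (closing '"'), and the
// EOF that follows emits the single word token `a b c`.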

// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass

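// addRuneClass registers each rune in runes as belonging to the given token class.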
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
	for _, runeChar := range runes {
		typeMap[runeChar] = tokenType
	}
}

// newDefaultClassifier creates a new classifier for ASCII characters.
func newDefaultClassifier() tokenClassifier {
	t := tokenClassifier{}
	t.addRuneClass(spaceRunes, spaceRuneClass)
	t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
	t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
	t.addRuneClass(escapeRunes, escapeRuneClass)
	t.addRuneClass(commentRunes, commentRuneClass)
	return t
}

// ClassifyRune classifies a rune. Runes not registered with the classifier
// map to unknownRuneClass, the zero value.
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
	return t[runeVal]
}

// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
type Lexer Tokenizer

// NewLexer creates a new lexer from an input stream.
func NewLexer(r io.Reader) *Lexer {
	return (*Lexer)(NewTokenizer(r))
}

// Next returns the next word, or an error. If there are no more words,
// the error will be io.EOF.
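//
// A minimal read loop (a sketch; anything beyond io.EOF handling is left to
// the caller):
//
//	for {
//		word, err := l.Next()
//		if err == io.EOF {
//			break
//		} else if err != nil {
//			// handle the error
//		}
//		fmt.Println(word)
//	}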
func (l *Lexer) Next() (string, error) {
	for {
		token, err := (*Tokenizer)(l).Next()
		if err != nil {
			return "", err
		}
		switch token.tokenType {
		case WordToken:
			return token.value, nil
		case CommentToken:
			// skip comments
		default:
			return "", fmt.Errorf("unknown token type: %v", token.tokenType)
		}
	}
}

// Tokenizer turns an input stream into a sequence of typed tokens.
type Tokenizer struct {
	input      bufio.Reader
	classifier tokenClassifier
}

// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(r io.Reader) *Tokenizer {
	input := bufio.NewReader(r)
	classifier := newDefaultClassifier()
	return &Tokenizer{
		input:      *input,
		classifier: classifier}
}

// scanStream scans the stream for the next token using the internal state machine.
// It returns an error if the underlying reader fails or if the lexer reaches a
// state it does not know how to handle; unclassified runes are simply treated
// as ordinary characters.
func (t *Tokenizer) scanStream() (*Token, error) {
	state := startState
	var tokenType TokenType
	var value []rune
	var nextRune rune
	var nextRuneType runeTokenClass
	var err error

	for {
		nextRune, _, err = t.input.ReadRune()
		nextRuneType = t.classifier.ClassifyRune(nextRune)

		if err == io.EOF {
			nextRuneType = eofRuneClass
			err = nil
		} else if err != nil {
			return nil, err
		}

		switch state {
		case startState: // no runes read yet
			switch nextRuneType {
			case eofRuneClass:
				return nil, io.EOF
			case spaceRuneClass:
				// skip leading whitespace
			case escapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingState
			case escapeRuneClass:
				tokenType = WordToken
				state = escapingState
			case commentRuneClass:
				tokenType = CommentToken
				state = commentState
			default:
				tokenType = WordToken
				value = append(value, nextRune)
				state = inWordState
			}
		case inWordState: // in a regular word
			switch nextRuneType {
			case eofRuneClass, spaceRuneClass:
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case escapingQuoteRuneClass:
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				state = quotingState
			case escapeRuneClass:
				state = escapingState
			default:
				value = append(value, nextRune)
			}
		case escapingState: // the rune after an escape character
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			default:
				state = inWordState
				value = append(value, nextRune)
			}
		case escapingQuotedState: // the next rune after an escape character, in double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			default:
				state = quotingEscapingState
				value = append(value, nextRune)
			}
		case quotingEscapingState: // in escaping double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case escapingQuoteRuneClass:
				state = inWordState
			case escapeRuneClass:
				state = escapingQuotedState
			default:
				value = append(value, nextRune)
			}
		case quotingState: // in non-escaping single quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case nonEscapingQuoteRuneClass:
				state = inWordState
			default:
				value = append(value, nextRune)
			}
		case commentState: // in a comment
			switch nextRuneType {
			case eofRuneClass:
				token := &Token{
					tokenType: tokenType,
					value:     string(value)}
				return token, err
			case spaceRuneClass:
				if nextRune == '\n' {
					state = startState
					token := &Token{
						tokenType: tokenType,
						value:     string(value)}
					return token, err
				}
				value = append(value, nextRune)
			default:
				value = append(value, nextRune)
			}
		default:
			return nil, fmt.Errorf("unexpected state: %v", state)
		}
	}
}

// Next returns the next token in the stream.
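//
// Unlike Lexer.Next, comment tokens are returned rather than skipped: for the
// input `ls #files`, the expected sequence (a sketch, not verified output) is
// a WordToken with value "ls" followed by a CommentToken with value "files".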
func (t *Tokenizer) Next() (*Token, error) {
	return t.scanStream()
}

// Split partitions a string into a slice of strings using shell-style rules
// for quoting and commenting.
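//
// For example (mirroring the package documentation):
//
//	words, err := Split(`one "two three" four`)
//	// words == []string{"one", "two three", "four"}, err == nil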
func Split(s string) ([]string, error) {
	l := NewLexer(strings.NewReader(s))
	subStrings := make([]string, 0)
	for {
		word, err := l.Next()
		if err != nil {
			if err == io.EOF {
				return subStrings, nil
			}
			return subStrings, err
		}
		subStrings = append(subStrings, word)
	}
}