1/*
2Copyright 2012 Google Inc. All Rights Reserved.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17package shlex
18
19/*
20Package shlex implements a simple lexer which splits input in to tokens using
21shell-style rules for quoting and commenting.
22*/
23import (
24	"bufio"
25	"errors"
26	"fmt"
27	"io"
28	"strings"
29)
30
31/*
32A TokenType is a top-level token; a word, space, comment, unknown.
33*/
34type TokenType int
35
36/*
37A RuneTokenType is the type of a UTF-8 character; a character, quote, space, escape.
38*/
39type RuneTokenType int
40
// lexerState identifies the states of the scanStream state machine.
type lexerState int

// A Token is a (type, value) pair produced by the tokenizer; value holds the
// token's text with quoting and escaping already resolved.
type Token struct {
	tokenType TokenType
	value     string
}
47
48/*
49Two tokens are equal if both their types and values are equal. A nil token can
50never equal another token.
51*/
52func (a *Token) Equal(b *Token) bool {
53	if a == nil || b == nil {
54		return false
55	}
56	if a.tokenType != b.tokenType {
57		return false
58	}
59	return a.value == b.value
60}
61
62const (
63	RUNE_CHAR              string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-,/@$*()+=><:;&^%~|!?[]{}"
64	RUNE_SPACE             string = " \t\r\n"
65	RUNE_ESCAPING_QUOTE    string = "\""
66	RUNE_NONESCAPING_QUOTE string = "'"
67	RUNE_ESCAPE                   = "\\"
68	RUNE_COMMENT                  = "#"
69
70	RUNETOKEN_UNKNOWN           RuneTokenType = 0
71	RUNETOKEN_CHAR              RuneTokenType = 1
72	RUNETOKEN_SPACE             RuneTokenType = 2
73	RUNETOKEN_ESCAPING_QUOTE    RuneTokenType = 3
74	RUNETOKEN_NONESCAPING_QUOTE RuneTokenType = 4
75	RUNETOKEN_ESCAPE            RuneTokenType = 5
76	RUNETOKEN_COMMENT           RuneTokenType = 6
77	RUNETOKEN_EOF               RuneTokenType = 7
78
79	TOKEN_UNKNOWN TokenType = 0
80	TOKEN_WORD    TokenType = 1
81	TOKEN_SPACE   TokenType = 2
82	TOKEN_COMMENT TokenType = 3
83
84	STATE_START           lexerState = 0
85	STATE_INWORD          lexerState = 1
86	STATE_ESCAPING        lexerState = 2
87	STATE_ESCAPING_QUOTED lexerState = 3
88	STATE_QUOTED_ESCAPING lexerState = 4
89	STATE_QUOTED          lexerState = 5
90	STATE_COMMENT         lexerState = 6
91
92	INITIAL_TOKEN_CAPACITY int = 100
93)
94
95/*
96A type for classifying characters. This allows for different sorts of
97classifiers - those accepting extended non-ascii chars, or strict posix
98compatibility, for example.
99*/
100type TokenClassifier struct {
101	typeMap map[int32]RuneTokenType
102}
103
104func addRuneClass(typeMap *map[int32]RuneTokenType, runes string, tokenType RuneTokenType) {
105	for _, rune := range runes {
106		(*typeMap)[int32(rune)] = tokenType
107	}
108}
109
110/*
111Create a new classifier for basic ASCII characters.
112*/
113func NewDefaultClassifier() *TokenClassifier {
114	typeMap := map[int32]RuneTokenType{}
115	addRuneClass(&typeMap, RUNE_CHAR, RUNETOKEN_CHAR)
116	addRuneClass(&typeMap, RUNE_SPACE, RUNETOKEN_SPACE)
117	addRuneClass(&typeMap, RUNE_ESCAPING_QUOTE, RUNETOKEN_ESCAPING_QUOTE)
118	addRuneClass(&typeMap, RUNE_NONESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE)
119	addRuneClass(&typeMap, RUNE_ESCAPE, RUNETOKEN_ESCAPE)
120	addRuneClass(&typeMap, RUNE_COMMENT, RUNETOKEN_COMMENT)
121	return &TokenClassifier{
122		typeMap: typeMap}
123}
124
125func (classifier *TokenClassifier) ClassifyRune(rune int32) RuneTokenType {
126	return classifier.typeMap[rune]
127}
128
129/*
130A type for turning an input stream in to a sequence of strings. Whitespace and
131comments are skipped.
132*/
133type Lexer struct {
134	tokenizer *Tokenizer
135}
136
137/*
138Create a new lexer.
139*/
140func NewLexer(r io.Reader) (*Lexer, error) {
141
142	tokenizer, err := NewTokenizer(r)
143	if err != nil {
144		return nil, err
145	}
146	lexer := &Lexer{tokenizer: tokenizer}
147	return lexer, nil
148}
149
150/*
151Return the next word, and an error value. If there are no more words, the error
152will be io.EOF.
153*/
154func (l *Lexer) NextWord() (string, error) {
155	var token *Token
156	var err error
157	for {
158		token, err = l.tokenizer.NextToken()
159		if err != nil {
160			return "", err
161		}
162		switch token.tokenType {
163		case TOKEN_WORD:
164			{
165				return token.value, nil
166			}
167		case TOKEN_COMMENT:
168			{
169				// skip comments
170			}
171		default:
172			{
173				panic(fmt.Sprintf("Unknown token type: %v", token.tokenType))
174			}
175		}
176	}
177	return "", io.EOF
178}
179
180/*
181A type for turning an input stream in to a sequence of typed tokens.
182*/
183type Tokenizer struct {
184	input      *bufio.Reader
185	classifier *TokenClassifier
186}
187
188/*
189Create a new tokenizer.
190*/
191func NewTokenizer(r io.Reader) (*Tokenizer, error) {
192	input := bufio.NewReader(r)
193	classifier := NewDefaultClassifier()
194	tokenizer := &Tokenizer{
195		input:      input,
196		classifier: classifier}
197	return tokenizer, nil
198}
199
200/*
201Scan the stream for the next token.
202
203This uses an internal state machine. It will panic if it encounters a character
204which it does not know how to handle.
205*/
206func (t *Tokenizer) scanStream() (*Token, error) {
207	state := STATE_START
208	var tokenType TokenType
209	value := make([]int32, 0, INITIAL_TOKEN_CAPACITY)
210	var (
211		nextRune     int32
212		nextRuneType RuneTokenType
213		err          error
214	)
215SCAN:
216	for {
217		nextRune, _, err = t.input.ReadRune()
218		nextRuneType = t.classifier.ClassifyRune(nextRune)
219		if err != nil {
220			if err == io.EOF {
221				nextRuneType = RUNETOKEN_EOF
222				err = nil
223			} else {
224				return nil, err
225			}
226		}
227		switch state {
228		case STATE_START: // no runes read yet
229			{
230				switch nextRuneType {
231				case RUNETOKEN_EOF:
232					{
233						return nil, io.EOF
234					}
235				case RUNETOKEN_CHAR:
236					{
237						tokenType = TOKEN_WORD
238						value = append(value, nextRune)
239						state = STATE_INWORD
240					}
241				case RUNETOKEN_SPACE:
242					{
243					}
244				case RUNETOKEN_ESCAPING_QUOTE:
245					{
246						tokenType = TOKEN_WORD
247						state = STATE_QUOTED_ESCAPING
248					}
249				case RUNETOKEN_NONESCAPING_QUOTE:
250					{
251						tokenType = TOKEN_WORD
252						state = STATE_QUOTED
253					}
254				case RUNETOKEN_ESCAPE:
255					{
256						tokenType = TOKEN_WORD
257						state = STATE_ESCAPING
258					}
259				case RUNETOKEN_COMMENT:
260					{
261						tokenType = TOKEN_COMMENT
262						state = STATE_COMMENT
263					}
264				default:
265					{
266						return nil, errors.New(fmt.Sprintf("Unknown rune: %v", nextRune))
267					}
268				}
269			}
270		case STATE_INWORD: // in a regular word
271			{
272				switch nextRuneType {
273				case RUNETOKEN_EOF:
274					{
275						break SCAN
276					}
277				case RUNETOKEN_CHAR, RUNETOKEN_COMMENT:
278					{
279						value = append(value, nextRune)
280					}
281				case RUNETOKEN_SPACE:
282					{
283						t.input.UnreadRune()
284						break SCAN
285					}
286				case RUNETOKEN_ESCAPING_QUOTE:
287					{
288						state = STATE_QUOTED_ESCAPING
289					}
290				case RUNETOKEN_NONESCAPING_QUOTE:
291					{
292						state = STATE_QUOTED
293					}
294				case RUNETOKEN_ESCAPE:
295					{
296						state = STATE_ESCAPING
297					}
298				default:
299					{
300						return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
301					}
302				}
303			}
304		case STATE_ESCAPING: // the next rune after an escape character
305			{
306				switch nextRuneType {
307				case RUNETOKEN_EOF:
308					{
309						err = errors.New("EOF found after escape character")
310						break SCAN
311					}
312				case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
313					{
314						state = STATE_INWORD
315						value = append(value, nextRune)
316					}
317				default:
318					{
319						return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
320					}
321				}
322			}
323		case STATE_ESCAPING_QUOTED: // the next rune after an escape character, in double quotes
324			{
325				switch nextRuneType {
326				case RUNETOKEN_EOF:
327					{
328						err = errors.New("EOF found after escape character")
329						break SCAN
330					}
331				case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
332					{
333						state = STATE_QUOTED_ESCAPING
334						value = append(value, nextRune)
335					}
336				default:
337					{
338						return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
339					}
340				}
341			}
342		case STATE_QUOTED_ESCAPING: // in escaping double quotes
343			{
344				switch nextRuneType {
345				case RUNETOKEN_EOF:
346					{
347						err = errors.New("EOF found when expecting closing quote.")
348						break SCAN
349					}
350				case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_COMMENT:
351					{
352						value = append(value, nextRune)
353					}
354				case RUNETOKEN_ESCAPING_QUOTE:
355					{
356						state = STATE_INWORD
357					}
358				case RUNETOKEN_ESCAPE:
359					{
360						state = STATE_ESCAPING_QUOTED
361					}
362				default:
363					{
364						return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
365					}
366				}
367			}
368		case STATE_QUOTED: // in non-escaping single quotes
369			{
370				switch nextRuneType {
371				case RUNETOKEN_EOF:
372					{
373						err = errors.New("EOF found when expecting closing quote.")
374						break SCAN
375					}
376				case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
377					{
378						value = append(value, nextRune)
379					}
380				case RUNETOKEN_NONESCAPING_QUOTE:
381					{
382						state = STATE_INWORD
383					}
384				default:
385					{
386						return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
387					}
388				}
389			}
390		case STATE_COMMENT:
391			{
392				switch nextRuneType {
393				case RUNETOKEN_EOF:
394					{
395						break SCAN
396					}
397				case RUNETOKEN_CHAR, RUNETOKEN_UNKNOWN, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT, RUNETOKEN_NONESCAPING_QUOTE:
398					{
399						value = append(value, nextRune)
400					}
401				case RUNETOKEN_SPACE:
402					{
403						if nextRune == '\n' {
404							state = STATE_START
405							break SCAN
406						} else {
407							value = append(value, nextRune)
408						}
409					}
410				default:
411					{
412						return nil, errors.New(fmt.Sprintf("Uknown rune: %v", nextRune))
413					}
414				}
415			}
416		default:
417			{
418				panic(fmt.Sprintf("Unexpected state: %v", state))
419			}
420		}
421	}
422	token := &Token{
423		tokenType: tokenType,
424		value:     string(value)}
425	return token, err
426}
427
428/*
429Return the next token in the stream, and an error value. If there are no more
430tokens available, the error value will be io.EOF.
431*/
432func (t *Tokenizer) NextToken() (*Token, error) {
433	return t.scanStream()
434}
435
436/*
437Split a string in to a slice of strings, based upon shell-style rules for
438quoting, escaping, and spaces.
439*/
440func Split(s string) ([]string, error) {
441	l, err := NewLexer(strings.NewReader(s))
442	if err != nil {
443		return nil, err
444	}
445	subStrings := []string{}
446	for {
447		word, err := l.NextWord()
448		if err != nil {
449			if err == io.EOF {
450				return subStrings, nil
451			}
452			return subStrings, err
453		}
454		subStrings = append(subStrings, word)
455	}
456	return subStrings, nil
457}
458