// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)
// tokenNames maps tokenType values to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and right (\u005d) need escaping themselves
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:            `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
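	// For example, the TokenPercentage production `{num}%` expands to
	// `(?:[0-9]*\.[0-9]+|[0-9]+)%` and is then compiled below, anchored to
	// the start of the remaining input, as `^(?:(?:[0-9]*\.[0-9]+|[0-9]+)%)`.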
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
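//
// A typical scanning loop looks like this (illustrative):
//
//	s := New("a { color: red }")
//	for {
//		t := s.Next()
//		if t.Type == TokenEOF || t.Type == TokenError {
//			break
//		}
//		// use t.Type and t.Value
//	}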
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common to not have a quick check.
		// We'll test if this is a Char; if it is followed by a number it is a
		// dimension/percentage/number, and this will be matched later.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}

		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			} else {
				s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
				return s.err
			}
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
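//
// For example, consuming "foo\nbar" advances row by 1, sets col to 4 (the
// count includes the "\n", so col is 1-based and points just past "bar"),
// and advances pos by the 7 bytes of the consumed text.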
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}