// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}
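
// For example, a token whose value is longer than ten characters prints
// truncated (illustrative output):
//
//	IDENT (line: 3, column: 5): "no-repeat-"...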

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)

// tokenNames maps tokenTypes to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"urlchar":    "[\u0009\u0021\u0023-\u0026\u0027-\u007E]|{nonascii}|{escape}",
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,
}
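
// Macros may reference other macros ({escape} expands {unicode}, for
// instance), so init below re-applies the replacement until no {name}
// references remain.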

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:            `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}
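
// Longer productions are tested first: "url(x)" must scan as a single
// URI rather than a FUNCTION, and "12px" as a DIMENSION rather than a
// NUMBER followed by an IDENT.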

func init() {
	// Replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
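
// For example, the TokenPercentage production `{num}%` expands to
// `(?:[0-9]*\.[0-9]+|[0-9]+)%` and is compiled anchored to the start of
// the remaining input as `^(?:(?:[0-9]*\.[0-9]+|[0-9]+)%)`.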

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}
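
// A minimal usage sketch (assuming the package is imported as "scanner"
// and fmt is available):
//
//	s := scanner.New("a { color: #fff }")
//	for t := s.Next(); t.Type != scanner.TokenEOF; t = s.Next() {
//		if t.Type == scanner.TokenError {
//			break // unclosed string or comment
//		}
//		fmt.Println(t)
//	}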

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in case of unclosed quotation marks or comments.
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			// The BOM is a multibyte rune, so use the full-position path
			// rather than emitSimple, which assumes ASCII.
			return s.emitToken(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte, so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common to not have a quick check.
		// We'll test if this is a Char; if it is followed by a number it is a
		// dimension/percentage/number, and this will be matched later.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			// The keyword may contain non-ASCII characters or escapes, so
			// emitToken (not emitSimple) must update the position.
			return s.emitToken(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		if match := matchers[TokenString].FindString(input); match != "" {
			return s.emitToken(TokenString, match)
		}
		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			if match := matchers[TokenComment].FindString(input); match != "" {
				return s.emitToken(TokenComment, match)
			}
			s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
			return s.err
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}
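
// For example, calling Next repeatedly on the input "a:hover{}" yields:
//
//	IDENT "a", CHAR ":", IDENT "hover", CHAR "{", CHAR "}", EOF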

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}
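
// For example, after consuming "ab\ncd" the row advances by one and col
// becomes 3: the slice "\ncd" counts three runes, and including the
// newline offsets the 1-indexed column past the two runes "cd".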

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}
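
// For example, at input "~=" the scanner emits INCLUDES "~=", while at
// "~x" it emits CHAR "~".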