1// Copyright 2012 The Gorilla Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package scanner 6 7import ( 8 "fmt" 9 "regexp" 10 "strings" 11 "unicode" 12 "unicode/utf8" 13) 14 15// tokenType identifies the type of lexical tokens. 16type tokenType int 17 18// String returns a string representation of the token type. 19func (t tokenType) String() string { 20 return tokenNames[t] 21} 22 23// Token represents a token and the corresponding string. 24type Token struct { 25 Type tokenType 26 Value string 27 Line int 28 Column int 29} 30 31// String returns a string representation of the token. 32func (t *Token) String() string { 33 if len(t.Value) > 10 { 34 return fmt.Sprintf("%s (line: %d, column: %d): %.10q...", 35 t.Type, t.Line, t.Column, t.Value) 36 } 37 return fmt.Sprintf("%s (line: %d, column: %d): %q", 38 t.Type, t.Line, t.Column, t.Value) 39} 40 41// All tokens ----------------------------------------------------------------- 42 43// The complete list of tokens in CSS3. 44const ( 45 // Scanner flags. 46 TokenError tokenType = iota 47 TokenEOF 48 // From now on, only tokens from the CSS specification. 49 TokenIdent 50 TokenAtKeyword 51 TokenString 52 TokenHash 53 TokenNumber 54 TokenPercentage 55 TokenDimension 56 TokenURI 57 TokenUnicodeRange 58 TokenCDO 59 TokenCDC 60 TokenS 61 TokenComment 62 TokenFunction 63 TokenIncludes 64 TokenDashMatch 65 TokenPrefixMatch 66 TokenSuffixMatch 67 TokenSubstringMatch 68 TokenChar 69 TokenBOM 70) 71 72// tokenNames maps tokenType's to their names. Used for conversion to string. 73var tokenNames = map[tokenType]string{ 74 TokenError: "error", 75 TokenEOF: "EOF", 76 TokenIdent: "IDENT", 77 TokenAtKeyword: "ATKEYWORD", 78 TokenString: "STRING", 79 TokenHash: "HASH", 80 TokenNumber: "NUMBER", 81 TokenPercentage: "PERCENTAGE", 82 TokenDimension: "DIMENSION", 83 TokenURI: "URI", 84 TokenUnicodeRange: "UNICODE-RANGE", 85 TokenCDO: "CDO", 86 TokenCDC: "CDC", 87 TokenS: "S", 88 TokenComment: "COMMENT", 89 TokenFunction: "FUNCTION", 90 TokenIncludes: "INCLUDES", 91 TokenDashMatch: "DASHMATCH", 92 TokenPrefixMatch: "PREFIXMATCH", 93 TokenSuffixMatch: "SUFFIXMATCH", 94 TokenSubstringMatch: "SUBSTRINGMATCH", 95 TokenChar: "CHAR", 96 TokenBOM: "BOM", 97} 98 99// Macros and productions ----------------------------------------------------- 100// http://www.w3.org/TR/css3-syntax/#tokenization 101 102var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`) 103 104// macros maps macro names to patterns to be expanded. 105var macros = map[string]string{ 106 // must be escaped: `\.+*?()|[]{}^$` 107 "ident": `-?{nmstart}{nmchar}*`, 108 "name": `{nmchar}+`, 109 "nmstart": `[a-zA-Z_]|{nonascii}|{escape}`, 110 "nonascii": "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", 111 "unicode": `\\[0-9a-fA-F]{1,6}{wc}?`, 112 "escape": "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", 113 "nmchar": `[a-zA-Z0-9_-]|{nonascii}|{escape}`, 114 "num": `[0-9]*\.[0-9]+|[0-9]+`, 115 "string": `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`, 116 "stringchar": `{urlchar}|[ ]|\\{nl}`, 117 "urlchar": "[\u0009\u0021\u0023-\u0026\u0027-\u007E]|{nonascii}|{escape}", 118 "nl": `[\n\r\f]|\r\n`, 119 "w": `{wc}*`, 120 "wc": `[\t\n\f\r ]`, 121} 122 123// productions maps the list of tokens to patterns to be expanded. 124var productions = map[tokenType]string{ 125 // Unused regexps (matched using other methods) are commented out. 126 TokenIdent: `{ident}`, 127 TokenAtKeyword: `@{ident}`, 128 TokenString: `{string}`, 129 TokenHash: `#{name}`, 130 TokenNumber: `{num}`, 131 TokenPercentage: `{num}%`, 132 TokenDimension: `{num}{ident}`, 133 TokenURI: `url\({w}(?:{string}|{urlchar}*){w}\)`, 134 TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`, 135 //TokenCDO: `<!--`, 136 TokenCDC: `-->`, 137 TokenS: `{wc}+`, 138 TokenComment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, 139 TokenFunction: `{ident}\(`, 140 //TokenIncludes: `~=`, 141 //TokenDashMatch: `\|=`, 142 //TokenPrefixMatch: `\^=`, 143 //TokenSuffixMatch: `\$=`, 144 //TokenSubstringMatch: `\*=`, 145 //TokenChar: `[^"']`, 146 //TokenBOM: "\uFEFF", 147} 148 149// matchers maps the list of tokens to compiled regular expressions. 150// 151// The map is filled on init() using the macros and productions defined in 152// the CSS specification. 153var matchers = map[tokenType]*regexp.Regexp{} 154 155// matchOrder is the order to test regexps when first-char shortcuts 156// can't be used. 157var matchOrder = []tokenType{ 158 TokenURI, 159 TokenFunction, 160 TokenUnicodeRange, 161 TokenIdent, 162 TokenDimension, 163 TokenPercentage, 164 TokenNumber, 165 TokenCDC, 166} 167 168func init() { 169 // replace macros and compile regexps for productions. 170 replaceMacro := func(s string) string { 171 return "(?:" + macros[s[1:len(s)-1]] + ")" 172 } 173 for t, s := range productions { 174 for macroRegexp.MatchString(s) { 175 s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) 176 } 177 matchers[t] = regexp.MustCompile("^(?:" + s + ")") 178 } 179} 180 181// Scanner -------------------------------------------------------------------- 182 183// New returns a new CSS scanner for the given input. 184func New(input string) *Scanner { 185 // Normalize newlines. 186 input = strings.Replace(input, "\r\n", "\n", -1) 187 return &Scanner{ 188 input: input, 189 row: 1, 190 col: 1, 191 } 192} 193 194// Scanner scans an input and emits tokens following the CSS3 specification. 195type Scanner struct { 196 input string 197 pos int 198 row int 199 col int 200 err *Token 201} 202 203// Next returns the next token from the input. 204// 205// At the end of the input the token type is TokenEOF. 206// 207// If the input can't be tokenized the token type is TokenError. This occurs 208// in case of unclosed quotation marks or comments. 209func (s *Scanner) Next() *Token { 210 if s.err != nil { 211 return s.err 212 } 213 if s.pos >= len(s.input) { 214 s.err = &Token{TokenEOF, "", s.row, s.col} 215 return s.err 216 } 217 if s.pos == 0 { 218 // Test BOM only once, at the beginning of the file. 219 if strings.HasPrefix(s.input, "\uFEFF") { 220 return s.emitSimple(TokenBOM, "\uFEFF") 221 } 222 } 223 // There's a lot we can guess based on the first byte so we'll take a 224 // shortcut before testing multiple regexps. 225 input := s.input[s.pos:] 226 switch input[0] { 227 case '\t', '\n', '\f', '\r', ' ': 228 // Whitespace. 229 return s.emitToken(TokenS, matchers[TokenS].FindString(input)) 230 case '.': 231 // Dot is too common to not have a quick check. 232 // We'll test if this is a Char; if it is followed by a number it is a 233 // dimension/percentage/number, and this will be matched later. 234 if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { 235 return s.emitSimple(TokenChar, ".") 236 } 237 case '#': 238 // Another common one: Hash or Char. 239 if match := matchers[TokenHash].FindString(input); match != "" { 240 return s.emitToken(TokenHash, match) 241 } 242 return s.emitSimple(TokenChar, "#") 243 case '@': 244 // Another common one: AtKeyword or Char. 245 if match := matchers[TokenAtKeyword].FindString(input); match != "" { 246 return s.emitSimple(TokenAtKeyword, match) 247 } 248 return s.emitSimple(TokenChar, "@") 249 case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': 250 // More common chars. 251 return s.emitSimple(TokenChar, string(input[0])) 252 case '"', '\'': 253 // String or error. 254 match := matchers[TokenString].FindString(input) 255 if match != "" { 256 return s.emitToken(TokenString, match) 257 } else { 258 s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} 259 return s.err 260 } 261 case '/': 262 // Comment, error or Char. 263 if len(input) > 1 && input[1] == '*' { 264 match := matchers[TokenComment].FindString(input) 265 if match != "" { 266 return s.emitToken(TokenComment, match) 267 } else { 268 s.err = &Token{TokenError, "unclosed comment", s.row, s.col} 269 return s.err 270 } 271 } 272 return s.emitSimple(TokenChar, "/") 273 case '~': 274 // Includes or Char. 275 return s.emitPrefixOrChar(TokenIncludes, "~=") 276 case '|': 277 // DashMatch or Char. 278 return s.emitPrefixOrChar(TokenDashMatch, "|=") 279 case '^': 280 // PrefixMatch or Char. 281 return s.emitPrefixOrChar(TokenPrefixMatch, "^=") 282 case '$': 283 // SuffixMatch or Char. 284 return s.emitPrefixOrChar(TokenSuffixMatch, "$=") 285 case '*': 286 // SubstringMatch or Char. 287 return s.emitPrefixOrChar(TokenSubstringMatch, "*=") 288 case '<': 289 // CDO or Char. 290 return s.emitPrefixOrChar(TokenCDO, "<!--") 291 } 292 // Test all regexps, in order. 293 for _, token := range matchOrder { 294 if match := matchers[token].FindString(input); match != "" { 295 return s.emitToken(token, match) 296 } 297 } 298 // We already handled unclosed quotation marks and comments, 299 // so this can only be a Char. 300 r, width := utf8.DecodeRuneInString(input) 301 token := &Token{TokenChar, string(r), s.row, s.col} 302 s.col += width 303 s.pos += width 304 return token 305} 306 307// updatePosition updates input coordinates based on the consumed text. 308func (s *Scanner) updatePosition(text string) { 309 width := utf8.RuneCountInString(text) 310 lines := strings.Count(text, "\n") 311 s.row += lines 312 if lines == 0 { 313 s.col += width 314 } else { 315 s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):]) 316 } 317 s.pos += len(text) // while col is a rune index, pos is a byte index 318} 319 320// emitToken returns a Token for the string v and updates the scanner position. 321func (s *Scanner) emitToken(t tokenType, v string) *Token { 322 token := &Token{t, v, s.row, s.col} 323 s.updatePosition(v) 324 return token 325} 326 327// emitSimple returns a Token for the string v and updates the scanner 328// position in a simplified manner. 329// 330// The string is known to have only ASCII characters and to not have a newline. 331func (s *Scanner) emitSimple(t tokenType, v string) *Token { 332 token := &Token{t, v, s.row, s.col} 333 s.col += len(v) 334 s.pos += len(v) 335 return token 336} 337 338// emitPrefixOrChar returns a Token for type t if the current position 339// matches the given prefix. Otherwise it returns a Char token using the 340// first character from the prefix. 341// 342// The prefix is known to have only ASCII characters and to not have a newline. 343func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token { 344 if strings.HasPrefix(s.input[s.pos:], prefix) { 345 return s.emitSimple(t, prefix) 346 } 347 return s.emitSimple(TokenChar, string(prefix[0])) 348} 349