// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package template

import (
	"bytes"
	"strings"
)

// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
//
// The array is indexed by state. States that are scanned the same way share
// one function (for example, every CSS string and URL state uses tCSSStr).
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateSrcset:      tURL,
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited,
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt,
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr,
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}

// commentStart and commentEnd are the delimiters of an HTML comment.
var commentStart = []byte("<!--")
var commentEnd = []byte("-->")

// tText is the context transition function for the text state.
//
// It scans s for the next '<'. If that '<' opens a comment, it returns
// stateHTMLCmt and consumes through "<!--". If it starts a tag name
// (optionally preceded by '/'), it returns stateTag and consumes through the
// name; for end tags the element is always elementNone. If no comment or tag
// is found, it consumes all of s and returns c unchanged.
func tText(c context, s []byte) (context, int) {
	// k is the offset at which the search for the next '<' resumes.
	k := 0
	for {
		// IndexByte yields -1 when there is no '<', which makes i < k.
		i := k + bytes.IndexByte(s[k:], '<')
		if i < k || i+1 == len(s) {
			// No '<' remains, or '<' is the final byte of the input.
			return c, len(s)
		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
			return context{state: stateHTMLCmt}, i + 4
		}
		// Step past the '<'.
		i++
		end := false
		if s[i] == '/' {
			if i+1 == len(s) {
				// "</" ends the input; there is no name to inspect.
				return c, len(s)
			}
			end, i = true, i+1
		}
		j, e := eatTagName(s, i)
		if j != i {
			if end {
				// End tags never carry a special element type.
				e = elementNone
			}
			// We've found an HTML tag.
			return context{state: stateTag, element: e}, j
		}
		// No tag name followed the '<' (or "</"); keep scanning after it.
		k = j
	}
}

// elementContentType maps an element to the state that tTag enters once the
// tag's closing '>' has been consumed.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}

// tTag is the context transition function for the tag state.
//
// After skipping white space it handles one of:
//   - end of input: c is returned unchanged and all of s is consumed;
//   - '>': the tag is closed and the state becomes the content state for
//     c.element, consuming through the '>';
//   - an attribute name: the state becomes stateAttrName when the name runs
//     to the end of s, and stateAfterName otherwise, with attr set to the
//     kind of attribute; the name is consumed;
//   - anything else: stateError, consuming all of s.
func tTag(c context, s []byte) (context, int) {
	// Find the attribute name.
	i := eatWhiteSpace(s, 0)
	if i == len(s) {
		return c, len(s)
	}
	if s[i] == '>' {
		return context{
			state:   elementContentType[c.element],
			element: c.element,
		}, i + 1
	}
	j, err := eatAttrName(s, i)
	if err != nil {
		return context{state: stateError, err: err}, len(s)
	}
	state, attr := stateTag, attrNone
	if i == j {
		// eatAttrName stopped immediately, so s[i] cannot start a name
		// (white space and '>' were ruled out above, which leaves '=').
		return context{
			state: stateError,
			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
		}, len(s)
	}

	// Attribute names are classified case-insensitively.
	attrName := strings.ToLower(string(s[i:j]))
	if c.element == elementScript && attrName == "type" {
		attr = attrScriptType
	} else {
		switch attrType(attrName) {
		case contentTypeURL:
			attr = attrURL
		case contentTypeCSS:
			attr = attrStyle
		case contentTypeJS:
			attr = attrScript
		case contentTypeSrcset:
			attr = attrSrcset
		}
	}

	if j == len(s) {
		// The name reaches the end of the input; tAttrName picks up from
		// the start of the next input.
		state = stateAttrName
	} else {
		state = stateAfterName
	}
	return context{state: state, element: c.element, attr: attr}, j
}

// tAttrName is the context transition function for stateAttrName.
//
// It consumes attribute-name bytes from the front of s. If the name ends
// before the end of s, the state becomes stateAfterName; otherwise c is
// returned unchanged. A malformed name yields stateError and consumes all
// of s.
func tAttrName(c context, s []byte) (context, int) {
	i, err := eatAttrName(s, 0)
	if err != nil {
		return context{state: stateError, err: err}, len(s)
	} else if i != len(s) {
		c.state = stateAfterName
	}
	return c, i
}

// tAfterName is the context transition function for stateAfterName.
//
// It skips white space and then looks for '='. If found, the state becomes
// stateBeforeValue and the '=' is consumed. Any other byte returns the state
// to stateTag, consuming only the white space. If s is entirely white space,
// c is returned unchanged.
func tAfterName(c context, s []byte) (context, int) {
	// Look for the start of the value.
	i := eatWhiteSpace(s, 0)
	if i == len(s) {
		return c, len(s)
	} else if s[i] != '=' {
		// Occurs due to tag ending '>', and valueless attribute.
		c.state = stateTag
		return c, i
	}
	c.state = stateBeforeValue
	// Consume the "=".
	return c, i + 1
}

// attrStartStates maps an attribute kind to the state in which tBeforeValue
// starts processing that attribute's value.
var attrStartStates = [...]state{
	attrNone:       stateAttr,
	attrScript:     stateJS,
	attrScriptType: stateAttr,
	attrStyle:      stateCSS,
	attrURL:        stateURL,
	attrSrcset:     stateSrcset,
}

// tBeforeValue is the context transition function for stateBeforeValue.
//
// It skips white space, records how the value is delimited, and enters the
// start state for c.attr. An opening quote is consumed; the first byte of an
// unquoted value is not. If s is entirely white space, c is returned
// unchanged.
func tBeforeValue(c context, s []byte) (context, int) {
	i := eatWhiteSpace(s, 0)
	if i == len(s) {
		return c, len(s)
	}
	// Find the attribute delimiter.
	delim := delimSpaceOrTagEnd
	switch s[i] {
	case '\'':
		delim, i = delimSingleQuote, i+1
	case '"':
		delim, i = delimDoubleQuote, i+1
	}
	c.state, c.delim = attrStartStates[c.attr], delim
	return c, i
}

// tHTMLCmt is the context transition function for stateHTMLCmt.
//
// If s contains "-->", it returns the zero-value context and consumes through
// the terminator. Otherwise it consumes all of s and stays in the comment.
func tHTMLCmt(c context, s []byte) (context, int) {
	if i := bytes.Index(s, commentEnd); i != -1 {
		// NOTE(review): context{} is presumably the plain text state;
		// confirm against the declaration of the state constants.
		return context{}, i + 3
	}
	return c, len(s)
}

// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}

var (
	// specialTagEndPrefix is the byte sequence that opens every end tag.
	specialTagEndPrefix = []byte("</")
	// tagEndSeparators lists the bytes that may directly follow the tag
	// name for an end tag to be recognized by indexTagEnd.
	tagEndSeparators = []byte("> \t\n\f/")
)

// tSpecialTagEnd is the context transition function for raw text and RCDATA
// element states.
// It searches s for the end tag matching c.element. If one is found, it
// returns the zero-value context and consumes only the bytes before the
// "</", leaving the end tag itself unconsumed. Otherwise, or if c.element is
// elementNone, it consumes all of s and returns c unchanged.
func tSpecialTagEnd(c context, s []byte) (context, int) {
	if c.element != elementNone {
		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
			return context{}, i
		}
	}
	return c, len(s)
}

// indexTagEnd returns the index in s of the first "</" that is followed by
// tag (compared case-insensitively) and then by one of tagEndSeparators.
// It returns -1 if there is no such end tag. A tag name that runs to the very
// end of s, with no separator after it, does not count as a match.
func indexTagEnd(s []byte, tag []byte) int {
	// res counts the bytes already trimmed off the front of s, so that
	// res+i is always an index into the slice the caller passed in.
	res := 0
	plen := len(specialTagEndPrefix)
	for len(s) > 0 {
		// Try to find the tag end prefix first
		i := bytes.Index(s, specialTagEndPrefix)
		if i == -1 {
			return i
		}
		s = s[i+plen:]
		// Try to match the actual tag if there is still space for it
		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
			s = s[len(tag):]
			// Check the tag is followed by a proper separator
			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
				return res + i
			}
			// Not a real end tag; account for the name just skipped.
			res += len(tag)
		}
		res += i + plen
	}
	return -1
}

// tAttr is the context transition function for the attribute state.
// It consumes all of s and leaves the context unchanged.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}

// tURL is the context transition function for the URL state.
//
// It consumes all of s and only updates c.urlPart. If s contains '#' or '?',
// urlPart becomes urlPartQueryOrFrag. Otherwise, if urlPart is still
// urlPartNone and s contains any non-white-space byte, it becomes
// urlPartPreQuery.
func tURL(c context, s []byte) (context, int) {
	if bytes.ContainsAny(s, "#?") {
		c.urlPart = urlPartQueryOrFrag
	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
		// HTML5 uses "Valid URL potentially surrounded by spaces" for
		// attrs: https://www.w3.org/TR/html5/index.html#attributes-1
		c.urlPart = urlPartPreQuery
	}
	return c, len(s)
}

// tJS is the context transition function for the JS state.
//
// It stops at the first quote or slash in s. A quote enters the matching
// string state. A slash starts a line comment, a block comment or a regexp,
// or is taken as a division operator, depending on the next byte and on
// c.jsCtx. If c.jsCtx cannot settle whether a lone slash is a division or a
// regexp, the result is stateError. The bytes before the stop position are
// passed to nextJSCtx to keep c.jsCtx current.
func tJS(c context, s []byte) (context, int) {
	i := bytes.IndexAny(s, `"'/`)
	if i == -1 {
		// Entire input is non string, comment, regexp tokens.
		c.jsCtx = nextJSCtx(s, c.jsCtx)
		return c, len(s)
	}
	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
	switch s[i] {
	case '"':
		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
	case '\'':
		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
	case '/':
		// The comment openers are tested first, so "//" and "/*" are
		// never treated as a regexp or a division.
		switch {
		case i+1 < len(s) && s[i+1] == '/':
			c.state, i = stateJSLineCmt, i+1
		case i+1 < len(s) && s[i+1] == '*':
			c.state, i = stateJSBlockCmt, i+1
		case c.jsCtx == jsCtxRegexp:
			c.state = stateJSRegexp
		case c.jsCtx == jsCtxDivOp:
			// The slash is a division operator: stay in stateJS, and a
			// slash right after it would start a regexp.
			c.jsCtx = jsCtxRegexp
		default:
			return context{
				state: stateError,
				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
			}, len(s)
		}
	default:
		panic("unreachable")
	}
	// Consume through the byte (or the second byte of a comment opener)
	// that caused the transition.
	return c, i + 1
}

// tJSDelimited is the context transition function for the JS string and regexp
// states.
//
// It scans for the closing delimiter of the current state, skipping
// backslash-escaped bytes. In a regexp, a '/' inside a [...] charset does not
// close it. On the closing delimiter the state returns to stateJS with jsCtx
// set to jsCtxDivOp, consuming through the delimiter. If s ends first, all of
// s is consumed: with stateError when it ends in a lone backslash or inside a
// regexp charset, and with c unchanged otherwise.
func tJSDelimited(c context, s []byte) (context, int) {
	// specials holds the escape character plus every byte that needs
	// attention in the current state.
	specials := `\"`
	switch c.state {
	case stateJSSqStr:
		specials = `\'`
	case stateJSRegexp:
		specials = `\/[]`
	}

	k, inCharset := 0, false
	for {
		// IndexAny yields -1 when nothing special remains, so i < k.
		i := k + bytes.IndexAny(s[k:], specials)
		if i < k {
			break
		}
		switch s[i] {
		case '\\':
			// Skip the escaped byte so it is never read as a delimiter.
			i++
			if i == len(s) {
				return context{
					state: stateError,
					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
				}, len(s)
			}
		case '[':
			inCharset = true
		case ']':
			inCharset = false
		default:
			// end delimiter
			if !inCharset {
				c.state, c.jsCtx = stateJS, jsCtxDivOp
				return c, i + 1
			}
		}
		k = i + 1
	}

	if inCharset {
		// This can be fixed by making context richer if interpolation
		// into charsets is desired.
		return context{
			state: stateError,
			err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
		}, len(s)
	}

	return c, len(s)
}

// blockCommentEnd is the terminator shared by JS and CSS block comments.
var blockCommentEnd = []byte("*/")

// tBlockCmt is the context transition function for /*comment*/ states.
//
// If s contains "*/", the state returns to stateJS or stateCSS (whichever
// the comment was opened in) and the input is consumed through the
// terminator. Otherwise all of s is consumed and c is unchanged. It panics
// if c.state is not one of the two block comment states.
func tBlockCmt(c context, s []byte) (context, int) {
	i := bytes.Index(s, blockCommentEnd)
	if i == -1 {
		return c, len(s)
	}
	switch c.state {
	case stateJSBlockCmt:
		c.state = stateJS
	case stateCSSBlockCmt:
		c.state = stateCSS
	default:
		panic(c.state.String())
	}
	return c, i + 2
}

// tLineCmt is the context transition function for //comment states.
//
// If s contains a line terminator for the comment's language, the state
// returns to stateJS or stateCSS and the input is consumed up to, but not
// including, the terminator. Otherwise all of s is consumed and c is
// unchanged. It panics if c.state is not one of the two line comment states.
func tLineCmt(c context, s []byte) (context, int) {
	var lineTerminators string
	var endState state
	switch c.state {
	case stateJSLineCmt:
		lineTerminators, endState = "\n\r\u2028\u2029", stateJS
	case stateCSSLineCmt:
		lineTerminators, endState = "\n\f\r", stateCSS
		// Line comments are not part of any published CSS standard but
		// are supported by the 4 major browsers.
		// This defines line comments as
		//     LINECOMMENT ::= "//" [^\n\f\r]*
		// since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
		// newlines:
		//     nl ::= #xA | #xD #xA | #xD | #xC
	default:
		panic(c.state.String())
	}

	i := bytes.IndexAny(s, lineTerminators)
	if i == -1 {
		return c, len(s)
	}
	c.state = endState
	// Per section 7.4 of EcmaScript 5 : https://es5.github.com/#x7.4
	// "However, the LineTerminator at the end of the line is not
	// considered to be part of the single-line comment; it is
	// recognized separately by the lexical grammar and becomes part
	// of the stream of input elements for the syntactic grammar."
	return c, i
}

// tCSS is the context transition function for the CSS state.
//
// It scans s for the first construct that changes state: a quoted string, a
// "//" or "/*" comment opener, or a "(" preceded by the keyword "url". The
// input is consumed through the opener, and through the opening quote of a
// quoted url(...). If none is found, all of s is consumed and c is unchanged.
func tCSS(c context, s []byte) (context, int) {
	// CSS quoted strings are almost never used except for:
	// (1) URLs as in background: "/foo.png"
	// (2) Multiword font-names as in font-family: "Times New Roman"
	// (3) List separators in content values as in inline-lists:
	//    <style>
	//    ul.inlineList { list-style: none; padding:0 }
	//    ul.inlineList > li { display: inline }
	//    ul.inlineList > li:before { content: ", " }
	//    ul.inlineList > li:first-child:before { content: "" }
	//    </style>
	//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
	// (4) Attribute value selectors as in a[href="http://example.com/"]
	//
	// We conservatively treat all strings as URLs, but make some
	// allowances to avoid confusion.
	//
	// In (1), our conservative assumption is justified.
	// In (2), valid font names do not contain ':', '?', or '#', so our
	// conservative assumption is fine since we will never transition past
	// urlPartPreQuery.
	// In (3), our protocol heuristic should not be tripped, and there
	// should not be non-space content after a '?' or '#', so as long as
	// we only %-encode RFC 3986 reserved characters we are ok.
	// In (4), we should URL escape for URL attributes, and for others we
	// have the attribute name available if our conservative assumption
	// proves problematic for real code.

	// k is the offset at which the search for the next candidate resumes.
	k := 0
	for {
		// IndexAny yields -1 when no candidate remains, so i < k.
		i := k + bytes.IndexAny(s[k:], `("'/`)
		if i < k {
			return c, len(s)
		}
		switch s[i] {
		case '(':
			// Look for url to the left.
			p := bytes.TrimRight(s[:i], "\t\n\f\r ")
			if endsWithCSSKeyword(p, "url") {
				// j indexes the first non-white-space byte after the
				// '(', or equals len(s) if there is none.
				j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
				switch {
				case j != len(s) && s[j] == '"':
					c.state, j = stateCSSDqURL, j+1
				case j != len(s) && s[j] == '\'':
					c.state, j = stateCSSSqURL, j+1
				default:
					c.state = stateCSSURL
				}
				return c, j
			}
		case '/':
			// A lone '/' is ordinary content; only "//" and "/*" matter.
			if i+1 < len(s) {
				switch s[i+1] {
				case '/':
					c.state = stateCSSLineCmt
					return c, i + 2
				case '*':
					c.state = stateCSSBlockCmt
					return c, i + 2
				}
			}
		case '"':
			c.state = stateCSSDqStr
			return c, i + 1
		case '\'':
			c.state = stateCSSSqStr
			return c, i + 1
		}
		// The candidate changed nothing; continue after it.
		k = i + 1
	}
}

// tCSSStr is the context transition function for the CSS string and URL states.
//
// It scans for the byte that ends the current string or URL, skipping
// backslash escapes. On the terminator the state returns to stateCSS and the
// input is consumed through it. While scanning, the text seen so far is
// CSS-decoded and passed to tURL so that c.urlPart is kept up to date. A
// backslash as the last byte of s yields stateError. It panics if c.state is
// not one of the CSS string or URL states.
func tCSSStr(c context, s []byte) (context, int) {
	// endAndEsc holds the escape character plus every terminating byte
	// for the current state.
	var endAndEsc string
	switch c.state {
	case stateCSSDqStr, stateCSSDqURL:
		endAndEsc = `\"`
	case stateCSSSqStr, stateCSSSqURL:
		endAndEsc = `\'`
	case stateCSSURL:
		// Unquoted URLs end with a newline or close parenthesis.
		// The below includes the wc (whitespace character) and nl.
		endAndEsc = "\\\t\n\f\r )"
	default:
		panic(c.state.String())
	}

	k := 0
	for {
		// IndexAny yields -1 when no terminator or escape remains, so i < k.
		i := k + bytes.IndexAny(s[k:], endAndEsc)
		if i < k {
			// NOTE(review): s[k:] holds no backslash here, so decodeCSS
			// presumably returns it unchanged and k+nread equals len(s);
			// confirm against decodeCSS.
			c, nread := tURL(c, decodeCSS(s[k:]))
			return c, k + nread
		}
		if s[i] == '\\' {
			// Step onto the escaped byte so it is not read as a terminator.
			i++
			if i == len(s) {
				return context{
					state: stateError,
					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
				}, len(s)
			}
		} else {
			c.state = stateCSS
			return c, i + 1
		}
		// Only the urlPart update matters here; tURL's byte count is
		// deliberately discarded.
		c, _ = tURL(c, decodeCSS(s[:i+1]))
		k = i + 1
	}
}

// tError is the context transition function for the error state.
// It consumes all of s and leaves the context unchanged.
func tError(c context, s []byte) (context, int) {
	return c, len(s)
}

// eatAttrName returns the largest j such that s[i:j] is an attribute name.
522// It returns an error if s[i:] does not look like it begins with an 523// attribute name, such as encountering a quote mark without a preceding 524// equals sign. 525func eatAttrName(s []byte, i int) (int, *Error) { 526 for j := i; j < len(s); j++ { 527 switch s[j] { 528 case ' ', '\t', '\n', '\f', '\r', '=', '>': 529 return j, nil 530 case '\'', '"', '<': 531 // These result in a parse warning in HTML5 and are 532 // indicative of serious problems if seen in an attr 533 // name in a template. 534 return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s) 535 default: 536 // No-op. 537 } 538 } 539 return len(s), nil 540} 541 542var elementNameMap = map[string]element{ 543 "script": elementScript, 544 "style": elementStyle, 545 "textarea": elementTextarea, 546 "title": elementTitle, 547} 548 549// asciiAlpha reports whether c is an ASCII letter. 550func asciiAlpha(c byte) bool { 551 return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 552} 553 554// asciiAlphaNum reports whether c is an ASCII letter or digit. 555func asciiAlphaNum(c byte) bool { 556 return asciiAlpha(c) || '0' <= c && c <= '9' 557} 558 559// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type. 560func eatTagName(s []byte, i int) (int, element) { 561 if i == len(s) || !asciiAlpha(s[i]) { 562 return i, elementNone 563 } 564 j := i + 1 565 for j < len(s) { 566 x := s[j] 567 if asciiAlphaNum(x) { 568 j++ 569 continue 570 } 571 // Allow "x-y" or "x:y" but not "x-", "-y", or "x--y". 572 if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) { 573 j += 2 574 continue 575 } 576 break 577 } 578 return j, elementNameMap[strings.ToLower(string(s[i:j]))] 579} 580 581// eatWhiteSpace returns the largest j such that s[i:j] is white space. 582func eatWhiteSpace(s []byte, i int) int { 583 for j := i; j < len(s); j++ { 584 switch s[j] { 585 case ' ', '\t', '\n', '\f', '\r': 586 // No-op. 587 default: 588 return j 589 } 590 } 591 return len(s) 592} 593