1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package template
6
7import (
8	"bytes"
9	"strings"
10)
11
// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
//
// The array is indexed by state. Several states share one function: the URL
// and srcset states both use tURL, the JS string and regexp states use
// tJSDelimited, the CSS string and URL states use tCSSStr, and the JS and CSS
// comment states use tBlockCmt and tLineCmt, which switch on the current
// state to decide which state to return to.
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateSrcset:      tURL,
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited,
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt,
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr,
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}
43
// commentStart and commentEnd are the byte sequences that open and close an
// HTML comment.
var (
	commentStart = []byte("<!--")
	commentEnd   = []byte("-->")
)
46
47// tText is the context transition function for the text state.
48func tText(c context, s []byte) (context, int) {
49	k := 0
50	for {
51		i := k + bytes.IndexByte(s[k:], '<')
52		if i < k || i+1 == len(s) {
53			return c, len(s)
54		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
55			return context{state: stateHTMLCmt}, i + 4
56		}
57		i++
58		end := false
59		if s[i] == '/' {
60			if i+1 == len(s) {
61				return c, len(s)
62			}
63			end, i = true, i+1
64		}
65		j, e := eatTagName(s, i)
66		if j != i {
67			if end {
68				e = elementNone
69			}
70			// We've found an HTML tag.
71			return context{state: stateTag, element: e}, j
72		}
73		k = j
74	}
75}
76
// elementContentType maps an element type to the state that tTag enters once
// the '>' closing that element's start tag has been consumed.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}
84
85// tTag is the context transition function for the tag state.
86func tTag(c context, s []byte) (context, int) {
87	// Find the attribute name.
88	i := eatWhiteSpace(s, 0)
89	if i == len(s) {
90		return c, len(s)
91	}
92	if s[i] == '>' {
93		return context{
94			state:   elementContentType[c.element],
95			element: c.element,
96		}, i + 1
97	}
98	j, err := eatAttrName(s, i)
99	if err != nil {
100		return context{state: stateError, err: err}, len(s)
101	}
102	state, attr := stateTag, attrNone
103	if i == j {
104		return context{
105			state: stateError,
106			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
107		}, len(s)
108	}
109
110	attrName := strings.ToLower(string(s[i:j]))
111	if c.element == elementScript && attrName == "type" {
112		attr = attrScriptType
113	} else {
114		switch attrType(attrName) {
115		case contentTypeURL:
116			attr = attrURL
117		case contentTypeCSS:
118			attr = attrStyle
119		case contentTypeJS:
120			attr = attrScript
121		case contentTypeSrcset:
122			attr = attrSrcset
123		}
124	}
125
126	if j == len(s) {
127		state = stateAttrName
128	} else {
129		state = stateAfterName
130	}
131	return context{state: state, element: c.element, attr: attr}, j
132}
133
134// tAttrName is the context transition function for stateAttrName.
135func tAttrName(c context, s []byte) (context, int) {
136	i, err := eatAttrName(s, 0)
137	if err != nil {
138		return context{state: stateError, err: err}, len(s)
139	} else if i != len(s) {
140		c.state = stateAfterName
141	}
142	return c, i
143}
144
145// tAfterName is the context transition function for stateAfterName.
146func tAfterName(c context, s []byte) (context, int) {
147	// Look for the start of the value.
148	i := eatWhiteSpace(s, 0)
149	if i == len(s) {
150		return c, len(s)
151	} else if s[i] != '=' {
152		// Occurs due to tag ending '>', and valueless attribute.
153		c.state = stateTag
154		return c, i
155	}
156	c.state = stateBeforeValue
157	// Consume the "=".
158	return c, i + 1
159}
160
// attrStartStates maps an attribute kind to the state in which its value is
// parsed. tBeforeValue indexes it with c.attr once the start of the value has
// been located.
var attrStartStates = [...]state{
	attrNone:       stateAttr,
	attrScript:     stateJS,
	attrScriptType: stateAttr,
	attrStyle:      stateCSS,
	attrURL:        stateURL,
	attrSrcset:     stateSrcset,
}
169
170// tBeforeValue is the context transition function for stateBeforeValue.
171func tBeforeValue(c context, s []byte) (context, int) {
172	i := eatWhiteSpace(s, 0)
173	if i == len(s) {
174		return c, len(s)
175	}
176	// Find the attribute delimiter.
177	delim := delimSpaceOrTagEnd
178	switch s[i] {
179	case '\'':
180		delim, i = delimSingleQuote, i+1
181	case '"':
182		delim, i = delimDoubleQuote, i+1
183	}
184	c.state, c.delim = attrStartStates[c.attr], delim
185	return c, i
186}
187
188// tHTMLCmt is the context transition function for stateHTMLCmt.
189func tHTMLCmt(c context, s []byte) (context, int) {
190	if i := bytes.Index(s, commentEnd); i != -1 {
191		return context{}, i + 3
192	}
193	return c, len(s)
194}
195
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
//
// Each entry is the bare tag name; tSpecialTagEnd passes it to indexTagEnd,
// which looks for it immediately after specialTagEndPrefix.
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}
204
// specialTagEndPrefix is the byte sequence that opens an end tag, and
// tagEndSeparators lists the bytes that must directly follow the tag name for
// indexTagEnd to accept a candidate as the end of a special tag body.
var (
	specialTagEndPrefix = []byte("</")
	tagEndSeparators    = []byte("> \t\n\f/")
)
209
210// tSpecialTagEnd is the context transition function for raw text and RCDATA
211// element states.
212func tSpecialTagEnd(c context, s []byte) (context, int) {
213	if c.element != elementNone {
214		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
215			return context{}, i
216		}
217	}
218	return c, len(s)
219}
220
221// indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
222func indexTagEnd(s []byte, tag []byte) int {
223	res := 0
224	plen := len(specialTagEndPrefix)
225	for len(s) > 0 {
226		// Try to find the tag end prefix first
227		i := bytes.Index(s, specialTagEndPrefix)
228		if i == -1 {
229			return i
230		}
231		s = s[i+plen:]
232		// Try to match the actual tag if there is still space for it
233		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
234			s = s[len(tag):]
235			// Check the tag is followed by a proper separator
236			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
237				return res + i
238			}
239			res += len(tag)
240		}
241		res += i + plen
242	}
243	return -1
244}
245
246// tAttr is the context transition function for the attribute state.
247func tAttr(c context, s []byte) (context, int) {
248	return c, len(s)
249}
250
251// tURL is the context transition function for the URL state.
252func tURL(c context, s []byte) (context, int) {
253	if bytes.ContainsAny(s, "#?") {
254		c.urlPart = urlPartQueryOrFrag
255	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
256		// HTML5 uses "Valid URL potentially surrounded by spaces" for
257		// attrs: https://www.w3.org/TR/html5/index.html#attributes-1
258		c.urlPart = urlPartPreQuery
259	}
260	return c, len(s)
261}
262
263// tJS is the context transition function for the JS state.
264func tJS(c context, s []byte) (context, int) {
265	i := bytes.IndexAny(s, `"'/`)
266	if i == -1 {
267		// Entire input is non string, comment, regexp tokens.
268		c.jsCtx = nextJSCtx(s, c.jsCtx)
269		return c, len(s)
270	}
271	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
272	switch s[i] {
273	case '"':
274		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
275	case '\'':
276		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
277	case '/':
278		switch {
279		case i+1 < len(s) && s[i+1] == '/':
280			c.state, i = stateJSLineCmt, i+1
281		case i+1 < len(s) && s[i+1] == '*':
282			c.state, i = stateJSBlockCmt, i+1
283		case c.jsCtx == jsCtxRegexp:
284			c.state = stateJSRegexp
285		case c.jsCtx == jsCtxDivOp:
286			c.jsCtx = jsCtxRegexp
287		default:
288			return context{
289				state: stateError,
290				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
291			}, len(s)
292		}
293	default:
294		panic("unreachable")
295	}
296	return c, i + 1
297}
298
299// tJSDelimited is the context transition function for the JS string and regexp
300// states.
301func tJSDelimited(c context, s []byte) (context, int) {
302	specials := `\"`
303	switch c.state {
304	case stateJSSqStr:
305		specials = `\'`
306	case stateJSRegexp:
307		specials = `\/[]`
308	}
309
310	k, inCharset := 0, false
311	for {
312		i := k + bytes.IndexAny(s[k:], specials)
313		if i < k {
314			break
315		}
316		switch s[i] {
317		case '\\':
318			i++
319			if i == len(s) {
320				return context{
321					state: stateError,
322					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
323				}, len(s)
324			}
325		case '[':
326			inCharset = true
327		case ']':
328			inCharset = false
329		default:
330			// end delimiter
331			if !inCharset {
332				c.state, c.jsCtx = stateJS, jsCtxDivOp
333				return c, i + 1
334			}
335		}
336		k = i + 1
337	}
338
339	if inCharset {
340		// This can be fixed by making context richer if interpolation
341		// into charsets is desired.
342		return context{
343			state: stateError,
344			err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
345		}, len(s)
346	}
347
348	return c, len(s)
349}
350
// blockCommentEnd is the byte sequence that terminates a /*comment*/; it is
// used by tBlockCmt for both the JS and CSS block comment states.
var blockCommentEnd = []byte("*/")
352
353// tBlockCmt is the context transition function for /*comment*/ states.
354func tBlockCmt(c context, s []byte) (context, int) {
355	i := bytes.Index(s, blockCommentEnd)
356	if i == -1 {
357		return c, len(s)
358	}
359	switch c.state {
360	case stateJSBlockCmt:
361		c.state = stateJS
362	case stateCSSBlockCmt:
363		c.state = stateCSS
364	default:
365		panic(c.state.String())
366	}
367	return c, i + 2
368}
369
370// tLineCmt is the context transition function for //comment states.
371func tLineCmt(c context, s []byte) (context, int) {
372	var lineTerminators string
373	var endState state
374	switch c.state {
375	case stateJSLineCmt:
376		lineTerminators, endState = "\n\r\u2028\u2029", stateJS
377	case stateCSSLineCmt:
378		lineTerminators, endState = "\n\f\r", stateCSS
379		// Line comments are not part of any published CSS standard but
380		// are supported by the 4 major browsers.
381		// This defines line comments as
382		//     LINECOMMENT ::= "//" [^\n\f\d]*
383		// since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
384		// newlines:
385		//     nl ::= #xA | #xD #xA | #xD | #xC
386	default:
387		panic(c.state.String())
388	}
389
390	i := bytes.IndexAny(s, lineTerminators)
391	if i == -1 {
392		return c, len(s)
393	}
394	c.state = endState
395	// Per section 7.4 of EcmaScript 5 : https://es5.github.com/#x7.4
396	// "However, the LineTerminator at the end of the line is not
397	// considered to be part of the single-line comment; it is
398	// recognized separately by the lexical grammar and becomes part
399	// of the stream of input elements for the syntactic grammar."
400	return c, i
401}
402
403// tCSS is the context transition function for the CSS state.
404func tCSS(c context, s []byte) (context, int) {
405	// CSS quoted strings are almost never used except for:
406	// (1) URLs as in background: "/foo.png"
407	// (2) Multiword font-names as in font-family: "Times New Roman"
408	// (3) List separators in content values as in inline-lists:
409	//    <style>
410	//    ul.inlineList { list-style: none; padding:0 }
411	//    ul.inlineList > li { display: inline }
412	//    ul.inlineList > li:before { content: ", " }
413	//    ul.inlineList > li:first-child:before { content: "" }
414	//    </style>
415	//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
416	// (4) Attribute value selectors as in a[href="http://example.com/"]
417	//
418	// We conservatively treat all strings as URLs, but make some
419	// allowances to avoid confusion.
420	//
421	// In (1), our conservative assumption is justified.
422	// In (2), valid font names do not contain ':', '?', or '#', so our
423	// conservative assumption is fine since we will never transition past
424	// urlPartPreQuery.
425	// In (3), our protocol heuristic should not be tripped, and there
426	// should not be non-space content after a '?' or '#', so as long as
427	// we only %-encode RFC 3986 reserved characters we are ok.
428	// In (4), we should URL escape for URL attributes, and for others we
429	// have the attribute name available if our conservative assumption
430	// proves problematic for real code.
431
432	k := 0
433	for {
434		i := k + bytes.IndexAny(s[k:], `("'/`)
435		if i < k {
436			return c, len(s)
437		}
438		switch s[i] {
439		case '(':
440			// Look for url to the left.
441			p := bytes.TrimRight(s[:i], "\t\n\f\r ")
442			if endsWithCSSKeyword(p, "url") {
443				j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
444				switch {
445				case j != len(s) && s[j] == '"':
446					c.state, j = stateCSSDqURL, j+1
447				case j != len(s) && s[j] == '\'':
448					c.state, j = stateCSSSqURL, j+1
449				default:
450					c.state = stateCSSURL
451				}
452				return c, j
453			}
454		case '/':
455			if i+1 < len(s) {
456				switch s[i+1] {
457				case '/':
458					c.state = stateCSSLineCmt
459					return c, i + 2
460				case '*':
461					c.state = stateCSSBlockCmt
462					return c, i + 2
463				}
464			}
465		case '"':
466			c.state = stateCSSDqStr
467			return c, i + 1
468		case '\'':
469			c.state = stateCSSSqStr
470			return c, i + 1
471		}
472		k = i + 1
473	}
474}
475
476// tCSSStr is the context transition function for the CSS string and URL states.
477func tCSSStr(c context, s []byte) (context, int) {
478	var endAndEsc string
479	switch c.state {
480	case stateCSSDqStr, stateCSSDqURL:
481		endAndEsc = `\"`
482	case stateCSSSqStr, stateCSSSqURL:
483		endAndEsc = `\'`
484	case stateCSSURL:
485		// Unquoted URLs end with a newline or close parenthesis.
486		// The below includes the wc (whitespace character) and nl.
487		endAndEsc = "\\\t\n\f\r )"
488	default:
489		panic(c.state.String())
490	}
491
492	k := 0
493	for {
494		i := k + bytes.IndexAny(s[k:], endAndEsc)
495		if i < k {
496			c, nread := tURL(c, decodeCSS(s[k:]))
497			return c, k + nread
498		}
499		if s[i] == '\\' {
500			i++
501			if i == len(s) {
502				return context{
503					state: stateError,
504					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
505				}, len(s)
506			}
507		} else {
508			c.state = stateCSS
509			return c, i + 1
510		}
511		c, _ = tURL(c, decodeCSS(s[:i+1]))
512		k = i + 1
513	}
514}
515
516// tError is the context transition function for the error state.
517func tError(c context, s []byte) (context, int) {
518	return c, len(s)
519}
520
521// eatAttrName returns the largest j such that s[i:j] is an attribute name.
522// It returns an error if s[i:] does not look like it begins with an
523// attribute name, such as encountering a quote mark without a preceding
524// equals sign.
525func eatAttrName(s []byte, i int) (int, *Error) {
526	for j := i; j < len(s); j++ {
527		switch s[j] {
528		case ' ', '\t', '\n', '\f', '\r', '=', '>':
529			return j, nil
530		case '\'', '"', '<':
531			// These result in a parse warning in HTML5 and are
532			// indicative of serious problems if seen in an attr
533			// name in a template.
534			return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
535		default:
536			// No-op.
537		}
538	}
539	return len(s), nil
540}
541
// elementNameMap maps lower-case tag names to the element types that need
// special handling. eatTagName looks up the lower-cased tag name here; names
// that are absent yield the zero element value.
var elementNameMap = map[string]element{
	"script":   elementScript,
	"style":    elementStyle,
	"textarea": elementTextarea,
	"title":    elementTitle,
}
548
// asciiAlpha reports whether c is an ASCII letter.
func asciiAlpha(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
553
554// asciiAlphaNum reports whether c is an ASCII letter or digit.
555func asciiAlphaNum(c byte) bool {
556	return asciiAlpha(c) || '0' <= c && c <= '9'
557}
558
559// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
560func eatTagName(s []byte, i int) (int, element) {
561	if i == len(s) || !asciiAlpha(s[i]) {
562		return i, elementNone
563	}
564	j := i + 1
565	for j < len(s) {
566		x := s[j]
567		if asciiAlphaNum(x) {
568			j++
569			continue
570		}
571		// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
572		if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
573			j += 2
574			continue
575		}
576		break
577	}
578	return j, elementNameMap[strings.ToLower(string(s[i:j]))]
579}
580
// eatWhiteSpace returns the largest j such that s[i:j] is white space.
// White space is space, tab, newline, form feed and carriage return.
func eatWhiteSpace(s []byte, i int) int {
	for j := i; j < len(s); j++ {
		if b := s[j]; b != ' ' && b != '\t' && b != '\n' && b != '\f' && b != '\r' {
			return j
		}
	}
	return len(s)
}
593