1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package template
6
7import (
8	"bytes"
9	"strings"
10)
11
// transitionFunc is the array of context transition functions for text nodes,
// indexed by state.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
// Functions that serve several states (tJSDelimited, tCSSStr, tBlockCmt and
// tLineCmt) inspect c.state to tell those states apart.
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited,
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt,
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr,
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}
42
// commentStart and commentEnd are the byte sequences that open and close an
// HTML comment. tText searches for commentStart and tHTMLCmt for commentEnd.
var commentStart = []byte("<!--")
var commentEnd = []byte("-->")
45
46// tText is the context transition function for the text state.
47func tText(c context, s []byte) (context, int) {
48	k := 0
49	for {
50		i := k + bytes.IndexByte(s[k:], '<')
51		if i < k || i+1 == len(s) {
52			return c, len(s)
53		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
54			return context{state: stateHTMLCmt}, i + 4
55		}
56		i++
57		end := false
58		if s[i] == '/' {
59			if i+1 == len(s) {
60				return c, len(s)
61			}
62			end, i = true, i+1
63		}
64		j, e := eatTagName(s, i)
65		if j != i {
66			if end {
67				e = elementNone
68			}
69			// We've found an HTML tag.
70			return context{state: stateTag, element: e}, j
71		}
72		k = j
73	}
74}
75
// elementContentType maps an element type to the state that tTag enters once
// that element's tag has been closed by '>'.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}
83
84// tTag is the context transition function for the tag state.
85func tTag(c context, s []byte) (context, int) {
86	// Find the attribute name.
87	i := eatWhiteSpace(s, 0)
88	if i == len(s) {
89		return c, len(s)
90	}
91	if s[i] == '>' {
92		return context{
93			state:   elementContentType[c.element],
94			element: c.element,
95		}, i + 1
96	}
97	j, err := eatAttrName(s, i)
98	if err != nil {
99		return context{state: stateError, err: err}, len(s)
100	}
101	state, attr := stateTag, attrNone
102	if i == j {
103		return context{
104			state: stateError,
105			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
106		}, len(s)
107	}
108	switch attrType(string(s[i:j])) {
109	case contentTypeURL:
110		attr = attrURL
111	case contentTypeCSS:
112		attr = attrStyle
113	case contentTypeJS:
114		attr = attrScript
115	}
116	if j == len(s) {
117		state = stateAttrName
118	} else {
119		state = stateAfterName
120	}
121	return context{state: state, element: c.element, attr: attr}, j
122}
123
124// tAttrName is the context transition function for stateAttrName.
125func tAttrName(c context, s []byte) (context, int) {
126	i, err := eatAttrName(s, 0)
127	if err != nil {
128		return context{state: stateError, err: err}, len(s)
129	} else if i != len(s) {
130		c.state = stateAfterName
131	}
132	return c, i
133}
134
135// tAfterName is the context transition function for stateAfterName.
136func tAfterName(c context, s []byte) (context, int) {
137	// Look for the start of the value.
138	i := eatWhiteSpace(s, 0)
139	if i == len(s) {
140		return c, len(s)
141	} else if s[i] != '=' {
142		// Occurs due to tag ending '>', and valueless attribute.
143		c.state = stateTag
144		return c, i
145	}
146	c.state = stateBeforeValue
147	// Consume the "=".
148	return c, i + 1
149}
150
// attrStartStates maps an attribute type to the state in which the
// attribute's value begins. tBeforeValue indexes it with c.attr.
var attrStartStates = [...]state{
	attrNone:   stateAttr,
	attrScript: stateJS,
	attrStyle:  stateCSS,
	attrURL:    stateURL,
}
157
158// tBeforeValue is the context transition function for stateBeforeValue.
159func tBeforeValue(c context, s []byte) (context, int) {
160	i := eatWhiteSpace(s, 0)
161	if i == len(s) {
162		return c, len(s)
163	}
164	// Find the attribute delimiter.
165	delim := delimSpaceOrTagEnd
166	switch s[i] {
167	case '\'':
168		delim, i = delimSingleQuote, i+1
169	case '"':
170		delim, i = delimDoubleQuote, i+1
171	}
172	c.state, c.delim = attrStartStates[c.attr], delim
173	return c, i
174}
175
176// tHTMLCmt is the context transition function for stateHTMLCmt.
177func tHTMLCmt(c context, s []byte) (context, int) {
178	if i := bytes.Index(s, commentEnd); i != -1 {
179		return context{}, i + 3
180	}
181	return c, len(s)
182}
183
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
// Each marker is the bare element name; tSpecialTagEnd passes it to
// indexTagEnd, which looks for it right after "</".
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}
192
// specialTagEndPrefix is the "</" that starts an end tag, and
// tagEndSeparators holds the bytes ('>', space, tab, newline, form feed, '/')
// that must follow the element name for indexTagEnd to accept the end tag.
var (
	specialTagEndPrefix = []byte("</")
	tagEndSeparators    = []byte("> \t\n\f/")
)
197
198// tSpecialTagEnd is the context transition function for raw text and RCDATA
199// element states.
200func tSpecialTagEnd(c context, s []byte) (context, int) {
201	if c.element != elementNone {
202		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
203			return context{}, i
204		}
205	}
206	return c, len(s)
207}
208
209// indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
210func indexTagEnd(s []byte, tag []byte) int {
211	res := 0
212	plen := len(specialTagEndPrefix)
213	for len(s) > 0 {
214		// Try to find the tag end prefix first
215		i := bytes.Index(s, specialTagEndPrefix)
216		if i == -1 {
217			return i
218		}
219		s = s[i+plen:]
220		// Try to match the actual tag if there is still space for it
221		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
222			s = s[len(tag):]
223			// Check the tag is followed by a proper separator
224			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
225				return res + i
226			}
227			res += len(tag)
228		}
229		res += i + plen
230	}
231	return -1
232}
233
// tAttr is the context transition function for the attribute state.
// It leaves the context unchanged and consumes all of s.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}
238
239// tURL is the context transition function for the URL state.
240func tURL(c context, s []byte) (context, int) {
241	if bytes.IndexAny(s, "#?") >= 0 {
242		c.urlPart = urlPartQueryOrFrag
243	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
244		// HTML5 uses "Valid URL potentially surrounded by spaces" for
245		// attrs: http://www.w3.org/TR/html5/index.html#attributes-1
246		c.urlPart = urlPartPreQuery
247	}
248	return c, len(s)
249}
250
251// tJS is the context transition function for the JS state.
252func tJS(c context, s []byte) (context, int) {
253	i := bytes.IndexAny(s, `"'/`)
254	if i == -1 {
255		// Entire input is non string, comment, regexp tokens.
256		c.jsCtx = nextJSCtx(s, c.jsCtx)
257		return c, len(s)
258	}
259	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
260	switch s[i] {
261	case '"':
262		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
263	case '\'':
264		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
265	case '/':
266		switch {
267		case i+1 < len(s) && s[i+1] == '/':
268			c.state, i = stateJSLineCmt, i+1
269		case i+1 < len(s) && s[i+1] == '*':
270			c.state, i = stateJSBlockCmt, i+1
271		case c.jsCtx == jsCtxRegexp:
272			c.state = stateJSRegexp
273		case c.jsCtx == jsCtxDivOp:
274			c.jsCtx = jsCtxRegexp
275		default:
276			return context{
277				state: stateError,
278				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
279			}, len(s)
280		}
281	default:
282		panic("unreachable")
283	}
284	return c, i + 1
285}
286
287// tJSDelimited is the context transition function for the JS string and regexp
288// states.
289func tJSDelimited(c context, s []byte) (context, int) {
290	specials := `\"`
291	switch c.state {
292	case stateJSSqStr:
293		specials = `\'`
294	case stateJSRegexp:
295		specials = `\/[]`
296	}
297
298	k, inCharset := 0, false
299	for {
300		i := k + bytes.IndexAny(s[k:], specials)
301		if i < k {
302			break
303		}
304		switch s[i] {
305		case '\\':
306			i++
307			if i == len(s) {
308				return context{
309					state: stateError,
310					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
311				}, len(s)
312			}
313		case '[':
314			inCharset = true
315		case ']':
316			inCharset = false
317		default:
318			// end delimiter
319			if !inCharset {
320				c.state, c.jsCtx = stateJS, jsCtxDivOp
321				return c, i + 1
322			}
323		}
324		k = i + 1
325	}
326
327	if inCharset {
328		// This can be fixed by making context richer if interpolation
329		// into charsets is desired.
330		return context{
331			state: stateError,
332			err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
333		}, len(s)
334	}
335
336	return c, len(s)
337}
338
// blockCommentEnd is the terminator that tBlockCmt searches for in both JS
// and CSS block comments.
var blockCommentEnd = []byte("*/")
340
341// tBlockCmt is the context transition function for /*comment*/ states.
342func tBlockCmt(c context, s []byte) (context, int) {
343	i := bytes.Index(s, blockCommentEnd)
344	if i == -1 {
345		return c, len(s)
346	}
347	switch c.state {
348	case stateJSBlockCmt:
349		c.state = stateJS
350	case stateCSSBlockCmt:
351		c.state = stateCSS
352	default:
353		panic(c.state.String())
354	}
355	return c, i + 2
356}
357
358// tLineCmt is the context transition function for //comment states.
359func tLineCmt(c context, s []byte) (context, int) {
360	var lineTerminators string
361	var endState state
362	switch c.state {
363	case stateJSLineCmt:
364		lineTerminators, endState = "\n\r\u2028\u2029", stateJS
365	case stateCSSLineCmt:
366		lineTerminators, endState = "\n\f\r", stateCSS
367		// Line comments are not part of any published CSS standard but
368		// are supported by the 4 major browsers.
369		// This defines line comments as
370		//     LINECOMMENT ::= "//" [^\n\f\d]*
371		// since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
372		// newlines:
373		//     nl ::= #xA | #xD #xA | #xD | #xC
374	default:
375		panic(c.state.String())
376	}
377
378	i := bytes.IndexAny(s, lineTerminators)
379	if i == -1 {
380		return c, len(s)
381	}
382	c.state = endState
383	// Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4
384	// "However, the LineTerminator at the end of the line is not
385	// considered to be part of the single-line comment; it is
386	// recognized separately by the lexical grammar and becomes part
387	// of the stream of input elements for the syntactic grammar."
388	return c, i
389}
390
391// tCSS is the context transition function for the CSS state.
392func tCSS(c context, s []byte) (context, int) {
393	// CSS quoted strings are almost never used except for:
394	// (1) URLs as in background: "/foo.png"
395	// (2) Multiword font-names as in font-family: "Times New Roman"
396	// (3) List separators in content values as in inline-lists:
397	//    <style>
398	//    ul.inlineList { list-style: none; padding:0 }
399	//    ul.inlineList > li { display: inline }
400	//    ul.inlineList > li:before { content: ", " }
401	//    ul.inlineList > li:first-child:before { content: "" }
402	//    </style>
403	//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
404	// (4) Attribute value selectors as in a[href="http://example.com/"]
405	//
406	// We conservatively treat all strings as URLs, but make some
407	// allowances to avoid confusion.
408	//
409	// In (1), our conservative assumption is justified.
410	// In (2), valid font names do not contain ':', '?', or '#', so our
411	// conservative assumption is fine since we will never transition past
412	// urlPartPreQuery.
413	// In (3), our protocol heuristic should not be tripped, and there
414	// should not be non-space content after a '?' or '#', so as long as
415	// we only %-encode RFC 3986 reserved characters we are ok.
416	// In (4), we should URL escape for URL attributes, and for others we
417	// have the attribute name available if our conservative assumption
418	// proves problematic for real code.
419
420	k := 0
421	for {
422		i := k + bytes.IndexAny(s[k:], `("'/`)
423		if i < k {
424			return c, len(s)
425		}
426		switch s[i] {
427		case '(':
428			// Look for url to the left.
429			p := bytes.TrimRight(s[:i], "\t\n\f\r ")
430			if endsWithCSSKeyword(p, "url") {
431				j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
432				switch {
433				case j != len(s) && s[j] == '"':
434					c.state, j = stateCSSDqURL, j+1
435				case j != len(s) && s[j] == '\'':
436					c.state, j = stateCSSSqURL, j+1
437				default:
438					c.state = stateCSSURL
439				}
440				return c, j
441			}
442		case '/':
443			if i+1 < len(s) {
444				switch s[i+1] {
445				case '/':
446					c.state = stateCSSLineCmt
447					return c, i + 2
448				case '*':
449					c.state = stateCSSBlockCmt
450					return c, i + 2
451				}
452			}
453		case '"':
454			c.state = stateCSSDqStr
455			return c, i + 1
456		case '\'':
457			c.state = stateCSSSqStr
458			return c, i + 1
459		}
460		k = i + 1
461	}
462}
463
464// tCSSStr is the context transition function for the CSS string and URL states.
465func tCSSStr(c context, s []byte) (context, int) {
466	var endAndEsc string
467	switch c.state {
468	case stateCSSDqStr, stateCSSDqURL:
469		endAndEsc = `\"`
470	case stateCSSSqStr, stateCSSSqURL:
471		endAndEsc = `\'`
472	case stateCSSURL:
473		// Unquoted URLs end with a newline or close parenthesis.
474		// The below includes the wc (whitespace character) and nl.
475		endAndEsc = "\\\t\n\f\r )"
476	default:
477		panic(c.state.String())
478	}
479
480	k := 0
481	for {
482		i := k + bytes.IndexAny(s[k:], endAndEsc)
483		if i < k {
484			c, nread := tURL(c, decodeCSS(s[k:]))
485			return c, k + nread
486		}
487		if s[i] == '\\' {
488			i++
489			if i == len(s) {
490				return context{
491					state: stateError,
492					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
493				}, len(s)
494			}
495		} else {
496			c.state = stateCSS
497			return c, i + 1
498		}
499		c, _ = tURL(c, decodeCSS(s[:i+1]))
500		k = i + 1
501	}
502}
503
// tError is the context transition function for the error state.
// It leaves the context unchanged and consumes all of s.
func tError(c context, s []byte) (context, int) {
	return c, len(s)
}
508
509// eatAttrName returns the largest j such that s[i:j] is an attribute name.
510// It returns an error if s[i:] does not look like it begins with an
511// attribute name, such as encountering a quote mark without a preceding
512// equals sign.
513func eatAttrName(s []byte, i int) (int, *Error) {
514	for j := i; j < len(s); j++ {
515		switch s[j] {
516		case ' ', '\t', '\n', '\f', '\r', '=', '>':
517			return j, nil
518		case '\'', '"', '<':
519			// These result in a parse warning in HTML5 and are
520			// indicative of serious problems if seen in an attr
521			// name in a template.
522			return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
523		default:
524			// No-op.
525		}
526	}
527	return len(s), nil
528}
529
// elementNameMap maps lower-case tag names to their element types.
// eatTagName lower-cases a tag name before looking it up here; a name that
// is not listed yields the zero element value.
var elementNameMap = map[string]element{
	"script":   elementScript,
	"style":    elementStyle,
	"textarea": elementTextarea,
	"title":    elementTitle,
}
536
// asciiAlpha reports whether c is an ASCII letter, that is, in 'A'-'Z' or
// 'a'-'z'.
func asciiAlpha(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
541
542// asciiAlphaNum reports whether c is an ASCII letter or digit.
543func asciiAlphaNum(c byte) bool {
544	return asciiAlpha(c) || '0' <= c && c <= '9'
545}
546
547// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
548func eatTagName(s []byte, i int) (int, element) {
549	if i == len(s) || !asciiAlpha(s[i]) {
550		return i, elementNone
551	}
552	j := i + 1
553	for j < len(s) {
554		x := s[j]
555		if asciiAlphaNum(x) {
556			j++
557			continue
558		}
559		// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
560		if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
561			j += 2
562			continue
563		}
564		break
565	}
566	return j, elementNameMap[strings.ToLower(string(s[i:j]))]
567}
568
// eatWhiteSpace returns the largest j such that s[i:j] is white space.
// White space is space, tab, newline, form feed or carriage return. If only
// white space remains from i on, it returns len(s).
func eatWhiteSpace(s []byte, i int) int {
	for pos := i; pos < len(s); pos++ {
		if ch := s[pos]; ch != ' ' && ch != '\t' && ch != '\n' && ch != '\f' && ch != '\r' {
			return pos
		}
	}
	return len(s)
}
581