1// Copyright 2017 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5package webpagereplay
6
7import (
8	"bytes"
9	"compress/flate"
10	"compress/gzip"
11	"crypto/sha256"
12	"encoding/base64"
13	"encoding/json"
14	"errors"
15	"fmt"
16	"io"
17	"io/ioutil"
18	"log"
19	"net/http"
20	"net/url"
21	"regexp"
22	"strconv"
23	"strings"
24)
25
// readerWithError wraps an io.Reader and, when err is set, reports err in
// place of the io.EOF returned by the wrapped reader.
type readerWithError struct {
	r   io.Reader
	err error
}

// Read forwards to the wrapped reader. The byte count is always passed
// through unchanged; only an io.EOF result is swapped for the stored error,
// and only when a stored error exists.
func (r *readerWithError) Read(p []byte) (int, error) {
	n, readErr := r.r.Read(p)
	if readErr != io.EOF || r.err == nil {
		return n, readErr
	}
	return n, r.err
}
38
// cloneHeaders returns a copy of h whose value slices do not share storage
// with h. Keys mapped to a nil slice stay mapped to nil. The result is never
// nil, even when h is.
func cloneHeaders(h http.Header) http.Header {
	clone := make(http.Header, len(h))
	for name, values := range h {
		if values == nil {
			clone[name] = nil
			continue
		}
		dup := make([]string, len(values))
		copy(dup, values)
		clone[name] = dup
	}
	return clone
}
51
52// transformResponseBody applies a transformation function to the response
53// body.
54// tf is passed an uncompressed body and should return an uncompressed body.
55// The final response will be compressed if allowed by
56// resp.Header[ContentEncoding].
57func transformResponseBody(resp *http.Response, f func([]byte) []byte) error {
58	failEarly := func(body []byte, err error) error {
59		resp.Body = ioutil.NopCloser(&readerWithError{bytes.NewReader(body), err})
60		return err
61	}
62
63	body, err := ioutil.ReadAll(resp.Body)
64	if err != nil {
65		return failEarly(body, err)
66	}
67	resp.Body.Close()
68
69	var isCompressed bool
70	var ce string
71	if encodings, ok := resp.Header["Content-Encoding"]; ok && len(encodings) > 0 {
72		// TODO(xunjieli): Use the last CE for now. Support chained CEs.
73		ce = strings.ToLower(encodings[len(encodings)-1])
74		isCompressed = (ce != "" && ce != "identity")
75	}
76
77	// Decompress as needed.
78	if isCompressed {
79		body, err = decompressBody(ce, body)
80		if err != nil {
81			return failEarly(body, err)
82		}
83	}
84
85	// Transform and recompress as needed.
86	body = f(body)
87	if isCompressed {
88		body, _, err = CompressBody(ce, body)
89		if err != nil {
90			return failEarly(body, err)
91		}
92	}
93	resp.Body = ioutil.NopCloser(bytes.NewReader(body))
94
95	// ContentLength has changed, so update the outgoing headers accordingly.
96	if resp.ContentLength >= 0 {
97		resp.ContentLength = int64(len(body))
98		resp.Header.Set("Content-Length", strconv.Itoa(len(body)))
99	}
100	return nil
101}
102
103// Decompresses Response Body in place.
104func DecompressResponse(resp *http.Response) error {
105	ce := strings.ToLower(resp.Header.Get("Content-Encoding"))
106	isCompressed := (ce != "" && ce != "identity")
107	if isCompressed {
108		body, err := ioutil.ReadAll(resp.Body)
109		if err != nil {
110			return err
111		}
112		resp.Body.Close()
113		body, err = decompressBody(ce, body)
114		if err != nil {
115			return err
116		}
117		resp.Body = ioutil.NopCloser(bytes.NewReader(body))
118	}
119	return nil
120}
121
// decompressBody decodes compressed according to the Content-Encoding ce,
// which is matched case-insensitively. "gzip" and "deflate" are supported.
// For any other value the input is returned unchanged together with an
// "unknown compression" error.
func decompressBody(ce string, compressed []byte) ([]byte, error) {
	src := bytes.NewReader(compressed)

	var zr io.ReadCloser
	switch encoding := strings.ToLower(ce); encoding {
	case "gzip":
		gz, err := gzip.NewReader(src)
		if err != nil {
			return nil, err
		}
		zr = gz
	case "deflate":
		zr = flate.NewReader(src)
	// TODO(catapult:3742): Implement Brotli support.
	default:
		// Unknown compression type or uncompressed.
		return compressed, errors.New("unknown compression: " + ce)
	}
	defer zr.Close()

	return ioutil.ReadAll(zr)
}
143
// CompressBody compresses uncompressed with an encoding permitted by the
// Accept-Encoding value ae. gzip is chosen if ae mentions it, otherwise
// deflate if ae mentions that; the match is a case-insensitive substring
// test.
//
// It returns the compressed bytes and the name of the encoding used. When ae
// names neither encoding, the input is returned unchanged with the encoding
// "identity" and an "unknown compression" error.
func CompressBody(ae string, uncompressed []byte) ([]byte, string, error) {
	var (
		out      bytes.Buffer
		zw       io.WriteCloser
		encoding string
	)

	accepted := strings.ToLower(ae)
	if strings.Contains(accepted, "gzip") {
		zw, encoding = gzip.NewWriter(&out), "gzip"
	} else if strings.Contains(accepted, "deflate") {
		// NewWriter only fails for an invalid level, so the error is ignored.
		fw, _ := flate.NewWriter(&out, flate.DefaultCompression)
		zw, encoding = fw, "deflate"
	} else {
		// Unknown compression type or compression not allowed.
		return uncompressed, "identity", errors.New("unknown compression: " + accepted)
	}

	_, err := io.Copy(zw, bytes.NewReader(uncompressed))
	if err == nil {
		// Closing flushes the remaining compressed data into out.
		err = zw.Close()
	}
	return out.Bytes(), encoding, err
}
169
// getCSPScriptSrcDirectiveFromHeaders returns the directive of the
// Content-Security-Policy (CSP) header that governs scripts: the first
// directive starting with "script-src" or, if there is none, the last
// directive starting with "default-src". The directive is returned with
// surrounding whitespace trimmed. An empty string is returned when the header
// is absent or contains neither directive.
func getCSPScriptSrcDirectiveFromHeaders(header http.Header) string {
	policy := header.Get("Content-Security-Policy")
	fallback := ""
	if policy == "" {
		return fallback
	}

	for _, raw := range strings.Split(policy, ";") {
		switch directive := strings.TrimSpace(raw); {
		case strings.HasPrefix(directive, "script-src"):
			return directive
		case strings.HasPrefix(directive, "default-src"):
			// Remember it, but keep scanning for a script-src.
			fallback = directive
		}
	}
	return fallback
}
194
// getNonceTokenFromCSPHeaderScriptSrc returns the nonce value from a
// Content-Security-Policy (CSP) script source directive, or an empty string
// if the directive does not contain a nonce. For a directive containing the
// token 'nonce-abc', the result is "abc".
//
// The directive is split on any run of whitespace, so tokens separated by
// tabs or by several spaces are recognised as well as single-space separated
// ones. The first nonce token wins.
//
// For more background information on CSP and nonce, please refer to
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/
// Content-Security-Policy/script-src
// https://developers.google.com/web/fundamentals/security/csp/
func getNonceTokenFromCSPHeaderScriptSrc(cspScriptSrc string) string {
	const noncePrefix = "'nonce-"
	for _, token := range strings.Fields(cspScriptSrc) {
		if strings.HasPrefix(token, noncePrefix) {
			// Strip the leading 'nonce- and the closing quote.
			nonce := strings.TrimPrefix(token, noncePrefix)
			return strings.TrimSuffix(nonce, "'")
		}
	}
	return ""
}
217
218// transformCSPHeader transforms a Content-Security-Policy (CSP) header to
219// permit execution of inline scripts. Without this permission a page with a
220// restrictive CSP will not execute WPR
221// injected scripts.
222// For more background information on CSP, please refer to
223// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/
224// Content-Security-Policy/script-src
225// https://developers.google.com/web/fundamentals/security/csp/
226func transformCSPHeader(header http.Header, injectedScriptSha256 string) {
227	csp := header.Get("Content-Security-Policy")
228	if csp == "" {
229		return
230	}
231	// We prefer the 'script-src', but if it doesn't exist, we want to update a
232	// 'default-src' directive if it exists.
233	directives := strings.Split(csp, ";")
234	updateIndex := -1
235	for index, directive := range directives {
236		directive = strings.TrimSpace(directive)
237		if strings.HasPrefix(directive, "script-src") ||
238		   strings.HasPrefix(directive, "default-src") {
239			updateIndex = index
240			if strings.HasPrefix(directive, "script-src") {
241			  break
242			}
243		}
244	}
245	// No CSP policy to worry about updating.
246	if updateIndex < 0 {
247		return
248	}
249	updateDirective := directives[updateIndex]
250	if getNonceTokenFromCSPHeaderScriptSrc(updateDirective) != "" {
251		// If the CSP header's script-src contains a nonce, then
252		// transformCSPHeader does nothing.
253		// WPR will add the nonce token to any injected script to open the
254		// permission.
255		return
256	}
257	// Break the 'script-src' or 'default-src' directive into more tokens,
258	// and examine each token.
259	tokens := strings.Split(updateDirective, " ")
260	newDirective := ""
261	needsUnsafeInline := true
262
263	for _, token := range tokens {
264		token = strings.TrimSpace(token)
265		// All keyword tokens ['unsafe-inline', 'none', 'nonce-...', 'sha...'']
266		// are single-quote wrapped in the CSP headers.
267		if token == "'unsafe-inline'" {
268			needsUnsafeInline = false
269		}
270		// If the CSP header contains a hash, append the hash of the injected
271		// script.
272		// If a CSP specifies a hash, only inline scripts matching the hash
273		// may execute.
274		if strings.HasPrefix(token, "'sha256-") ||
275			strings.HasPrefix(token, "'sha384-") ||
276			strings.HasPrefix(token, "'sha512-") {
277			newDirective += "'sha256-" + injectedScriptSha256 + "' "
278			needsUnsafeInline = false
279		}
280		// Don't add back 'none' to our set, as if it is the only item it
281		// follows we will be adding 'unsafe-inline' below.
282		if token == "'none'" {
283			continue
284		}
285		newDirective += token + " "
286	}
287
288	if needsUnsafeInline {
289		newDirective += "'unsafe-inline'"
290	}
291
292	directives[updateIndex] = newDirective
293	newCsp := strings.Join(directives, ";")
294	header.Set("Content-Security-Policy", newCsp)
295}
296
// ResponseTransformer is an interface for transforming HTTP responses.
type ResponseTransformer interface {
	// Transform applies transformations to the response, for example by
	// updating resp.Header or wrapping resp.Body. The transformer may inspect
	// the request but should not modify the request. Transform has no return
	// value, so it cannot report failures to its caller.
	Transform(req *http.Request, resp *http.Response)
}
304
305// NewScriptInjector constructs a transformer that injects the given script
306// after the first <head>, <html>, or <!doctype html> tag. Statements in
307// script must be ';' terminated. The script is lightly minified before
308// injection.
309func NewScriptInjector(
310	script []byte, replacements map[string]string) ResponseTransformer {
311	// Remove C-style comments.
312	script = jsMultilineCommentRE.ReplaceAllLiteral(script, []byte(""))
313	script = jsSinglelineCommentRE.ReplaceAllLiteral(script, []byte(""))
314	for oldstr, newstr := range replacements {
315		script = bytes.Replace(script, []byte(oldstr), []byte(newstr), -1)
316	}
317	// Remove line breaks.
318	script = bytes.Replace(script, []byte("\r\n"), []byte(""), -1)
319	// Compute the sha256 hash of the script content.
320	// WPR may need to use the sha256 hash in a CSP header to grant the injected
321	// script execute permission.
322	sha256Bytes := sha256.Sum256(script)
323	sha256String := base64.URLEncoding.EncodeToString(sha256Bytes[:])
324	return &scriptInjector{script, sha256String}
325}
326
327// NewScriptInjectorFromFile creates a script injector from a script stored in
328// a file.
329func NewScriptInjectorFromFile(
330	filename string, replacements map[string]string) (
331	ResponseTransformer, error) {
332	script, err := ioutil.ReadFile(filename)
333	if err != nil {
334		return nil, err
335	}
336	return NewScriptInjector(script, replacements), nil
337}
338
// Patterns used to minify the injected script and to locate the injection
// point in an HTML body. All are case-insensitive.
var (
	// jsMultilineCommentRE matches a /* ... */ span, as short as possible. The
	// 's' flag lets '.' match newlines, so a span may cover several lines.
	jsMultilineCommentRE  = regexp.MustCompile(`(?is)/\*.*?\*/`)
	// jsSinglelineCommentRE matches from "//" to the end of the line. The match
	// is purely textual: a "//" inside a string literal (a URL, say) is matched
	// as well.
	jsSinglelineCommentRE = regexp.MustCompile(`(?i)//.*`)
	// doctypeRE, htmlRE and headRE are anchored at the start of the input and
	// match through the closing '>' of a <!doctype html>, <html...> or
	// <head...> tag respectively; the end offset of the match is where the
	// script is injected. "<head.*?>" and "<html.*?>" match any tag whose name
	// merely starts with "head" or "html" (for example <header>).
	// NOTE(review): the optional (<!--.*-->)? group looks intended to step over
	// a leading HTML comment - confirm.
	doctypeRE             = regexp.MustCompile(
		`(?is)^.*?(<!--.*-->)?.*?<!doctype html>`)
	htmlRE = regexp.MustCompile(
		`(?is)^.*?(<!--.*-->)?.*?<html.*?>`)
	headRE = regexp.MustCompile(
		`(?is)^.*?(<!--.*-->)?.*?<head.*?>`)
)
349
// scriptInjector is a ResponseTransformer that inserts a fixed script into
// HTML responses.
type scriptInjector struct {
	// script is the script text placed between the <script> tags.
	script []byte
	// sha256 is the base64 encoded SHA-256 digest of script, used when a CSP
	// header has to whitelist the injected script by hash.
	sha256 string
}

// getScriptWithNonce returns the complete <script>...</script> element to
// inject. A non-empty nonce is emitted as the element's nonce attribute; an
// empty nonce yields a plain <script> tag.
//
// Some responses specify a nonce in the script-src directive of their
// Content-Security-Policy. In that case only script elements carrying the
// same nonce may execute, so the injected script has to carry it too.
func (si *scriptInjector) getScriptWithNonce(nonce string) []byte {
	attr := ""
	if nonce != "" {
		attr = " nonce=\"" + nonce + "\""
	}
	return bytes.Join([][]byte{
		[]byte("<script" + attr + ">"),
		si.script,
		[]byte("</script>"),
	}, nil)
}
374
375func (si *scriptInjector) Transform(_ *http.Request, resp *http.Response) {
376	// Skip non-HTML non-200 responses.
377	if !strings.HasPrefix(
378		strings.ToLower(resp.Header.Get("Content-Type")), "text/html") {
379		return
380	}
381	if resp.StatusCode != http.StatusOK {
382		return
383	}
384
385	transformResponseBody(resp, func(body []byte) []byte {
386		// Don't inject if the script has already been injected.
387		if bytes.Contains(body, si.script) {
388			return body
389		}
390
391		// Find an appropriate place to inject the script, then inject.
392		idx := headRE.FindIndex(body)
393		if idx == nil {
394			idx = htmlRE.FindIndex(body)
395		}
396		if idx == nil {
397			idx = doctypeRE.FindIndex(body)
398		}
399		if idx == nil {
400			log.Printf(
401				"ScriptInjector(%s): no start tags found, skip injecting script",
402				resp.Request.URL)
403			return body
404		}
405		n := idx[1]
406
407		// If the response has a content-script-policy script src directive that
408		// specifies a nonce, add the nonce to the injected script.
409		// If a CSP specifies a nonce, only script blocks containing a matching
410		// nonce attribute may execute.
411		// To open permission for WPR-injected scripts while preserving permission
412		// for any page-src scripts containing the nonce, WPR must add the nonce
413		// token to injected scripts. Please see http://crbug.com/904534 for a
414		// detailed case study.
415		nonce := ""
416		if directive := getCSPScriptSrcDirectiveFromHeaders(resp.Header);
417			directive != "" {
418			nonce = getNonceTokenFromCSPHeaderScriptSrc(directive)
419		}
420
421		var buffer bytes.Buffer
422		buffer.Write(body[:n])
423		buffer.Write(si.getScriptWithNonce(nonce))
424		buffer.Write(body[n:])
425
426		// Having injected script, transform the response's
427		// content-security-policy directive to allow the injected script to
428		// execute.
429		transformCSPHeader(resp.Header, si.sha256)
430		return buffer.Bytes()
431	})
432}
433
434// NewRuleBasedTransformer creates a transformer that is controlled by a rules
435// file.
436// Rules are specified as a JSON-encoded array of TransformerRule objects.
437func NewRuleBasedTransformer(filename string) (ResponseTransformer, error) {
438	raw, err := ioutil.ReadFile(filename)
439	if err != nil {
440		return nil, err
441	}
442	var rules []*TransformerRule
443	if err := json.Unmarshal(raw, &rules); err != nil {
444		return nil, fmt.Errorf("json unmarshal failed: %v", err)
445	}
446	for _, r := range rules {
447		if err := r.compile(); err != nil {
448			return nil, err
449		}
450	}
451	return &ruleBasedTransformer{rules}, nil
452}
453
// TransformerRule is a single JSON-encoded rule. Each rule matches either a
// specific URL (via URL) or a regexp pattern (via URLPattern).
type TransformerRule struct {
	// How to match URLs: exactly one of URL and URLPattern must be specified.
	// URL is compared for exact equality with the request URL's string form;
	// URLPattern is a regular expression matched against that same string.
	URL        string
	URLPattern string

	// Rules to apply to these URLs. At least one of ExtraHeaders and Push must
	// be non-empty.
	// Inject these extra headers into the response. Values are appended to any
	// values already present under the same key.
	ExtraHeaders http.Header
	// Inject these HTTP/2 PUSH_PROMISE frames into the response.
	Push []PushPromiseRule

	// Hidden state generated by compile: the compiled form of URLPattern, set
	// only when URLPattern is non-empty.
	urlRE *regexp.Regexp
}
470
// PushPromiseRule is a rule that adds pushes into the response stream.
type PushPromiseRule struct {
	// URL to push. It must be a non-empty, absolute http or https URL.
	URL string

	// Header for the request being simulated by this push. If empty, a default
	// set of headers is created by cloning the current request's headers and
	// setting "referer" to the URL of the current (pusher) request.
	// NOTE(review): the code that applies this default is currently commented
	// out in ruleBasedTransformer.Transform.
	Headers http.Header
}
482
// ruleBasedTransformer is the ResponseTransformer built by
// NewRuleBasedTransformer. It holds the compiled rules in the order they
// appeared in the rules file.
type ruleBasedTransformer struct {
	rules []*TransformerRule
}
486
487func (r *TransformerRule) compile() error {
488	raw, _ := json.Marshal(r)
489	if r.URL == "" && r.URLPattern == "" {
490		return fmt.Errorf("rule missing URL or URLPattern: %q", raw)
491	}
492	if r.URL != "" && r.URLPattern != "" {
493		return fmt.Errorf("rule has both URL and URLPattern: %q", raw)
494	}
495	if r.URLPattern != "" {
496		re, err := regexp.Compile(r.URLPattern)
497		if err != nil {
498			return fmt.Errorf("error compiling URLPattern %s: %v", r.URLPattern, err)
499		}
500		r.urlRE = re
501	}
502	if len(r.ExtraHeaders) == 0 && len(r.Push) == 0 {
503		return fmt.Errorf("rule has no affect: %q", raw)
504	}
505	for _, p := range r.Push {
506		if p.URL == "" {
507			return fmt.Errorf("push has empty URL: %q", raw)
508		}
509		if u, err := url.Parse(p.URL); err != nil || !u.IsAbs() ||
510			(u.Scheme != "http" && u.Scheme != "https") {
511			return fmt.Errorf("push has bad URL %s: %v", p.URL, err)
512		}
513	}
514	return nil
515}
516
517func (r *TransformerRule) matches(req *http.Request) bool {
518	if r.URL != "" {
519		return r.URL == req.URL.String()
520	}
521	return r.urlRE.MatchString(req.URL.String())
522}
523
524func (r *TransformerRule) shortString() string {
525	pushes := ""
526	for _, p := range r.Push {
527		pushes += p.URL + " "
528	}
529	return fmt.Sprintf("ExtraHeaders: %d; Push: [%s]", len(r.ExtraHeaders),
530		pushes)
531}
532
533func (rt *ruleBasedTransformer) Transform(
534	req *http.Request, resp *http.Response) {
535	for _, r := range rt.rules {
536		if !r.matches(req) {
537			continue
538		}
539		log.Printf("Rule(%s): matched rule %v", req.URL, r.shortString())
540		for k, v := range r.ExtraHeaders {
541			resp.Header[k] = append(resp.Header[k], v...)
542		}
543		/*
544			if disabled {
545				for _, p := range r.Push {
546					h := p.Headers
547					if len(h) == 0 {
548						h = cloneHeaders(req.Header)
549						h.Set("Referer", req.URL.String())
550					}
551					rw.Push(p.URL, h)
552				}
553			}
554		*/
555	}
556}
557