1/*
2Package purell offers URL normalization as described on the wikipedia page:
3http://en.wikipedia.org/wiki/URL_normalization
4*/
5package purell
6
7import (
8	"bytes"
9	"fmt"
10	"net/url"
11	"regexp"
12	"sort"
13	"strconv"
14	"strings"
15
16	"github.com/PuerkitoBio/urlesc"
17	"golang.org/x/net/idna"
18	"golang.org/x/text/unicode/norm"
19	"golang.org/x/text/width"
20)
21
// NormalizationFlags is a bit set of normalization flags that determines
// how a URL will be normalized. Flags may be combined with the bitwise OR
// operator; convenience sets (FlagsSafe, FlagsUsuallySafe*, FlagsUnsafe*,
// FlagsAll*) are provided below.
type NormalizationFlags uint

const (
	// Safe normalizations
	FlagLowercaseScheme           NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseHost                                            // http://HOST -> http://host
	FlagUppercaseEscapes                                         // http://host/t%ef -> http://host/t%EF
	FlagDecodeUnnecessaryEscapes                                 // http://host/t%41 -> http://host/tA
	FlagEncodeNecessaryEscapes                                   // http://host/!"#$ -> http://host/%21%22#$
	FlagRemoveDefaultPort                                        // http://host:80 -> http://host
	FlagRemoveEmptyQuerySeparator                                // http://host/path? -> http://host/path

	// Usually safe normalizations
	FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
	FlagAddTrailingSlash    // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
	FlagRemoveDotSegments   // http://host/path/./a/b/../c -> http://host/path/a/c

	// Unsafe normalizations
	FlagRemoveDirectoryIndex   // http://host/path/index.html -> http://host/path/
	FlagRemoveFragment         // http://host/path#fragment -> http://host/path
	FlagForceHTTP              // https://host -> http://host
	FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
	FlagRemoveWWW              // http://www.host/ -> http://host/
	FlagAddWWW                 // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
	FlagSortQuery              // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3

	// Normalizations not in the wikipedia article, required to cover tests cases
	// submitted by jehiah
	FlagDecodeDWORDHost           // http://1113982867 -> http://66.102.7.147
	FlagDecodeOctalHost           // http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeHexHost             // http://0x42660793 -> http://66.102.7.147
	FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
	FlagRemoveEmptyPortSeparator  // http://host:/path -> http://host/path

	// FlagsSafe is a convenience set of safe normalizations.
	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator

	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".

	// Convenience sets of usually safe normalizations (include FlagsSafe).
	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// Convenience sets of unsafe normalizations (include FlagsUsuallySafe*).
	FlagsUnsafeGreedy    NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery

	// Convenience sets of all available flags.
	FlagsAllGreedy    = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)
76
const (
	// Default ports removed by FlagRemoveDefaultPort when they match the scheme.
	defaultHttpPort  = ":80"
	defaultHttpsPort = ":443"
)

// Regular expressions used by the normalizations, compiled once at package init.
var rxPort = regexp.MustCompile(`(:\d+)/?$`)                              // trailing ":port" on a host
var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`) // directory-index file at the end of a path
var rxDupSlashes = regexp.MustCompile(`/{2,}`)                            // runs of two or more slashes
var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)       // host expressed as a single decimal DWORD, optional dots/port tail
var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`) // host expressed as four octal octets
var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`) // host expressed as a hex number
var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)                     // splits host from an optional ":port"
var rxEmptyPort = regexp.MustCompile(`:+$`)                               // trailing colon(s) with no port digits
91
// Map of flags to implementation function.
// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.

// Since maps have undefined traversing order, make a slice of ordered keys.
// The normalizations are applied in this exact order; some entries are
// order-sensitive (see the inline comments).
var flagsOrder = []NormalizationFlags{
	FlagLowercaseScheme,
	FlagLowercaseHost,
	FlagRemoveDefaultPort,
	FlagRemoveDirectoryIndex,
	FlagRemoveDotSegments,
	FlagRemoveFragment,
	FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
	FlagRemoveDuplicateSlashes,
	FlagRemoveWWW,
	FlagAddWWW,
	FlagSortQuery,
	FlagDecodeDWORDHost,
	FlagDecodeOctalHost,
	FlagDecodeHexHost,
	FlagRemoveUnnecessaryHostDots,
	FlagRemoveEmptyPortSeparator,
	FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
	FlagAddTrailingSlash,
}
117
// ... and then the map from flag to its implementation, where order is
// unimportant (iteration is driven by flagsOrder above).
var flags = map[NormalizationFlags]func(*url.URL){
	FlagLowercaseScheme:           lowercaseScheme,
	FlagLowercaseHost:             lowercaseHost,
	FlagRemoveDefaultPort:         removeDefaultPort,
	FlagRemoveDirectoryIndex:      removeDirectoryIndex,
	FlagRemoveDotSegments:         removeDotSegments,
	FlagRemoveFragment:            removeFragment,
	FlagForceHTTP:                 forceHTTP,
	FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
	FlagRemoveWWW:                 removeWWW,
	FlagAddWWW:                    addWWW,
	FlagSortQuery:                 sortQuery,
	FlagDecodeDWORDHost:           decodeDWORDHost,
	FlagDecodeOctalHost:           decodeOctalHost,
	FlagDecodeHexHost:             decodeHexHost,
	FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
	FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
	FlagRemoveTrailingSlash:       removeTrailingSlash,
	FlagAddTrailingSlash:          addTrailingSlash,
}
139
140// MustNormalizeURLString returns the normalized string, and panics if an error occurs.
141// It takes an URL string as input, as well as the normalization flags.
142func MustNormalizeURLString(u string, f NormalizationFlags) string {
143	result, e := NormalizeURLString(u, f)
144	if e != nil {
145		panic(e)
146	}
147	return result
148}
149
150// NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object.
151// It takes an URL string as input, as well as the normalization flags.
152func NormalizeURLString(u string, f NormalizationFlags) (string, error) {
153	parsed, err := url.Parse(u)
154	if err != nil {
155		return "", err
156	}
157
158	if f&FlagLowercaseHost == FlagLowercaseHost {
159		parsed.Host = strings.ToLower(parsed.Host)
160	}
161
162	// The idna package doesn't fully conform to RFC 5895
163	// (https://tools.ietf.org/html/rfc5895), so we do it here.
164	// Taken from Go 1.8 cycle source, courtesy of bradfitz.
165	// TODO: Remove when (if?) idna package conforms to RFC 5895.
166	parsed.Host = width.Fold.String(parsed.Host)
167	parsed.Host = norm.NFC.String(parsed.Host)
168	if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil {
169		return "", err
170	}
171
172	return NormalizeURL(parsed, f), nil
173}
174
175// NormalizeURL returns the normalized string.
176// It takes a parsed URL object as input, as well as the normalization flags.
177func NormalizeURL(u *url.URL, f NormalizationFlags) string {
178	for _, k := range flagsOrder {
179		if f&k == k {
180			flags[k](u)
181		}
182	}
183	return urlesc.Escape(u)
184}
185
186func lowercaseScheme(u *url.URL) {
187	if len(u.Scheme) > 0 {
188		u.Scheme = strings.ToLower(u.Scheme)
189	}
190}
191
192func lowercaseHost(u *url.URL) {
193	if len(u.Host) > 0 {
194		u.Host = strings.ToLower(u.Host)
195	}
196}
197
198func removeDefaultPort(u *url.URL) {
199	if len(u.Host) > 0 {
200		scheme := strings.ToLower(u.Scheme)
201		u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string {
202			if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) {
203				return ""
204			}
205			return val
206		})
207	}
208}
209
210func removeTrailingSlash(u *url.URL) {
211	if l := len(u.Path); l > 0 {
212		if strings.HasSuffix(u.Path, "/") {
213			u.Path = u.Path[:l-1]
214		}
215	} else if l = len(u.Host); l > 0 {
216		if strings.HasSuffix(u.Host, "/") {
217			u.Host = u.Host[:l-1]
218		}
219	}
220}
221
222func addTrailingSlash(u *url.URL) {
223	if l := len(u.Path); l > 0 {
224		if !strings.HasSuffix(u.Path, "/") {
225			u.Path += "/"
226		}
227	} else if l = len(u.Host); l > 0 {
228		if !strings.HasSuffix(u.Host, "/") {
229			u.Host += "/"
230		}
231	}
232}
233
234func removeDotSegments(u *url.URL) {
235	if len(u.Path) > 0 {
236		var dotFree []string
237		var lastIsDot bool
238
239		sections := strings.Split(u.Path, "/")
240		for _, s := range sections {
241			if s == ".." {
242				if len(dotFree) > 0 {
243					dotFree = dotFree[:len(dotFree)-1]
244				}
245			} else if s != "." {
246				dotFree = append(dotFree, s)
247			}
248			lastIsDot = (s == "." || s == "..")
249		}
250		// Special case if host does not end with / and new path does not begin with /
251		u.Path = strings.Join(dotFree, "/")
252		if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") {
253			u.Path = "/" + u.Path
254		}
255		// Special case if the last segment was a dot, make sure the path ends with a slash
256		if lastIsDot && !strings.HasSuffix(u.Path, "/") {
257			u.Path += "/"
258		}
259	}
260}
261
262func removeDirectoryIndex(u *url.URL) {
263	if len(u.Path) > 0 {
264		u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1")
265	}
266}
267
268func removeFragment(u *url.URL) {
269	u.Fragment = ""
270}
271
272func forceHTTP(u *url.URL) {
273	if strings.ToLower(u.Scheme) == "https" {
274		u.Scheme = "http"
275	}
276}
277
278func removeDuplicateSlashes(u *url.URL) {
279	if len(u.Path) > 0 {
280		u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/")
281	}
282}
283
284func removeWWW(u *url.URL) {
285	if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") {
286		u.Host = u.Host[4:]
287	}
288}
289
290func addWWW(u *url.URL) {
291	if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") {
292		u.Host = "www." + u.Host
293	}
294}
295
296func sortQuery(u *url.URL) {
297	q := u.Query()
298
299	if len(q) > 0 {
300		arKeys := make([]string, len(q))
301		i := 0
302		for k := range q {
303			arKeys[i] = k
304			i++
305		}
306		sort.Strings(arKeys)
307		buf := new(bytes.Buffer)
308		for _, k := range arKeys {
309			sort.Strings(q[k])
310			for _, v := range q[k] {
311				if buf.Len() > 0 {
312					buf.WriteRune('&')
313				}
314				buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v)))
315			}
316		}
317
318		// Rebuild the raw query string
319		u.RawQuery = buf.String()
320	}
321}
322
323func decodeDWORDHost(u *url.URL) {
324	if len(u.Host) > 0 {
325		if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 {
326			var parts [4]int64
327
328			dword, _ := strconv.ParseInt(matches[1], 10, 0)
329			for i, shift := range []uint{24, 16, 8, 0} {
330				parts[i] = dword >> shift & 0xFF
331			}
332			u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2])
333		}
334	}
335}
336
337func decodeOctalHost(u *url.URL) {
338	if len(u.Host) > 0 {
339		if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 {
340			var parts [4]int64
341
342			for i := 1; i <= 4; i++ {
343				parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0)
344			}
345			u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5])
346		}
347	}
348}
349
350func decodeHexHost(u *url.URL) {
351	if len(u.Host) > 0 {
352		if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 {
353			// Conversion is safe because of regex validation
354			parsed, _ := strconv.ParseInt(matches[1], 16, 0)
355			// Set host as DWORD (base 10) encoded host
356			u.Host = fmt.Sprintf("%d%s", parsed, matches[2])
357			// The rest is the same as decoding a DWORD host
358			decodeDWORDHost(u)
359		}
360	}
361}
362
363func removeUnncessaryHostDots(u *url.URL) {
364	if len(u.Host) > 0 {
365		if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 {
366			// Trim the leading and trailing dots
367			u.Host = strings.Trim(matches[1], ".")
368			if len(matches) > 2 {
369				u.Host += matches[2]
370			}
371		}
372	}
373}
374
375func removeEmptyPortSeparator(u *url.URL) {
376	if len(u.Host) > 0 {
377		u.Host = rxEmptyPort.ReplaceAllString(u.Host, "")
378	}
379}
380