/*
Package purell offers URL normalization as described on the wikipedia page:
http://en.wikipedia.org/wiki/URL_normalization
*/
package purell

import (
	"bytes"
	"fmt"
	"net/url"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/urlesc"
	"golang.org/x/net/idna"
	"golang.org/x/text/unicode/norm"
	"golang.org/x/text/width"
)

// NormalizationFlags is a bit set; each bit selects one normalization, and
// the combination determines how a URL will be normalized.
type NormalizationFlags uint

// The individual flags are consecutive bits generated with iota, so the
// order of the declarations below defines each flag's value.
const (
	// Safe normalizations

	// FlagLowercaseScheme: HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseScheme NormalizationFlags = 1 << iota
	// FlagLowercaseHost: http://HOST -> http://host
	FlagLowercaseHost
	// FlagUppercaseEscapes: http://host/t%ef -> http://host/t%EF
	FlagUppercaseEscapes
	// FlagDecodeUnnecessaryEscapes: http://host/t%41 -> http://host/tA
	FlagDecodeUnnecessaryEscapes
	// FlagEncodeNecessaryEscapes: http://host/!"#$ -> http://host/%21%22#$
	FlagEncodeNecessaryEscapes
	// FlagRemoveDefaultPort: http://host:80 -> http://host
	FlagRemoveDefaultPort
	// FlagRemoveEmptyQuerySeparator: http://host/path? -> http://host/path
	FlagRemoveEmptyQuerySeparator

	// Usually safe normalizations

	// FlagRemoveTrailingSlash: http://host/path/ -> http://host/path
	FlagRemoveTrailingSlash
	// FlagAddTrailingSlash: http://host/path -> http://host/path/
	// (should choose only one of these add/remove trailing slash flags)
	FlagAddTrailingSlash
	// FlagRemoveDotSegments: http://host/path/./a/b/../c -> http://host/path/a/c
	FlagRemoveDotSegments

	// Unsafe normalizations

	// FlagRemoveDirectoryIndex: http://host/path/index.html -> http://host/path/
	FlagRemoveDirectoryIndex
	// FlagRemoveFragment: http://host/path#fragment -> http://host/path
	FlagRemoveFragment
	// FlagForceHTTP: https://host -> http://host
	FlagForceHTTP
	// FlagRemoveDuplicateSlashes: http://host/path//a///b -> http://host/path/a/b
	FlagRemoveDuplicateSlashes
	// FlagRemoveWWW: http://www.host/ -> http://host/
	FlagRemoveWWW
	// FlagAddWWW: http://host/ -> http://www.host/
	// (should choose only one of these add/remove WWW flags)
	FlagAddWWW
	// FlagSortQuery: http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3
	FlagSortQuery

	// Normalizations not in the wikipedia article, required to cover test cases
	// submitted by jehiah

	// FlagDecodeDWORDHost: http://1113982867 -> http://66.102.7.147
	FlagDecodeDWORDHost
	// FlagDecodeOctalHost: http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeOctalHost
	// FlagDecodeHexHost: http://0x42660793 -> http://66.102.7.147
	FlagDecodeHexHost
	// FlagRemoveUnnecessaryHostDots: http://.host../path -> http://host/path
	FlagRemoveUnnecessaryHostDots
	// FlagRemoveEmptyPortSeparator: http://host:/path -> http://host/path
	FlagRemoveEmptyPortSeparator

	// FlagsSafe is the convenience set of safe normalizations.
	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator

	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".

	// FlagsUsuallySafeGreedy is the greedy convenience set of usually safe
	// normalizations (includes FlagsSafe).
	FlagsUsuallySafeGreedy NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	// FlagsUsuallySafeNonGreedy is the non-greedy convenience set of usually
	// safe normalizations (includes FlagsSafe).
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// FlagsUnsafeGreedy is the greedy convenience set of unsafe normalizations
	// (includes FlagsUsuallySafeGreedy).
	FlagsUnsafeGreedy NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
	// FlagsUnsafeNonGreedy is the non-greedy convenience set of unsafe
	// normalizations (includes FlagsUsuallySafeNonGreedy).
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery

	// FlagsAllGreedy is the greedy convenience set of all available flags.
	FlagsAllGreedy = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
	// FlagsAllNonGreedy is the non-greedy convenience set of all available flags.
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)

// Port suffixes (including the leading colon) that removeDefaultPort strips
// for the http and https schemes respectively.
const (
	defaultHttpPort  = ":80"
	defaultHttpsPort = ":443"
)

// Regular expressions used by the normalizations

// rxPort matches a port (colon plus digits) at the end of the host,
// optionally followed by a single slash.
var rxPort = regexp.MustCompile(`(:\d+)/?$`)

// rxDirIndex matches a final path segment named "default" or "index" with an
// extension of one to four word characters; group 1 is the preceding slash
// (or the start of the path).
var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)

// rxDupSlashes matches any run of two or more slashes.
var rxDupSlashes = regexp.MustCompile(`/{2,}`)

// rxDWORDHost matches a host made only of decimal digits (group 1), followed
// by optional trailing dots and an optional port (group 2).
var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)

// rxOctalHost matches four dot-separated groups that each start with 0
// (groups 1-4), followed by optional trailing dots and an optional port
// (group 5). Note that the groups accept any decimal digit, not only 0-7.
var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)

// rxHexHost matches a host written as 0x followed by hex digits (group 1),
// followed by optional trailing dots and an optional port (group 2).
var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)

// rxHostDots splits a host into the host name (group 1, matched lazily) and
// an optional numeric port with its colon (group 2).
var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)

// rxEmptyPort matches one or more colons at the very end of the host.
var rxEmptyPort = regexp.MustCompile(`:+$`)

// Map of flags to implementation function.
// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
// NOTE(review): FlagEncodeNecessaryEscapes has no entry either; presumably it
// is covered by the final urlesc.Escape call in NormalizeURL - confirm.

// Since maps have undefined traversing order, make a slice of ordered keys
var flagsOrder = []NormalizationFlags{
	FlagLowercaseScheme,
	FlagLowercaseHost,
	FlagRemoveDefaultPort,
	FlagRemoveDirectoryIndex,
	FlagRemoveDotSegments,
	FlagRemoveFragment,
	// Must be after remove default port (because https=443/http=80)
	FlagForceHTTP,
	FlagRemoveDuplicateSlashes,
	FlagRemoveWWW,
	FlagAddWWW,
	FlagSortQuery,
	FlagDecodeDWORDHost,
	FlagDecodeOctalHost,
	FlagDecodeHexHost,
	FlagRemoveUnnecessaryHostDots,
	FlagRemoveEmptyPortSeparator,
	// These two (add/remove trailing slash) must be last
	FlagRemoveTrailingSlash,
	FlagAddTrailingSlash,
}

// ... and then the map, where order is unimportant
var flags = map[NormalizationFlags]func(*url.URL){
	FlagLowercaseScheme:           lowercaseScheme,
	FlagLowercaseHost:             lowercaseHost,
	FlagRemoveDefaultPort:         removeDefaultPort,
	FlagRemoveDirectoryIndex:      removeDirectoryIndex,
	FlagRemoveDotSegments:         removeDotSegments,
	FlagRemoveFragment:            removeFragment,
	FlagForceHTTP:                 forceHTTP,
	FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
	FlagRemoveWWW:                 removeWWW,
	FlagAddWWW:                    addWWW,
	FlagSortQuery:                 sortQuery,
	FlagDecodeDWORDHost:           decodeDWORDHost,
	FlagDecodeOctalHost:           decodeOctalHost,
	FlagDecodeHexHost:             decodeHexHost,
	FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
	FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
	FlagRemoveTrailingSlash:       removeTrailingSlash,
	FlagAddTrailingSlash:          addTrailingSlash,
}

// MustNormalizeURLString returns the normalized string, and panics if an error occurs.
// It takes a URL string as input, as well as the normalization flags.
142func MustNormalizeURLString(u string, f NormalizationFlags) string { 143 result, e := NormalizeURLString(u, f) 144 if e != nil { 145 panic(e) 146 } 147 return result 148} 149 150// NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object. 151// It takes an URL string as input, as well as the normalization flags. 152func NormalizeURLString(u string, f NormalizationFlags) (string, error) { 153 parsed, err := url.Parse(u) 154 if err != nil { 155 return "", err 156 } 157 158 if f&FlagLowercaseHost == FlagLowercaseHost { 159 parsed.Host = strings.ToLower(parsed.Host) 160 } 161 162 // The idna package doesn't fully conform to RFC 5895 163 // (https://tools.ietf.org/html/rfc5895), so we do it here. 164 // Taken from Go 1.8 cycle source, courtesy of bradfitz. 165 // TODO: Remove when (if?) idna package conforms to RFC 5895. 166 parsed.Host = width.Fold.String(parsed.Host) 167 parsed.Host = norm.NFC.String(parsed.Host) 168 if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil { 169 return "", err 170 } 171 172 return NormalizeURL(parsed, f), nil 173} 174 175// NormalizeURL returns the normalized string. 176// It takes a parsed URL object as input, as well as the normalization flags. 
177func NormalizeURL(u *url.URL, f NormalizationFlags) string { 178 for _, k := range flagsOrder { 179 if f&k == k { 180 flags[k](u) 181 } 182 } 183 return urlesc.Escape(u) 184} 185 186func lowercaseScheme(u *url.URL) { 187 if len(u.Scheme) > 0 { 188 u.Scheme = strings.ToLower(u.Scheme) 189 } 190} 191 192func lowercaseHost(u *url.URL) { 193 if len(u.Host) > 0 { 194 u.Host = strings.ToLower(u.Host) 195 } 196} 197 198func removeDefaultPort(u *url.URL) { 199 if len(u.Host) > 0 { 200 scheme := strings.ToLower(u.Scheme) 201 u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string { 202 if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) { 203 return "" 204 } 205 return val 206 }) 207 } 208} 209 210func removeTrailingSlash(u *url.URL) { 211 if l := len(u.Path); l > 0 { 212 if strings.HasSuffix(u.Path, "/") { 213 u.Path = u.Path[:l-1] 214 } 215 } else if l = len(u.Host); l > 0 { 216 if strings.HasSuffix(u.Host, "/") { 217 u.Host = u.Host[:l-1] 218 } 219 } 220} 221 222func addTrailingSlash(u *url.URL) { 223 if l := len(u.Path); l > 0 { 224 if !strings.HasSuffix(u.Path, "/") { 225 u.Path += "/" 226 } 227 } else if l = len(u.Host); l > 0 { 228 if !strings.HasSuffix(u.Host, "/") { 229 u.Host += "/" 230 } 231 } 232} 233 234func removeDotSegments(u *url.URL) { 235 if len(u.Path) > 0 { 236 var dotFree []string 237 var lastIsDot bool 238 239 sections := strings.Split(u.Path, "/") 240 for _, s := range sections { 241 if s == ".." { 242 if len(dotFree) > 0 { 243 dotFree = dotFree[:len(dotFree)-1] 244 } 245 } else if s != "." { 246 dotFree = append(dotFree, s) 247 } 248 lastIsDot = (s == "." 
|| s == "..") 249 } 250 // Special case if host does not end with / and new path does not begin with / 251 u.Path = strings.Join(dotFree, "/") 252 if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") { 253 u.Path = "/" + u.Path 254 } 255 // Special case if the last segment was a dot, make sure the path ends with a slash 256 if lastIsDot && !strings.HasSuffix(u.Path, "/") { 257 u.Path += "/" 258 } 259 } 260} 261 262func removeDirectoryIndex(u *url.URL) { 263 if len(u.Path) > 0 { 264 u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1") 265 } 266} 267 268func removeFragment(u *url.URL) { 269 u.Fragment = "" 270} 271 272func forceHTTP(u *url.URL) { 273 if strings.ToLower(u.Scheme) == "https" { 274 u.Scheme = "http" 275 } 276} 277 278func removeDuplicateSlashes(u *url.URL) { 279 if len(u.Path) > 0 { 280 u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/") 281 } 282} 283 284func removeWWW(u *url.URL) { 285 if len(u.Host) > 0 && strings.HasPrefix(strings.ToLower(u.Host), "www.") { 286 u.Host = u.Host[4:] 287 } 288} 289 290func addWWW(u *url.URL) { 291 if len(u.Host) > 0 && !strings.HasPrefix(strings.ToLower(u.Host), "www.") { 292 u.Host = "www." 
+ u.Host 293 } 294} 295 296func sortQuery(u *url.URL) { 297 q := u.Query() 298 299 if len(q) > 0 { 300 arKeys := make([]string, len(q)) 301 i := 0 302 for k := range q { 303 arKeys[i] = k 304 i++ 305 } 306 sort.Strings(arKeys) 307 buf := new(bytes.Buffer) 308 for _, k := range arKeys { 309 sort.Strings(q[k]) 310 for _, v := range q[k] { 311 if buf.Len() > 0 { 312 buf.WriteRune('&') 313 } 314 buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v))) 315 } 316 } 317 318 // Rebuild the raw query string 319 u.RawQuery = buf.String() 320 } 321} 322 323func decodeDWORDHost(u *url.URL) { 324 if len(u.Host) > 0 { 325 if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 { 326 var parts [4]int64 327 328 dword, _ := strconv.ParseInt(matches[1], 10, 0) 329 for i, shift := range []uint{24, 16, 8, 0} { 330 parts[i] = dword >> shift & 0xFF 331 } 332 u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2]) 333 } 334 } 335} 336 337func decodeOctalHost(u *url.URL) { 338 if len(u.Host) > 0 { 339 if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 { 340 var parts [4]int64 341 342 for i := 1; i <= 4; i++ { 343 parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0) 344 } 345 u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5]) 346 } 347 } 348} 349 350func decodeHexHost(u *url.URL) { 351 if len(u.Host) > 0 { 352 if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 { 353 // Conversion is safe because of regex validation 354 parsed, _ := strconv.ParseInt(matches[1], 16, 0) 355 // Set host as DWORD (base 10) encoded host 356 u.Host = fmt.Sprintf("%d%s", parsed, matches[2]) 357 // The rest is the same as decoding a DWORD host 358 decodeDWORDHost(u) 359 } 360 } 361} 362 363func removeUnncessaryHostDots(u *url.URL) { 364 if len(u.Host) > 0 { 365 if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 { 366 // Trim the leading and trailing dots 367 u.Host = 
strings.Trim(matches[1], ".") 368 if len(matches) > 2 { 369 u.Host += matches[2] 370 } 371 } 372 } 373} 374 375func removeEmptyPortSeparator(u *url.URL) { 376 if len(u.Host) > 0 { 377 u.Host = rxEmptyPort.ReplaceAllString(u.Host, "") 378 } 379} 380