1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package url parses URLs and implements query escaping. 6// See RFC 3986. 7package url 8 9import ( 10 "bytes" 11 "errors" 12 "sort" 13 "strconv" 14 "strings" 15) 16 17// Error reports an error and the operation and URL that caused it. 18type Error struct { 19 Op string 20 URL string 21 Err error 22} 23 24func (e *Error) Error() string { return e.Op + " " + e.URL + ": " + e.Err.Error() } 25 26func ishex(c byte) bool { 27 switch { 28 case '0' <= c && c <= '9': 29 return true 30 case 'a' <= c && c <= 'f': 31 return true 32 case 'A' <= c && c <= 'F': 33 return true 34 } 35 return false 36} 37 38func unhex(c byte) byte { 39 switch { 40 case '0' <= c && c <= '9': 41 return c - '0' 42 case 'a' <= c && c <= 'f': 43 return c - 'a' + 10 44 case 'A' <= c && c <= 'F': 45 return c - 'A' + 10 46 } 47 return 0 48} 49 50type encoding int 51 52const ( 53 encodePath encoding = 1 + iota 54 encodeUserPassword 55 encodeQueryComponent 56 encodeFragment 57) 58 59type EscapeError string 60 61func (e EscapeError) Error() string { 62 return "invalid URL escape " + strconv.Quote(string(e)) 63} 64 65// Return true if the specified character should be escaped when 66// appearing in a URL string, according to RFC 3986. 67// When 'all' is true the full range of reserved characters are matched. 68func shouldEscape(c byte, mode encoding) bool { 69 // §2.3 Unreserved characters (alphanum) 70 if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' { 71 return false 72 } 73 74 switch c { 75 case '-', '_', '.', '~': // §2.3 Unreserved characters (mark) 76 return false 77 78 case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved) 79 // Different sections of the URL allow a few of 80 // the reserved characters to appear unescaped. 81 switch mode { 82 case encodePath: // §3.3 83 // The RFC allows : @ & = + $ but saves / ; , for assigning 84 // meaning to individual path segments. This package 85 // only manipulates the path as a whole, so we allow those 86 // last two as well. That leaves only ? to escape. 87 return c == '?' 88 89 case encodeUserPassword: // §3.2.2 90 // The RFC allows ; : & = + $ , in userinfo, so we must escape only @ and /. 91 // The parsing of userinfo treats : as special so we must escape that too. 92 return c == '@' || c == '/' || c == ':' 93 94 case encodeQueryComponent: // §3.4 95 // The RFC reserves (so we must escape) everything. 96 return true 97 98 case encodeFragment: // §4.1 99 // The RFC text is silent but the grammar allows 100 // everything, so escape nothing. 101 return false 102 } 103 } 104 105 // Everything else must be escaped. 106 return true 107} 108 109// QueryUnescape does the inverse transformation of QueryEscape, converting 110// %AB into the byte 0xAB and '+' into ' ' (space). It returns an error if 111// any % is not followed by two hexadecimal digits. 112func QueryUnescape(s string) (string, error) { 113 return unescape(s, encodeQueryComponent) 114} 115 116// unescape unescapes a string; the mode specifies 117// which section of the URL string is being unescaped. 118func unescape(s string, mode encoding) (string, error) { 119 // Count %, check that they're well-formed. 120 n := 0 121 hasPlus := false 122 for i := 0; i < len(s); { 123 switch s[i] { 124 case '%': 125 n++ 126 if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) { 127 s = s[i:] 128 if len(s) > 3 { 129 s = s[0:3] 130 } 131 return "", EscapeError(s) 132 } 133 i += 3 134 case '+': 135 hasPlus = mode == encodeQueryComponent 136 i++ 137 default: 138 i++ 139 } 140 } 141 142 if n == 0 && !hasPlus { 143 return s, nil 144 } 145 146 t := make([]byte, len(s)-2*n) 147 j := 0 148 for i := 0; i < len(s); { 149 switch s[i] { 150 case '%': 151 t[j] = unhex(s[i+1])<<4 | unhex(s[i+2]) 152 j++ 153 i += 3 154 case '+': 155 if mode == encodeQueryComponent { 156 t[j] = ' ' 157 } else { 158 t[j] = '+' 159 } 160 j++ 161 i++ 162 default: 163 t[j] = s[i] 164 j++ 165 i++ 166 } 167 } 168 return string(t), nil 169} 170 171// QueryEscape escapes the string so it can be safely placed 172// inside a URL query. 173func QueryEscape(s string) string { 174 return escape(s, encodeQueryComponent) 175} 176 177func escape(s string, mode encoding) string { 178 spaceCount, hexCount := 0, 0 179 for i := 0; i < len(s); i++ { 180 c := s[i] 181 if shouldEscape(c, mode) { 182 if c == ' ' && mode == encodeQueryComponent { 183 spaceCount++ 184 } else { 185 hexCount++ 186 } 187 } 188 } 189 190 if spaceCount == 0 && hexCount == 0 { 191 return s 192 } 193 194 t := make([]byte, len(s)+2*hexCount) 195 j := 0 196 for i := 0; i < len(s); i++ { 197 switch c := s[i]; { 198 case c == ' ' && mode == encodeQueryComponent: 199 t[j] = '+' 200 j++ 201 case shouldEscape(c, mode): 202 t[j] = '%' 203 t[j+1] = "0123456789ABCDEF"[c>>4] 204 t[j+2] = "0123456789ABCDEF"[c&15] 205 j += 3 206 default: 207 t[j] = s[i] 208 j++ 209 } 210 } 211 return string(t) 212} 213 214// A URL represents a parsed URL (technically, a URI reference). 215// The general form represented is: 216// 217// scheme://[userinfo@]host/path[?query][#fragment] 218// 219// URLs that do not start with a slash after the scheme are interpreted as: 220// 221// scheme:opaque[?query][#fragment] 222// 223// Note that the Path field is stored in decoded form: /%47%6f%2f becomes /Go/. 224// A consequence is that it is impossible to tell which slashes in the Path were 225// slashes in the raw URL and which were %2f. This distinction is rarely important, 226// but when it is a client must use other routines to parse the raw URL or construct 227// the parsed URL. For example, an HTTP server can consult req.RequestURI, and 228// an HTTP client can use URL{Host: "example.com", Opaque: "//example.com/Go%2f"} 229// instead of URL{Host: "example.com", Path: "/Go/"}. 230type URL struct { 231 Scheme string 232 Opaque string // encoded opaque data 233 User *Userinfo // username and password information 234 Host string // host or host:port 235 Path string 236 RawQuery string // encoded query values, without '?' 237 Fragment string // fragment for references, without '#' 238} 239 240// User returns a Userinfo containing the provided username 241// and no password set. 242func User(username string) *Userinfo { 243 return &Userinfo{username, "", false} 244} 245 246// UserPassword returns a Userinfo containing the provided username 247// and password. 248// This functionality should only be used with legacy web sites. 249// RFC 2396 warns that interpreting Userinfo this way 250// ``is NOT RECOMMENDED, because the passing of authentication 251// information in clear text (such as URI) has proven to be a 252// security risk in almost every case where it has been used.'' 253func UserPassword(username, password string) *Userinfo { 254 return &Userinfo{username, password, true} 255} 256 257// The Userinfo type is an immutable encapsulation of username and 258// password details for a URL. An existing Userinfo value is guaranteed 259// to have a username set (potentially empty, as allowed by RFC 2396), 260// and optionally a password. 261type Userinfo struct { 262 username string 263 password string 264 passwordSet bool 265} 266 267// Username returns the username. 268func (u *Userinfo) Username() string { 269 return u.username 270} 271 272// Password returns the password in case it is set, and whether it is set. 273func (u *Userinfo) Password() (string, bool) { 274 if u.passwordSet { 275 return u.password, true 276 } 277 return "", false 278} 279 280// String returns the encoded userinfo information in the standard form 281// of "username[:password]". 282func (u *Userinfo) String() string { 283 s := escape(u.username, encodeUserPassword) 284 if u.passwordSet { 285 s += ":" + escape(u.password, encodeUserPassword) 286 } 287 return s 288} 289 290// Maybe rawurl is of the form scheme:path. 291// (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*) 292// If so, return scheme, path; else return "", rawurl. 293func getscheme(rawurl string) (scheme, path string, err error) { 294 for i := 0; i < len(rawurl); i++ { 295 c := rawurl[i] 296 switch { 297 case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': 298 // do nothing 299 case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.': 300 if i == 0 { 301 return "", rawurl, nil 302 } 303 case c == ':': 304 if i == 0 { 305 return "", "", errors.New("missing protocol scheme") 306 } 307 return rawurl[0:i], rawurl[i+1:], nil 308 default: 309 // we have encountered an invalid character, 310 // so there is no valid scheme 311 return "", rawurl, nil 312 } 313 } 314 return "", rawurl, nil 315} 316 317// Maybe s is of the form t c u. 318// If so, return t, c u (or t, u if cutc == true). 319// If not, return s, "". 320func split(s string, c string, cutc bool) (string, string) { 321 i := strings.Index(s, c) 322 if i < 0 { 323 return s, "" 324 } 325 if cutc { 326 return s[0:i], s[i+len(c):] 327 } 328 return s[0:i], s[i:] 329} 330 331// Parse parses rawurl into a URL structure. 332// The rawurl may be relative or absolute. 333func Parse(rawurl string) (url *URL, err error) { 334 // Cut off #frag 335 u, frag := split(rawurl, "#", true) 336 if url, err = parse(u, false); err != nil { 337 return nil, err 338 } 339 if frag == "" { 340 return url, nil 341 } 342 if url.Fragment, err = unescape(frag, encodeFragment); err != nil { 343 return nil, &Error{"parse", rawurl, err} 344 } 345 return url, nil 346} 347 348// ParseRequestURI parses rawurl into a URL structure. It assumes that 349// rawurl was received in an HTTP request, so the rawurl is interpreted 350// only as an absolute URI or an absolute path. 351// The string rawurl is assumed not to have a #fragment suffix. 352// (Web browsers strip #fragment before sending the URL to a web server.) 353func ParseRequestURI(rawurl string) (url *URL, err error) { 354 return parse(rawurl, true) 355} 356 357// parse parses a URL from a string in one of two contexts. If 358// viaRequest is true, the URL is assumed to have arrived via an HTTP request, 359// in which case only absolute URLs or path-absolute relative URLs are allowed. 360// If viaRequest is false, all forms of relative URLs are allowed. 361func parse(rawurl string, viaRequest bool) (url *URL, err error) { 362 var rest string 363 364 if rawurl == "" && viaRequest { 365 err = errors.New("empty url") 366 goto Error 367 } 368 url = new(URL) 369 370 if rawurl == "*" { 371 url.Path = "*" 372 return 373 } 374 375 // Split off possible leading "http:", "mailto:", etc. 376 // Cannot contain escaped characters. 377 if url.Scheme, rest, err = getscheme(rawurl); err != nil { 378 goto Error 379 } 380 url.Scheme = strings.ToLower(url.Scheme) 381 382 rest, url.RawQuery = split(rest, "?", true) 383 384 if !strings.HasPrefix(rest, "/") { 385 if url.Scheme != "" { 386 // We consider rootless paths per RFC 3986 as opaque. 387 url.Opaque = rest 388 return url, nil 389 } 390 if viaRequest { 391 err = errors.New("invalid URI for request") 392 goto Error 393 } 394 } 395 396 if (url.Scheme != "" || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") { 397 var authority string 398 authority, rest = split(rest[2:], "/", false) 399 url.User, url.Host, err = parseAuthority(authority) 400 if err != nil { 401 goto Error 402 } 403 if strings.Contains(url.Host, "%") { 404 err = errors.New("hexadecimal escape in host") 405 goto Error 406 } 407 } 408 if url.Path, err = unescape(rest, encodePath); err != nil { 409 goto Error 410 } 411 return url, nil 412 413Error: 414 return nil, &Error{"parse", rawurl, err} 415} 416 417func parseAuthority(authority string) (user *Userinfo, host string, err error) { 418 i := strings.LastIndex(authority, "@") 419 if i < 0 { 420 host = authority 421 return 422 } 423 userinfo, host := authority[:i], authority[i+1:] 424 if strings.Index(userinfo, ":") < 0 { 425 if userinfo, err = unescape(userinfo, encodeUserPassword); err != nil { 426 return 427 } 428 user = User(userinfo) 429 } else { 430 username, password := split(userinfo, ":", true) 431 if username, err = unescape(username, encodeUserPassword); err != nil { 432 return 433 } 434 if password, err = unescape(password, encodeUserPassword); err != nil { 435 return 436 } 437 user = UserPassword(username, password) 438 } 439 return 440} 441 442// String reassembles the URL into a valid URL string. 443func (u *URL) String() string { 444 var buf bytes.Buffer 445 if u.Scheme != "" { 446 buf.WriteString(u.Scheme) 447 buf.WriteByte(':') 448 } 449 if u.Opaque != "" { 450 buf.WriteString(u.Opaque) 451 } else { 452 if u.Scheme != "" || u.Host != "" || u.User != nil { 453 buf.WriteString("//") 454 if u := u.User; u != nil { 455 buf.WriteString(u.String()) 456 buf.WriteByte('@') 457 } 458 if h := u.Host; h != "" { 459 buf.WriteString(h) 460 } 461 } 462 buf.WriteString(escape(u.Path, encodePath)) 463 } 464 if u.RawQuery != "" { 465 buf.WriteByte('?') 466 buf.WriteString(u.RawQuery) 467 } 468 if u.Fragment != "" { 469 buf.WriteByte('#') 470 buf.WriteString(escape(u.Fragment, encodeFragment)) 471 } 472 return buf.String() 473} 474 475// Values maps a string key to a list of values. 476// It is typically used for query parameters and form values. 477// Unlike in the http.Header map, the keys in a Values map 478// are case-sensitive. 479type Values map[string][]string 480 481// Get gets the first value associated with the given key. 482// If there are no values associated with the key, Get returns 483// the empty string. To access multiple values, use the map 484// directly. 485func (v Values) Get(key string) string { 486 if v == nil { 487 return "" 488 } 489 vs, ok := v[key] 490 if !ok || len(vs) == 0 { 491 return "" 492 } 493 return vs[0] 494} 495 496// Set sets the key to value. It replaces any existing 497// values. 498func (v Values) Set(key, value string) { 499 v[key] = []string{value} 500} 501 502// Add adds the key to value. It appends to any existing 503// values associated with key. 504func (v Values) Add(key, value string) { 505 v[key] = append(v[key], value) 506} 507 508// Del deletes the values associated with key. 509func (v Values) Del(key string) { 510 delete(v, key) 511} 512 513// ParseQuery parses the URL-encoded query string and returns 514// a map listing the values specified for each key. 515// ParseQuery always returns a non-nil map containing all the 516// valid query parameters found; err describes the first decoding error 517// encountered, if any. 518func ParseQuery(query string) (m Values, err error) { 519 m = make(Values) 520 err = parseQuery(m, query) 521 return 522} 523 524func parseQuery(m Values, query string) (err error) { 525 for query != "" { 526 key := query 527 if i := strings.IndexAny(key, "&;"); i >= 0 { 528 key, query = key[:i], key[i+1:] 529 } else { 530 query = "" 531 } 532 if key == "" { 533 continue 534 } 535 value := "" 536 if i := strings.Index(key, "="); i >= 0 { 537 key, value = key[:i], key[i+1:] 538 } 539 key, err1 := QueryUnescape(key) 540 if err1 != nil { 541 if err == nil { 542 err = err1 543 } 544 continue 545 } 546 value, err1 = QueryUnescape(value) 547 if err1 != nil { 548 if err == nil { 549 err = err1 550 } 551 continue 552 } 553 m[key] = append(m[key], value) 554 } 555 return err 556} 557 558// Encode encodes the values into ``URL encoded'' form. 559// e.g. "foo=bar&bar=baz" 560func (v Values) Encode() string { 561 if v == nil { 562 return "" 563 } 564 var buf bytes.Buffer 565 keys := make([]string, 0, len(v)) 566 for k := range v { 567 keys = append(keys, k) 568 } 569 sort.Strings(keys) 570 for _, k := range keys { 571 vs := v[k] 572 prefix := QueryEscape(k) + "=" 573 for _, v := range vs { 574 if buf.Len() > 0 { 575 buf.WriteByte('&') 576 } 577 buf.WriteString(prefix) 578 buf.WriteString(QueryEscape(v)) 579 } 580 } 581 return buf.String() 582} 583 584// resolvePath applies special path segments from refs and applies 585// them to base, per RFC 3986. 586func resolvePath(base, ref string) string { 587 var full string 588 if ref == "" { 589 full = base 590 } else if ref[0] != '/' { 591 i := strings.LastIndex(base, "/") 592 full = base[:i+1] + ref 593 } else { 594 full = ref 595 } 596 if full == "" { 597 return "" 598 } 599 var dst []string 600 src := strings.Split(full, "/") 601 for _, elem := range src { 602 switch elem { 603 case ".": 604 // drop 605 case "..": 606 if len(dst) > 0 { 607 dst = dst[:len(dst)-1] 608 } 609 default: 610 dst = append(dst, elem) 611 } 612 } 613 if last := src[len(src)-1]; last == "." || last == ".." { 614 // Add final slash to the joined path. 615 dst = append(dst, "") 616 } 617 return "/" + strings.TrimLeft(strings.Join(dst, "/"), "/") 618} 619 620// IsAbs returns true if the URL is absolute. 621func (u *URL) IsAbs() bool { 622 return u.Scheme != "" 623} 624 625// Parse parses a URL in the context of the receiver. The provided URL 626// may be relative or absolute. Parse returns nil, err on parse 627// failure, otherwise its return value is the same as ResolveReference. 628func (u *URL) Parse(ref string) (*URL, error) { 629 refurl, err := Parse(ref) 630 if err != nil { 631 return nil, err 632 } 633 return u.ResolveReference(refurl), nil 634} 635 636// ResolveReference resolves a URI reference to an absolute URI from 637// an absolute base URI, per RFC 3986 Section 5.2. The URI reference 638// may be relative or absolute. ResolveReference always returns a new 639// URL instance, even if the returned URL is identical to either the 640// base or reference. If ref is an absolute URL, then ResolveReference 641// ignores base and returns a copy of ref. 642func (u *URL) ResolveReference(ref *URL) *URL { 643 url := *ref 644 if ref.Scheme == "" { 645 url.Scheme = u.Scheme 646 } 647 if ref.Scheme != "" || ref.Host != "" || ref.User != nil { 648 // The "absoluteURI" or "net_path" cases. 649 url.Path = resolvePath(ref.Path, "") 650 return &url 651 } 652 if ref.Opaque != "" { 653 url.User = nil 654 url.Host = "" 655 url.Path = "" 656 return &url 657 } 658 if ref.Path == "" { 659 if ref.RawQuery == "" { 660 url.RawQuery = u.RawQuery 661 if ref.Fragment == "" { 662 url.Fragment = u.Fragment 663 } 664 } 665 } 666 // The "abs_path" or "rel_path" cases. 667 url.Host = u.Host 668 url.User = u.User 669 url.Path = resolvePath(u.Path, ref.Path) 670 return &url 671} 672 673// Query parses RawQuery and returns the corresponding values. 674func (u *URL) Query() Values { 675 v, _ := ParseQuery(u.RawQuery) 676 return v 677} 678 679// RequestURI returns the encoded path?query or opaque?query 680// string that would be used in an HTTP request for u. 681func (u *URL) RequestURI() string { 682 result := u.Opaque 683 if result == "" { 684 result = escape(u.Path, encodePath) 685 if result == "" { 686 result = "/" 687 } 688 } else { 689 if strings.HasPrefix(result, "//") { 690 result = u.Scheme + ":" + result 691 } 692 } 693 if u.RawQuery != "" { 694 result += "?" + u.RawQuery 695 } 696 return result 697} 698