1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:generate go run makeisprint.go -output isprint.go 6 7package strconv 8 9import ( 10 "internal/bytealg" 11 "unicode/utf8" 12) 13 14const lowerhex = "0123456789abcdef" 15 16func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { 17 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) 18} 19 20func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { 21 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) 22} 23 24func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { 25 buf = append(buf, quote) 26 for width := 0; len(s) > 0; s = s[width:] { 27 r := rune(s[0]) 28 width = 1 29 if r >= utf8.RuneSelf { 30 r, width = utf8.DecodeRuneInString(s) 31 } 32 if width == 1 && r == utf8.RuneError { 33 buf = append(buf, `\x`...) 
34 buf = append(buf, lowerhex[s[0]>>4]) 35 buf = append(buf, lowerhex[s[0]&0xF]) 36 continue 37 } 38 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 39 } 40 buf = append(buf, quote) 41 return buf 42} 43 44func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 45 buf = append(buf, quote) 46 if !utf8.ValidRune(r) { 47 r = utf8.RuneError 48 } 49 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 50 buf = append(buf, quote) 51 return buf 52} 53 54func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 55 var runeTmp [utf8.UTFMax]byte 56 if r == rune(quote) || r == '\\' { // always backslashed 57 buf = append(buf, '\\') 58 buf = append(buf, byte(r)) 59 return buf 60 } 61 if ASCIIonly { 62 if r < utf8.RuneSelf && IsPrint(r) { 63 buf = append(buf, byte(r)) 64 return buf 65 } 66 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 67 n := utf8.EncodeRune(runeTmp[:], r) 68 buf = append(buf, runeTmp[:n]...) 69 return buf 70 } 71 switch r { 72 case '\a': 73 buf = append(buf, `\a`...) 74 case '\b': 75 buf = append(buf, `\b`...) 76 case '\f': 77 buf = append(buf, `\f`...) 78 case '\n': 79 buf = append(buf, `\n`...) 80 case '\r': 81 buf = append(buf, `\r`...) 82 case '\t': 83 buf = append(buf, `\t`...) 84 case '\v': 85 buf = append(buf, `\v`...) 86 default: 87 switch { 88 case r < ' ': 89 buf = append(buf, `\x`...) 90 buf = append(buf, lowerhex[byte(r)>>4]) 91 buf = append(buf, lowerhex[byte(r)&0xF]) 92 case r > utf8.MaxRune: 93 r = 0xFFFD 94 fallthrough 95 case r < 0x10000: 96 buf = append(buf, `\u`...) 97 for s := 12; s >= 0; s -= 4 { 98 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 99 } 100 default: 101 buf = append(buf, `\U`...) 102 for s := 28; s >= 0; s -= 4 { 103 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 104 } 105 } 106 } 107 return buf 108} 109 110// Quote returns a double-quoted Go string literal representing s. 
The 111// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 112// control characters and non-printable characters as defined by 113// IsPrint. 114func Quote(s string) string { 115 return quoteWith(s, '"', false, false) 116} 117 118// AppendQuote appends a double-quoted Go string literal representing s, 119// as generated by Quote, to dst and returns the extended buffer. 120func AppendQuote(dst []byte, s string) []byte { 121 return appendQuotedWith(dst, s, '"', false, false) 122} 123 124// QuoteToASCII returns a double-quoted Go string literal representing s. 125// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 126// non-ASCII characters and non-printable characters as defined by IsPrint. 127func QuoteToASCII(s string) string { 128 return quoteWith(s, '"', true, false) 129} 130 131// AppendQuoteToASCII appends a double-quoted Go string literal representing s, 132// as generated by QuoteToASCII, to dst and returns the extended buffer. 133func AppendQuoteToASCII(dst []byte, s string) []byte { 134 return appendQuotedWith(dst, s, '"', true, false) 135} 136 137// QuoteToGraphic returns a double-quoted Go string literal representing s. 138// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 139// non-ASCII characters and non-printable characters as defined by IsGraphic. 140func QuoteToGraphic(s string) string { 141 return quoteWith(s, '"', false, true) 142} 143 144// AppendQuoteToGraphic appends a double-quoted Go string literal representing s, 145// as generated by QuoteToGraphic, to dst and returns the extended buffer. 146func AppendQuoteToGraphic(dst []byte, s string) []byte { 147 return appendQuotedWith(dst, s, '"', false, true) 148} 149 150// QuoteRune returns a single-quoted Go character literal representing the 151// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) 152// for control characters and non-printable characters as defined by IsPrint. 
153func QuoteRune(r rune) string { 154 return quoteRuneWith(r, '\'', false, false) 155} 156 157// AppendQuoteRune appends a single-quoted Go character literal representing the rune, 158// as generated by QuoteRune, to dst and returns the extended buffer. 159func AppendQuoteRune(dst []byte, r rune) []byte { 160 return appendQuotedRuneWith(dst, r, '\'', false, false) 161} 162 163// QuoteRuneToASCII returns a single-quoted Go character literal representing 164// the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 165// \u0100) for non-ASCII characters and non-printable characters as defined 166// by IsPrint. 167func QuoteRuneToASCII(r rune) string { 168 return quoteRuneWith(r, '\'', true, false) 169} 170 171// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, 172// as generated by QuoteRuneToASCII, to dst and returns the extended buffer. 173func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { 174 return appendQuotedRuneWith(dst, r, '\'', true, false) 175} 176 177// QuoteRuneToGraphic returns a single-quoted Go character literal representing 178// the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 179// \u0100) for non-ASCII characters and non-printable characters as defined 180// by IsGraphic. 181func QuoteRuneToGraphic(r rune) string { 182 return quoteRuneWith(r, '\'', false, true) 183} 184 185// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, 186// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. 187func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 188 return appendQuotedRuneWith(dst, r, '\'', false, true) 189} 190 191// CanBackquote reports whether the string s can be represented 192// unchanged as a single-line backquoted string without control 193// characters other than tab. 
func CanBackquote(s string) bool {
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		i += size
		switch {
		case size > 1:
			// Multibyte runes are correctly encoded and assumed printable,
			// except the BOM, which is invisible and should not be quoted.
			if r == '\ufeff' {
				return false
			}
		case r == utf8.RuneError:
			// A single byte that is not valid UTF-8.
			return false
		case r < ' ' && r != '\t', r == '`', r == '\u007F':
			// Control characters other than tab, the backquote itself, and DEL.
			return false
		}
	}
	return true
}

// unhex converts the hexadecimal digit b (either case) to its value.
// ok is false, and v is zero, if b is not a hexadecimal digit.
func unhex(b byte) (v rune, ok bool) {
	switch {
	case '0' <= b && b <= '9':
		return rune(b - '0'), true
	case 'a' <= b && b <= 'f':
		return rune(b-'a') + 10, true
	case 'A' <= b && b <= 'F':
		return rune(b-'A') + 10, true
	}
	return 0, false
}

// UnquoteChar decodes the first character or byte in the escaped string
// or character literal represented by the string s.
// It returns four values:
//
//  1. value, the decoded Unicode code point or byte value;
//  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
//  3. tail, the remainder of the string after the character; and
//  4. an error that will be nil if the character is syntactically valid.
//
// The second argument, quote, specifies the type of literal being parsed
// and therefore which escaped quote character is permitted.
// If set to a single quote, it permits the sequence \' and disallows unescaped '.
// If set to a double quote, it permits \" and disallows unescaped ".
// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
241func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { 242 // easy cases 243 if len(s) == 0 { 244 err = ErrSyntax 245 return 246 } 247 switch c := s[0]; { 248 case c == quote && (quote == '\'' || quote == '"'): 249 err = ErrSyntax 250 return 251 case c >= utf8.RuneSelf: 252 r, size := utf8.DecodeRuneInString(s) 253 return r, true, s[size:], nil 254 case c != '\\': 255 return rune(s[0]), false, s[1:], nil 256 } 257 258 // hard case: c is backslash 259 if len(s) <= 1 { 260 err = ErrSyntax 261 return 262 } 263 c := s[1] 264 s = s[2:] 265 266 switch c { 267 case 'a': 268 value = '\a' 269 case 'b': 270 value = '\b' 271 case 'f': 272 value = '\f' 273 case 'n': 274 value = '\n' 275 case 'r': 276 value = '\r' 277 case 't': 278 value = '\t' 279 case 'v': 280 value = '\v' 281 case 'x', 'u', 'U': 282 n := 0 283 switch c { 284 case 'x': 285 n = 2 286 case 'u': 287 n = 4 288 case 'U': 289 n = 8 290 } 291 var v rune 292 if len(s) < n { 293 err = ErrSyntax 294 return 295 } 296 for j := 0; j < n; j++ { 297 x, ok := unhex(s[j]) 298 if !ok { 299 err = ErrSyntax 300 return 301 } 302 v = v<<4 | x 303 } 304 s = s[n:] 305 if c == 'x' { 306 // single-byte string, possibly not UTF-8 307 value = v 308 break 309 } 310 if v > utf8.MaxRune { 311 err = ErrSyntax 312 return 313 } 314 value = v 315 multibyte = true 316 case '0', '1', '2', '3', '4', '5', '6', '7': 317 v := rune(c) - '0' 318 if len(s) < 2 { 319 err = ErrSyntax 320 return 321 } 322 for j := 0; j < 2; j++ { // one digit already; two more 323 x := rune(s[j]) - '0' 324 if x < 0 || x > 7 { 325 err = ErrSyntax 326 return 327 } 328 v = (v << 3) | x 329 } 330 s = s[2:] 331 if v > 255 { 332 err = ErrSyntax 333 return 334 } 335 value = v 336 case '\\': 337 value = '\\' 338 case '\'', '"': 339 if c != quote { 340 err = ErrSyntax 341 return 342 } 343 value = rune(c) 344 default: 345 err = ErrSyntax 346 return 347 } 348 tail = s 349 return 350} 351 352// Unquote interprets s as a single-quoted, 
double-quoted, 353// or backquoted Go string literal, returning the string value 354// that s quotes. (If s is single-quoted, it would be a Go 355// character literal; Unquote returns the corresponding 356// one-character string.) 357func Unquote(s string) (string, error) { 358 n := len(s) 359 if n < 2 { 360 return "", ErrSyntax 361 } 362 quote := s[0] 363 if quote != s[n-1] { 364 return "", ErrSyntax 365 } 366 s = s[1 : n-1] 367 368 if quote == '`' { 369 if contains(s, '`') { 370 return "", ErrSyntax 371 } 372 if contains(s, '\r') { 373 // -1 because we know there is at least one \r to remove. 374 buf := make([]byte, 0, len(s)-1) 375 for i := 0; i < len(s); i++ { 376 if s[i] != '\r' { 377 buf = append(buf, s[i]) 378 } 379 } 380 return string(buf), nil 381 } 382 return s, nil 383 } 384 if quote != '"' && quote != '\'' { 385 return "", ErrSyntax 386 } 387 if contains(s, '\n') { 388 return "", ErrSyntax 389 } 390 391 // Is it trivial? Avoid allocation. 392 if !contains(s, '\\') && !contains(s, quote) { 393 switch quote { 394 case '"': 395 if utf8.ValidString(s) { 396 return s, nil 397 } 398 case '\'': 399 r, size := utf8.DecodeRuneInString(s) 400 if size == len(s) && (r != utf8.RuneError || size != 1) { 401 return s, nil 402 } 403 } 404 } 405 406 var runeTmp [utf8.UTFMax]byte 407 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. 408 for len(s) > 0 { 409 c, multibyte, ss, err := UnquoteChar(s, quote) 410 if err != nil { 411 return "", err 412 } 413 s = ss 414 if c < utf8.RuneSelf || !multibyte { 415 buf = append(buf, byte(c)) 416 } else { 417 n := utf8.EncodeRune(runeTmp[:], c) 418 buf = append(buf, runeTmp[:n]...) 419 } 420 if quote == '\'' && len(s) != 0 { 421 // single-quoted must be single character 422 return "", ErrSyntax 423 } 424 } 425 return string(buf), nil 426} 427 428// contains reports whether the string contains the byte c. 
429func contains(s string, c byte) bool { 430 return bytealg.IndexByteString(s, c) != -1 431} 432 433// bsearch16 returns the smallest i such that a[i] >= x. 434// If there is no such i, bsearch16 returns len(a). 435func bsearch16(a []uint16, x uint16) int { 436 i, j := 0, len(a) 437 for i < j { 438 h := i + (j-i)/2 439 if a[h] < x { 440 i = h + 1 441 } else { 442 j = h 443 } 444 } 445 return i 446} 447 448// bsearch32 returns the smallest i such that a[i] >= x. 449// If there is no such i, bsearch32 returns len(a). 450func bsearch32(a []uint32, x uint32) int { 451 i, j := 0, len(a) 452 for i < j { 453 h := i + (j-i)/2 454 if a[h] < x { 455 i = h + 1 456 } else { 457 j = h 458 } 459 } 460 return i 461} 462 463// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests 464// to give the same answer. It allows this package not to depend on unicode, 465// and therefore not pull in all the Unicode tables. If the linker were better 466// at tossing unused tables, we could get rid of this implementation. 467// That would be nice. 468 469// IsPrint reports whether the rune is defined as printable by Go, with 470// the same definition as unicode.IsPrint: letters, numbers, punctuation, 471// symbols and ASCII space. 472func IsPrint(r rune) bool { 473 // Fast check for Latin-1 474 if r <= 0xFF { 475 if 0x20 <= r && r <= 0x7E { 476 // All the ASCII is printable from space through DEL-1. 477 return true 478 } 479 if 0xA1 <= r && r <= 0xFF { 480 // Similarly for ¡ through ÿ... 481 return r != 0xAD // ...except for the bizarre soft hyphen. 482 } 483 return false 484 } 485 486 // Same algorithm, either on uint16 or uint32 value. 487 // First, find first i such that isPrint[i] >= x. 488 // This is the index of either the start or end of a pair that might span x. 489 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). 490 // If we find x in a range, make sure x is not in isNotPrint list. 
491 492 if 0 <= r && r < 1<<16 { 493 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 494 i := bsearch16(isPrint, rr) 495 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 496 return false 497 } 498 j := bsearch16(isNotPrint, rr) 499 return j >= len(isNotPrint) || isNotPrint[j] != rr 500 } 501 502 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 503 i := bsearch32(isPrint, rr) 504 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 505 return false 506 } 507 if r >= 0x20000 { 508 return true 509 } 510 r -= 0x10000 511 j := bsearch16(isNotPrint, uint16(r)) 512 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) 513} 514 515// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such 516// characters include letters, marks, numbers, punctuation, symbols, and 517// spaces, from categories L, M, N, P, S, and Zs. 518func IsGraphic(r rune) bool { 519 if IsPrint(r) { 520 return true 521 } 522 return isInGraphicList(r) 523} 524 525// isInGraphicList reports whether the rune is in the isGraphic list. This separation 526// from IsGraphic allows quoteWith to avoid two calls to IsPrint. 527// Should be called only if IsPrint fails. 528func isInGraphicList(r rune) bool { 529 // We know r must fit in 16 bits - see makeisprint.go. 530 if r > 0xFFFF { 531 return false 532 } 533 rr := uint16(r) 534 i := bsearch16(isGraphic, rr) 535 return i < len(isGraphic) && rr == isGraphic[i] 536} 537