1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:generate go run makeisprint.go -output isprint.go 6 7package strconv 8 9import "unicode/utf8" 10 11const lowerhex = "0123456789abcdef" 12 13func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { 14 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) 15} 16 17func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { 18 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) 19} 20 21func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { 22 buf = append(buf, quote) 23 for width := 0; len(s) > 0; s = s[width:] { 24 r := rune(s[0]) 25 width = 1 26 if r >= utf8.RuneSelf { 27 r, width = utf8.DecodeRuneInString(s) 28 } 29 if width == 1 && r == utf8.RuneError { 30 buf = append(buf, `\x`...) 31 buf = append(buf, lowerhex[s[0]>>4]) 32 buf = append(buf, lowerhex[s[0]&0xF]) 33 continue 34 } 35 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 36 } 37 buf = append(buf, quote) 38 return buf 39} 40 41func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 42 buf = append(buf, quote) 43 if !utf8.ValidRune(r) { 44 r = utf8.RuneError 45 } 46 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 47 buf = append(buf, quote) 48 return buf 49} 50 51func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 52 var runeTmp [utf8.UTFMax]byte 53 if r == rune(quote) || r == '\\' { // always backslashed 54 buf = append(buf, '\\') 55 buf = append(buf, byte(r)) 56 return buf 57 } 58 if ASCIIonly { 59 if r < utf8.RuneSelf && IsPrint(r) { 60 buf = append(buf, byte(r)) 61 return buf 62 } 63 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 64 n := utf8.EncodeRune(runeTmp[:], r) 65 buf = 
append(buf, runeTmp[:n]...) 66 return buf 67 } 68 switch r { 69 case '\a': 70 buf = append(buf, `\a`...) 71 case '\b': 72 buf = append(buf, `\b`...) 73 case '\f': 74 buf = append(buf, `\f`...) 75 case '\n': 76 buf = append(buf, `\n`...) 77 case '\r': 78 buf = append(buf, `\r`...) 79 case '\t': 80 buf = append(buf, `\t`...) 81 case '\v': 82 buf = append(buf, `\v`...) 83 default: 84 switch { 85 case r < ' ': 86 buf = append(buf, `\x`...) 87 buf = append(buf, lowerhex[byte(r)>>4]) 88 buf = append(buf, lowerhex[byte(r)&0xF]) 89 case r > utf8.MaxRune: 90 r = 0xFFFD 91 fallthrough 92 case r < 0x10000: 93 buf = append(buf, `\u`...) 94 for s := 12; s >= 0; s -= 4 { 95 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 96 } 97 default: 98 buf = append(buf, `\U`...) 99 for s := 28; s >= 0; s -= 4 { 100 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 101 } 102 } 103 } 104 return buf 105} 106 107// Quote returns a double-quoted Go string literal representing s. The 108// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 109// control characters and non-printable characters as defined by 110// IsPrint. 111func Quote(s string) string { 112 return quoteWith(s, '"', false, false) 113} 114 115// AppendQuote appends a double-quoted Go string literal representing s, 116// as generated by Quote, to dst and returns the extended buffer. 117func AppendQuote(dst []byte, s string) []byte { 118 return appendQuotedWith(dst, s, '"', false, false) 119} 120 121// QuoteToASCII returns a double-quoted Go string literal representing s. 122// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 123// non-ASCII characters and non-printable characters as defined by IsPrint. 124func QuoteToASCII(s string) string { 125 return quoteWith(s, '"', true, false) 126} 127 128// AppendQuoteToASCII appends a double-quoted Go string literal representing s, 129// as generated by QuoteToASCII, to dst and returns the extended buffer. 
func AppendQuoteToASCII(dst []byte, s string) []byte {
	return appendQuotedWith(dst, s, '"', true, false)
}

// QuoteToGraphic returns a double-quoted Go string literal representing s.
// Characters that are printable per IsPrint, or that are Unicode graphic
// characters per IsGraphic, are left unchanged; all other characters are
// written using Go escape sequences (\t, \n, \xFF, \u0100). Unlike
// QuoteToASCII, non-ASCII characters are not escaped merely for being
// non-ASCII.
func QuoteToGraphic(s string) string {
	return quoteWith(s, '"', false, true)
}

// AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
// as generated by QuoteToGraphic, to dst and returns the extended buffer.
func AppendQuoteToGraphic(dst []byte, s string) []byte {
	return appendQuotedWith(dst, s, '"', false, true)
}

// QuoteRune returns a single-quoted Go character literal representing the
// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
// for control characters and non-printable characters as defined by IsPrint.
func QuoteRune(r rune) string {
	return quoteRuneWith(r, '\'', false, false)
}

// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
// as generated by QuoteRune, to dst and returns the extended buffer.
func AppendQuoteRune(dst []byte, r rune) []byte {
	return appendQuotedRuneWith(dst, r, '\'', false, false)
}

// QuoteRuneToASCII returns a single-quoted Go character literal representing
// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
// \u0100) for non-ASCII characters and non-printable characters as defined
// by IsPrint.
func QuoteRuneToASCII(r rune) string {
	return quoteRuneWith(r, '\'', true, false)
}

// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
// as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
170func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { 171 return appendQuotedRuneWith(dst, r, '\'', true, false) 172} 173 174// QuoteRuneToGraphic returns a single-quoted Go character literal representing 175// the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 176// \u0100) for non-ASCII characters and non-printable characters as defined 177// by IsGraphic. 178func QuoteRuneToGraphic(r rune) string { 179 return quoteRuneWith(r, '\'', false, true) 180} 181 182// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, 183// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. 184func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 185 return appendQuotedRuneWith(dst, r, '\'', false, true) 186} 187 188// CanBackquote reports whether the string s can be represented 189// unchanged as a single-line backquoted string without control 190// characters other than tab. 191func CanBackquote(s string) bool { 192 for len(s) > 0 { 193 r, wid := utf8.DecodeRuneInString(s) 194 s = s[wid:] 195 if wid > 1 { 196 if r == '\ufeff' { 197 return false // BOMs are invisible and should not be quoted. 198 } 199 continue // All other multibyte runes are correctly encoded and assumed printable. 200 } 201 if r == utf8.RuneError { 202 return false 203 } 204 if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { 205 return false 206 } 207 } 208 return true 209} 210 211func unhex(b byte) (v rune, ok bool) { 212 c := rune(b) 213 switch { 214 case '0' <= c && c <= '9': 215 return c - '0', true 216 case 'a' <= c && c <= 'f': 217 return c - 'a' + 10, true 218 case 'A' <= c && c <= 'F': 219 return c - 'A' + 10, true 220 } 221 return 222} 223 224// UnquoteChar decodes the first character or byte in the escaped string 225// or character literal represented by the string s. 
226// It returns four values: 227// 228// 1) value, the decoded Unicode code point or byte value; 229// 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 230// 3) tail, the remainder of the string after the character; and 231// 4) an error that will be nil if the character is syntactically valid. 232// 233// The second argument, quote, specifies the type of literal being parsed 234// and therefore which escaped quote character is permitted. 235// If set to a single quote, it permits the sequence \' and disallows unescaped '. 236// If set to a double quote, it permits \" and disallows unescaped ". 237// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. 238func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { 239 // easy cases 240 switch c := s[0]; { 241 case c == quote && (quote == '\'' || quote == '"'): 242 err = ErrSyntax 243 return 244 case c >= utf8.RuneSelf: 245 r, size := utf8.DecodeRuneInString(s) 246 return r, true, s[size:], nil 247 case c != '\\': 248 return rune(s[0]), false, s[1:], nil 249 } 250 251 // hard case: c is backslash 252 if len(s) <= 1 { 253 err = ErrSyntax 254 return 255 } 256 c := s[1] 257 s = s[2:] 258 259 switch c { 260 case 'a': 261 value = '\a' 262 case 'b': 263 value = '\b' 264 case 'f': 265 value = '\f' 266 case 'n': 267 value = '\n' 268 case 'r': 269 value = '\r' 270 case 't': 271 value = '\t' 272 case 'v': 273 value = '\v' 274 case 'x', 'u', 'U': 275 n := 0 276 switch c { 277 case 'x': 278 n = 2 279 case 'u': 280 n = 4 281 case 'U': 282 n = 8 283 } 284 var v rune 285 if len(s) < n { 286 err = ErrSyntax 287 return 288 } 289 for j := 0; j < n; j++ { 290 x, ok := unhex(s[j]) 291 if !ok { 292 err = ErrSyntax 293 return 294 } 295 v = v<<4 | x 296 } 297 s = s[n:] 298 if c == 'x' { 299 // single-byte string, possibly not UTF-8 300 value = v 301 break 302 } 303 if v > utf8.MaxRune { 304 err = 
ErrSyntax 305 return 306 } 307 value = v 308 multibyte = true 309 case '0', '1', '2', '3', '4', '5', '6', '7': 310 v := rune(c) - '0' 311 if len(s) < 2 { 312 err = ErrSyntax 313 return 314 } 315 for j := 0; j < 2; j++ { // one digit already; two more 316 x := rune(s[j]) - '0' 317 if x < 0 || x > 7 { 318 err = ErrSyntax 319 return 320 } 321 v = (v << 3) | x 322 } 323 s = s[2:] 324 if v > 255 { 325 err = ErrSyntax 326 return 327 } 328 value = v 329 case '\\': 330 value = '\\' 331 case '\'', '"': 332 if c != quote { 333 err = ErrSyntax 334 return 335 } 336 value = rune(c) 337 default: 338 err = ErrSyntax 339 return 340 } 341 tail = s 342 return 343} 344 345// Unquote interprets s as a single-quoted, double-quoted, 346// or backquoted Go string literal, returning the string value 347// that s quotes. (If s is single-quoted, it would be a Go 348// character literal; Unquote returns the corresponding 349// one-character string.) 350func Unquote(s string) (string, error) { 351 n := len(s) 352 if n < 2 { 353 return "", ErrSyntax 354 } 355 quote := s[0] 356 if quote != s[n-1] { 357 return "", ErrSyntax 358 } 359 s = s[1 : n-1] 360 361 if quote == '`' { 362 if contains(s, '`') { 363 return "", ErrSyntax 364 } 365 if contains(s, '\r') { 366 // -1 because we know there is at least one \r to remove. 367 buf := make([]byte, 0, len(s)-1) 368 for i := 0; i < len(s); i++ { 369 if s[i] != '\r' { 370 buf = append(buf, s[i]) 371 } 372 } 373 return string(buf), nil 374 } 375 return s, nil 376 } 377 if quote != '"' && quote != '\'' { 378 return "", ErrSyntax 379 } 380 if contains(s, '\n') { 381 return "", ErrSyntax 382 } 383 384 // Is it trivial? Avoid allocation. 
385 if !contains(s, '\\') && !contains(s, quote) { 386 switch quote { 387 case '"': 388 return s, nil 389 case '\'': 390 r, size := utf8.DecodeRuneInString(s) 391 if size == len(s) && (r != utf8.RuneError || size != 1) { 392 return s, nil 393 } 394 } 395 } 396 397 var runeTmp [utf8.UTFMax]byte 398 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. 399 for len(s) > 0 { 400 c, multibyte, ss, err := UnquoteChar(s, quote) 401 if err != nil { 402 return "", err 403 } 404 s = ss 405 if c < utf8.RuneSelf || !multibyte { 406 buf = append(buf, byte(c)) 407 } else { 408 n := utf8.EncodeRune(runeTmp[:], c) 409 buf = append(buf, runeTmp[:n]...) 410 } 411 if quote == '\'' && len(s) != 0 { 412 // single-quoted must be single character 413 return "", ErrSyntax 414 } 415 } 416 return string(buf), nil 417} 418 419// contains reports whether the string contains the byte c. 420func contains(s string, c byte) bool { 421 for i := 0; i < len(s); i++ { 422 if s[i] == c { 423 return true 424 } 425 } 426 return false 427} 428 429// bsearch16 returns the smallest i such that a[i] >= x. 430// If there is no such i, bsearch16 returns len(a). 431func bsearch16(a []uint16, x uint16) int { 432 i, j := 0, len(a) 433 for i < j { 434 h := i + (j-i)/2 435 if a[h] < x { 436 i = h + 1 437 } else { 438 j = h 439 } 440 } 441 return i 442} 443 444// bsearch32 returns the smallest i such that a[i] >= x. 445// If there is no such i, bsearch32 returns len(a). 446func bsearch32(a []uint32, x uint32) int { 447 i, j := 0, len(a) 448 for i < j { 449 h := i + (j-i)/2 450 if a[h] < x { 451 i = h + 1 452 } else { 453 j = h 454 } 455 } 456 return i 457} 458 459// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests 460// to give the same answer. It allows this package not to depend on unicode, 461// and therefore not pull in all the Unicode tables. If the linker were better 462// at tossing unused tables, we could get rid of this implementation. 
463// That would be nice. 464 465// IsPrint reports whether the rune is defined as printable by Go, with 466// the same definition as unicode.IsPrint: letters, numbers, punctuation, 467// symbols and ASCII space. 468func IsPrint(r rune) bool { 469 // Fast check for Latin-1 470 if r <= 0xFF { 471 if 0x20 <= r && r <= 0x7E { 472 // All the ASCII is printable from space through DEL-1. 473 return true 474 } 475 if 0xA1 <= r && r <= 0xFF { 476 // Similarly for ¡ through ÿ... 477 return r != 0xAD // ...except for the bizarre soft hyphen. 478 } 479 return false 480 } 481 482 // Same algorithm, either on uint16 or uint32 value. 483 // First, find first i such that isPrint[i] >= x. 484 // This is the index of either the start or end of a pair that might span x. 485 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). 486 // If we find x in a range, make sure x is not in isNotPrint list. 487 488 if 0 <= r && r < 1<<16 { 489 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 490 i := bsearch16(isPrint, rr) 491 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 492 return false 493 } 494 j := bsearch16(isNotPrint, rr) 495 return j >= len(isNotPrint) || isNotPrint[j] != rr 496 } 497 498 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 499 i := bsearch32(isPrint, rr) 500 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 501 return false 502 } 503 if r >= 0x20000 { 504 return true 505 } 506 r -= 0x10000 507 j := bsearch16(isNotPrint, uint16(r)) 508 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) 509} 510 511// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such 512// characters include letters, marks, numbers, punctuation, symbols, and 513// spaces, from categories L, M, N, P, S, and Zs. 514func IsGraphic(r rune) bool { 515 if IsPrint(r) { 516 return true 517 } 518 return isInGraphicList(r) 519} 520 521// isInGraphicList reports whether the rune is in the isGraphic list. 
This separation 522// from IsGraphic allows quoteWith to avoid two calls to IsPrint. 523// Should be called only if IsPrint fails. 524func isInGraphicList(r rune) bool { 525 // We know r must fit in 16 bits - see makeisprint.go. 526 if r > 0xFFFF { 527 return false 528 } 529 rr := uint16(r) 530 i := bsearch16(isGraphic, rr) 531 return i < len(isGraphic) && rr == isGraphic[i] 532} 533