// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5//go:generate go run makeisprint.go -output isprint.go
6
7package strconv
8
9import "unicode/utf8"
10
// lowerhex holds the sixteen lowercase hexadecimal digits; indexing it with a
// 4-bit value yields the digit for that value.
const lowerhex = "0123456789abcdef"
12
13func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
14	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
15}
16
17func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
18	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
19}
20
21func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
22	buf = append(buf, quote)
23	for width := 0; len(s) > 0; s = s[width:] {
24		r := rune(s[0])
25		width = 1
26		if r >= utf8.RuneSelf {
27			r, width = utf8.DecodeRuneInString(s)
28		}
29		if width == 1 && r == utf8.RuneError {
30			buf = append(buf, `\x`...)
31			buf = append(buf, lowerhex[s[0]>>4])
32			buf = append(buf, lowerhex[s[0]&0xF])
33			continue
34		}
35		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
36	}
37	buf = append(buf, quote)
38	return buf
39}
40
41func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
42	buf = append(buf, quote)
43	if !utf8.ValidRune(r) {
44		r = utf8.RuneError
45	}
46	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
47	buf = append(buf, quote)
48	return buf
49}
50
51func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
52	var runeTmp [utf8.UTFMax]byte
53	if r == rune(quote) || r == '\\' { // always backslashed
54		buf = append(buf, '\\')
55		buf = append(buf, byte(r))
56		return buf
57	}
58	if ASCIIonly {
59		if r < utf8.RuneSelf && IsPrint(r) {
60			buf = append(buf, byte(r))
61			return buf
62		}
63	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
64		n := utf8.EncodeRune(runeTmp[:], r)
65		buf = append(buf, runeTmp[:n]...)
66		return buf
67	}
68	switch r {
69	case '\a':
70		buf = append(buf, `\a`...)
71	case '\b':
72		buf = append(buf, `\b`...)
73	case '\f':
74		buf = append(buf, `\f`...)
75	case '\n':
76		buf = append(buf, `\n`...)
77	case '\r':
78		buf = append(buf, `\r`...)
79	case '\t':
80		buf = append(buf, `\t`...)
81	case '\v':
82		buf = append(buf, `\v`...)
83	default:
84		switch {
85		case r < ' ':
86			buf = append(buf, `\x`...)
87			buf = append(buf, lowerhex[byte(r)>>4])
88			buf = append(buf, lowerhex[byte(r)&0xF])
89		case r > utf8.MaxRune:
90			r = 0xFFFD
91			fallthrough
92		case r < 0x10000:
93			buf = append(buf, `\u`...)
94			for s := 12; s >= 0; s -= 4 {
95				buf = append(buf, lowerhex[r>>uint(s)&0xF])
96			}
97		default:
98			buf = append(buf, `\U`...)
99			for s := 28; s >= 0; s -= 4 {
100				buf = append(buf, lowerhex[r>>uint(s)&0xF])
101			}
102		}
103	}
104	return buf
105}
106
107// Quote returns a double-quoted Go string literal representing s. The
108// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
109// control characters and non-printable characters as defined by
110// IsPrint.
111func Quote(s string) string {
112	return quoteWith(s, '"', false, false)
113}
114
115// AppendQuote appends a double-quoted Go string literal representing s,
116// as generated by Quote, to dst and returns the extended buffer.
117func AppendQuote(dst []byte, s string) []byte {
118	return appendQuotedWith(dst, s, '"', false, false)
119}
120
121// QuoteToASCII returns a double-quoted Go string literal representing s.
122// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
123// non-ASCII characters and non-printable characters as defined by IsPrint.
124func QuoteToASCII(s string) string {
125	return quoteWith(s, '"', true, false)
126}
127
128// AppendQuoteToASCII appends a double-quoted Go string literal representing s,
129// as generated by QuoteToASCII, to dst and returns the extended buffer.
130func AppendQuoteToASCII(dst []byte, s string) []byte {
131	return appendQuotedWith(dst, s, '"', true, false)
132}
133
134// QuoteToGraphic returns a double-quoted Go string literal representing s.
135// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
136// non-ASCII characters and non-printable characters as defined by IsGraphic.
137func QuoteToGraphic(s string) string {
138	return quoteWith(s, '"', false, true)
139}
140
141// AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
142// as generated by QuoteToGraphic, to dst and returns the extended buffer.
143func AppendQuoteToGraphic(dst []byte, s string) []byte {
144	return appendQuotedWith(dst, s, '"', false, true)
145}
146
147// QuoteRune returns a single-quoted Go character literal representing the
148// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
149// for control characters and non-printable characters as defined by IsPrint.
150func QuoteRune(r rune) string {
151	return quoteRuneWith(r, '\'', false, false)
152}
153
154// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
155// as generated by QuoteRune, to dst and returns the extended buffer.
156func AppendQuoteRune(dst []byte, r rune) []byte {
157	return appendQuotedRuneWith(dst, r, '\'', false, false)
158}
159
160// QuoteRuneToASCII returns a single-quoted Go character literal representing
161// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
162// \u0100) for non-ASCII characters and non-printable characters as defined
163// by IsPrint.
164func QuoteRuneToASCII(r rune) string {
165	return quoteRuneWith(r, '\'', true, false)
166}
167
168// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
169// as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
170func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
171	return appendQuotedRuneWith(dst, r, '\'', true, false)
172}
173
174// QuoteRuneToGraphic returns a single-quoted Go character literal representing
175// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
176// \u0100) for non-ASCII characters and non-printable characters as defined
177// by IsGraphic.
178func QuoteRuneToGraphic(r rune) string {
179	return quoteRuneWith(r, '\'', false, true)
180}
181
182// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
183// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
184func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
185	return appendQuotedRuneWith(dst, r, '\'', false, true)
186}
187
// CanBackquote reports whether the string s can be represented
// unchanged as a single-line backquoted string without control
// characters other than tab.
//
// It returns false if s contains a backquote, DEL (U+007F), a byte order
// mark (U+FEFF), an invalid UTF-8 byte, or any character below space other
// than tab.
func CanBackquote(s string) bool {
	for len(s) > 0 {
		r, wid := utf8.DecodeRuneInString(s)
		s = s[wid:]
		switch {
		case wid > 1:
			// Correctly encoded multibyte runes are assumed printable,
			// except the BOM, which is invisible and should not be quoted.
			if r == '\ufeff' {
				return false
			}
		case r == utf8.RuneError:
			// Width 1 with RuneError means an invalid UTF-8 byte.
			return false
		case r < ' ' && r != '\t', r == '`', r == '\u007F':
			return false
		}
	}
	return true
}
210
// unhex converts the hexadecimal digit b (0-9, a-f or A-F) to its numeric
// value. ok is false, and v is zero, when b is not a hexadecimal digit.
func unhex(b byte) (v rune, ok bool) {
	c := rune(b)
	switch {
	case '0' <= c && c <= '9':
		return c - '0', true
	case 'a' <= c && c <= 'f':
		return c - 'a' + 10, true
	case 'A' <= c && c <= 'F':
		return c - 'A' + 10, true
	}
	return 0, false
}
223
224// UnquoteChar decodes the first character or byte in the escaped string
225// or character literal represented by the string s.
226// It returns four values:
227//
228//	1) value, the decoded Unicode code point or byte value;
229//	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
230//	3) tail, the remainder of the string after the character; and
231//	4) an error that will be nil if the character is syntactically valid.
232//
233// The second argument, quote, specifies the type of literal being parsed
234// and therefore which escaped quote character is permitted.
235// If set to a single quote, it permits the sequence \' and disallows unescaped '.
236// If set to a double quote, it permits \" and disallows unescaped ".
237// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
238func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
239	// easy cases
240	switch c := s[0]; {
241	case c == quote && (quote == '\'' || quote == '"'):
242		err = ErrSyntax
243		return
244	case c >= utf8.RuneSelf:
245		r, size := utf8.DecodeRuneInString(s)
246		return r, true, s[size:], nil
247	case c != '\\':
248		return rune(s[0]), false, s[1:], nil
249	}
250
251	// hard case: c is backslash
252	if len(s) <= 1 {
253		err = ErrSyntax
254		return
255	}
256	c := s[1]
257	s = s[2:]
258
259	switch c {
260	case 'a':
261		value = '\a'
262	case 'b':
263		value = '\b'
264	case 'f':
265		value = '\f'
266	case 'n':
267		value = '\n'
268	case 'r':
269		value = '\r'
270	case 't':
271		value = '\t'
272	case 'v':
273		value = '\v'
274	case 'x', 'u', 'U':
275		n := 0
276		switch c {
277		case 'x':
278			n = 2
279		case 'u':
280			n = 4
281		case 'U':
282			n = 8
283		}
284		var v rune
285		if len(s) < n {
286			err = ErrSyntax
287			return
288		}
289		for j := 0; j < n; j++ {
290			x, ok := unhex(s[j])
291			if !ok {
292				err = ErrSyntax
293				return
294			}
295			v = v<<4 | x
296		}
297		s = s[n:]
298		if c == 'x' {
299			// single-byte string, possibly not UTF-8
300			value = v
301			break
302		}
303		if v > utf8.MaxRune {
304			err = ErrSyntax
305			return
306		}
307		value = v
308		multibyte = true
309	case '0', '1', '2', '3', '4', '5', '6', '7':
310		v := rune(c) - '0'
311		if len(s) < 2 {
312			err = ErrSyntax
313			return
314		}
315		for j := 0; j < 2; j++ { // one digit already; two more
316			x := rune(s[j]) - '0'
317			if x < 0 || x > 7 {
318				err = ErrSyntax
319				return
320			}
321			v = (v << 3) | x
322		}
323		s = s[2:]
324		if v > 255 {
325			err = ErrSyntax
326			return
327		}
328		value = v
329	case '\\':
330		value = '\\'
331	case '\'', '"':
332		if c != quote {
333			err = ErrSyntax
334			return
335		}
336		value = rune(c)
337	default:
338		err = ErrSyntax
339		return
340	}
341	tail = s
342	return
343}
344
345// Unquote interprets s as a single-quoted, double-quoted,
346// or backquoted Go string literal, returning the string value
347// that s quotes.  (If s is single-quoted, it would be a Go
348// character literal; Unquote returns the corresponding
349// one-character string.)
350func Unquote(s string) (string, error) {
351	n := len(s)
352	if n < 2 {
353		return "", ErrSyntax
354	}
355	quote := s[0]
356	if quote != s[n-1] {
357		return "", ErrSyntax
358	}
359	s = s[1 : n-1]
360
361	if quote == '`' {
362		if contains(s, '`') {
363			return "", ErrSyntax
364		}
365		if contains(s, '\r') {
366			// -1 because we know there is at least one \r to remove.
367			buf := make([]byte, 0, len(s)-1)
368			for i := 0; i < len(s); i++ {
369				if s[i] != '\r' {
370					buf = append(buf, s[i])
371				}
372			}
373			return string(buf), nil
374		}
375		return s, nil
376	}
377	if quote != '"' && quote != '\'' {
378		return "", ErrSyntax
379	}
380	if contains(s, '\n') {
381		return "", ErrSyntax
382	}
383
384	// Is it trivial? Avoid allocation.
385	if !contains(s, '\\') && !contains(s, quote) {
386		switch quote {
387		case '"':
388			return s, nil
389		case '\'':
390			r, size := utf8.DecodeRuneInString(s)
391			if size == len(s) && (r != utf8.RuneError || size != 1) {
392				return s, nil
393			}
394		}
395	}
396
397	var runeTmp [utf8.UTFMax]byte
398	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
399	for len(s) > 0 {
400		c, multibyte, ss, err := UnquoteChar(s, quote)
401		if err != nil {
402			return "", err
403		}
404		s = ss
405		if c < utf8.RuneSelf || !multibyte {
406			buf = append(buf, byte(c))
407		} else {
408			n := utf8.EncodeRune(runeTmp[:], c)
409			buf = append(buf, runeTmp[:n]...)
410		}
411		if quote == '\'' && len(s) != 0 {
412			// single-quoted must be single character
413			return "", ErrSyntax
414		}
415	}
416	return string(buf), nil
417}
418
// contains reports whether the string contains the byte c.
// The scan is byte-wise, so c is compared against raw bytes, not runes.
func contains(s string, c byte) bool {
	for i := 0; i < len(s); i++ {
		if s[i] == c {
			return true
		}
	}
	return false
}
428
// bsearch16 returns the smallest i such that a[i] >= x.
// If there is no such i, bsearch16 returns len(a).
// The result is only meaningful when a is sorted in ascending order.
func bsearch16(a []uint16, x uint16) int {
	lo, hi := 0, len(a)
	// Invariant: a[k] < x for all k < lo, and a[k] >= x for all k >= hi.
	for lo < hi {
		mid := lo + (hi-lo)/2
		if a[mid] < x {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	return lo
}
443
// bsearch32 returns the smallest i such that a[i] >= x.
// If there is no such i, bsearch32 returns len(a).
// The result is only meaningful when a is sorted in ascending order.
func bsearch32(a []uint32, x uint32) int {
	lo, hi := 0, len(a)
	// Invariant: a[k] < x for all k < lo, and a[k] >= x for all k >= hi.
	for lo < hi {
		mid := lo + (hi-lo)/2
		if a[mid] < x {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	return lo
}
458
// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
// to give the same answer. It allows this package not to depend on unicode,
// and therefore not pull in all the Unicode tables. If the linker were better
// at tossing unused tables, we could get rid of this implementation.
// That would be nice.
464
465// IsPrint reports whether the rune is defined as printable by Go, with
466// the same definition as unicode.IsPrint: letters, numbers, punctuation,
467// symbols and ASCII space.
468func IsPrint(r rune) bool {
469	// Fast check for Latin-1
470	if r <= 0xFF {
471		if 0x20 <= r && r <= 0x7E {
472			// All the ASCII is printable from space through DEL-1.
473			return true
474		}
475		if 0xA1 <= r && r <= 0xFF {
476			// Similarly for ¡ through ÿ...
477			return r != 0xAD // ...except for the bizarre soft hyphen.
478		}
479		return false
480	}
481
482	// Same algorithm, either on uint16 or uint32 value.
483	// First, find first i such that isPrint[i] >= x.
484	// This is the index of either the start or end of a pair that might span x.
485	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
486	// If we find x in a range, make sure x is not in isNotPrint list.
487
488	if 0 <= r && r < 1<<16 {
489		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
490		i := bsearch16(isPrint, rr)
491		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
492			return false
493		}
494		j := bsearch16(isNotPrint, rr)
495		return j >= len(isNotPrint) || isNotPrint[j] != rr
496	}
497
498	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
499	i := bsearch32(isPrint, rr)
500	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
501		return false
502	}
503	if r >= 0x20000 {
504		return true
505	}
506	r -= 0x10000
507	j := bsearch16(isNotPrint, uint16(r))
508	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
509}
510
511// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
512// characters include letters, marks, numbers, punctuation, symbols, and
513// spaces, from categories L, M, N, P, S, and Zs.
514func IsGraphic(r rune) bool {
515	if IsPrint(r) {
516		return true
517	}
518	return isInGraphicList(r)
519}
520
521// isInGraphicList reports whether the rune is in the isGraphic list. This separation
522// from IsGraphic allows quoteWith to avoid two calls to IsPrint.
523// Should be called only if IsPrint fails.
524func isInGraphicList(r rune) bool {
525	// We know r must fit in 16 bits - see makeisprint.go.
526	if r > 0xFFFF {
527		return false
528	}
529	rr := uint16(r)
530	i := bsearch16(isGraphic, rr)
531	return i < len(isGraphic) && rr == isGraphic[i]
532}
533