1package scsu
2
3import (
4	"errors"
5	"fmt"
6	"io"
7	"unicode/utf16"
8	"unicode/utf8"
9)
10
11// A RuneSource represents a sequence of runes with look-behind support.
12//
13// RuneAt returns a rune at a given position.
14// The position starts at zero and is not guaranteed to be sequential, therefore
15// the only valid arguments are 0 or one of the previously returned as nextPos.
16// Supplying anything else results in an unspecified behaviour.
17// Returns io.EOF when there are no more runes left.
18// If a rune was read err must be nil (i.e. (rune, EOF) combination is not possible)
19type RuneSource interface {
20	RuneAt(pos int) (r rune, nextPos int, err error)
21}
22
23// StrictStringRuneSource does not tolerate invalid UTF-8 sequences.
24type StrictStringRuneSource string
25
26// StringRuneSource represents an UTF-8 string. Invalid sequences are replaced with
27// utf8.RuneError.
28type StringRuneSource string
29
30// SingleRuneSource that contains a single rune.
31type SingleRuneSource rune
32
33// RuneSlice is a RuneSource backed by []rune.
34type RuneSlice []rune
35
36type encoder struct {
37	scsu
38	wr      io.Writer // nil when encoding into a slice
39	out     []byte    // a buffer so that we can go back and replace SCU to SQU. In streaming mode does not need more than 5 bytes
40	written int
41
42	src     RuneSource
43	curRune rune
44	curErr  error
45	nextPos int
46	scuPos  int
47
48	nextWindow int
49}
50
51// Encoder can be used to encode a string into []byte.
52// Zero value is ready to use.
53type Encoder struct {
54	encoder
55}
56
57type Writer struct {
58	encoder
59}
60
61var (
62	ErrInvalidUTF8 = errors.New("invalid UTF-8")
63)
64
65func (s StrictStringRuneSource) RuneAt(pos int) (rune, int, error) {
66	if pos < len(s) {
67		r, size := utf8.DecodeRuneInString(string(s)[pos:])
68		if r == utf8.RuneError && size == 1 {
69			return 0, 0, ErrInvalidUTF8
70		}
71		return r, pos + size, nil
72	}
73	return 0, 0, io.EOF
74}
75
76func (s StringRuneSource) RuneAt(pos int) (rune, int, error) {
77	if pos < len(s) {
78		r, size := utf8.DecodeRuneInString(string(s)[pos:])
79		return r, pos + size, nil
80	}
81	return 0, 0, io.EOF
82}
83
84func (s RuneSlice) RuneAt(pos int) (rune, int, error) {
85	if pos < len(s) {
86		return s[pos], pos + 1, nil
87	}
88	return 0, 0, io.EOF
89}
90
91func (r SingleRuneSource) RuneAt(pos int) (rune, int, error) {
92	if pos == 0 {
93		return rune(r), 1, nil
94	}
95	return 0, 0, io.EOF
96}
97
98func NewWriter(wr io.Writer) *Writer {
99	e := new(Writer)
100	e.wr = wr
101	e.init()
102	return e
103}
104
105func (e *encoder) init() {
106	e.scsu.init()
107	e.nextWindow = 3
108	e.scuPos = -1
109}
110
111func (e *encoder) nextRune() {
112	e.curRune, e.nextPos, e.curErr = e.src.RuneAt(e.nextPos)
113}
114
115/** locate a window for a character given a table of offsets
116  @param ch - character
117  @param offsetTable - table of window offsets
118  @return true if the character fits a window from the table of windows */
119func (e *encoder) locateWindow(ch rune, offsetTable []int32) bool {
120	// always try the current window first
121	// if the character fits the current window
122	// just use the current window
123	if win := e.window; win != -1 {
124		if offset := offsetTable[win]; ch >= offset && ch < offset+0x80 {
125			return true
126		}
127	}
128
129	// try all windows in order
130	for win, offset := range offsetTable {
131		if ch >= offset && ch < offset+0x80 {
132			e.window = win
133			return true
134		}
135	}
136	// none found
137	return false
138}
139
140/** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
141func isAsciiCrLfOrTab(ch rune) bool {
142	return (ch >= 0x20 && ch <= 0x7F) || // ASCII
143		ch == 0x09 || ch == 0x0A || ch == 0x0D // CR/LF or TAB
144}
145
146/** output a run of characters in single byte mode
147    In single byte mode pass through characters in the ASCII range, but
148    quote characters overlapping with compression command codes. Runs
149    of characters fitting the current window are output as runs of bytes
150    in the range 0x80-0xFF.
151**/
152func (e *encoder) outputSingleByteRun() error {
153	win := e.window
154	for e.curErr == nil {
155		ch := e.curRune
156		// ASCII Letter, NUL, CR, LF and TAB are always passed through
157		if isAsciiCrLfOrTab(ch) || ch == 0 {
158			// pass through directly
159			e.out = append(e.out, byte(ch&0x7F))
160		} else if ch < 0x20 {
161			// All other control codes must be quoted
162			e.out = append(e.out, SQ0, byte(ch))
163		} else if dOffset := e.dynamicOffset[win]; ch >= dOffset && ch < dOffset+0x80 {
164			// Letters that fit the current dynamic window
165			ch -= dOffset
166			e.out = append(e.out, byte(ch|0x80))
167		} else {
168			// need to use some other compression mode for this
169			// character so we terminate this loop
170			break
171		}
172		err := e.flush()
173		if err != nil {
174			return err
175		}
176		e.nextRune()
177	}
178
179	return nil
180}
181
182/** quote a single character in single byte mode
183  Quoting a character (aka 'non-locking shift') gives efficient access
184  to characters that occur in isolation--usually punctuation characters.
185  When quoting a character from a dynamic window use 0x80 - 0xFF, when
186  quoting a character from a static window use 0x00-0x7f.
187  **/
188func (e *encoder) quoteSingleByte(ch rune) error {
189	// Output command byte followed by...
190	e.out = append(e.out, byte(SQ0+e.window))
191	if offset := e.dynamicOffset[e.window]; ch >= offset && ch < offset+0x80 {
192		// ... letter that fits the current dynamic window
193		ch -= offset
194		e.out = append(e.out, byte(ch|0x80))
195	} else if offset := staticOffset[e.window]; ch >= offset && ch < offset+0x80 {
196		// ... letter that fits the current static window
197		ch -= offset
198		e.out = append(e.out, byte(ch))
199	} else {
200		return fmt.Errorf("ch = %d not valid in quoteSingleByte. Internal Compressor Error", ch)
201	}
202
203	err := e.flush()
204	if err != nil {
205		return err
206	}
207	return nil
208}
209
210/** output a run of characters in Unicode mode
211  A run of Unicode mode consists of characters which are all in the
212  range of non-compressible characters or isolated occurrence
213  of any other characters. Characters in the range 0xE00-0xF2FF must
214  be quoted to avoid overlap with the Unicode mode compression command codes.
215  **/
216func (e *encoder) outputUnicodeRun() (err error) {
217	for e.curErr == nil {
218		r, n := e.curRune, e.nextPos
219		var r1 rune
220		var n1 int
221		if isCompressible(r) {
222			r1, n1, err = e.src.RuneAt(n)
223			if err != nil && err != io.EOF {
224				return
225			}
226			if err == nil && isCompressible(r1) {
227				// at least 2 characters are compressible
228				// break the run
229				break
230			}
231			if err != nil && e.scuPos != -1 && (r == 0 || isAsciiCrLfOrTab(r)) {
232				// The current character is the last one, it is a pass-though
233				// character (i.e. can be encoded with one byte without
234				// changing a window) and we have only produced one unicode
235				// character so far.
236				// The result will be an SQU followed by a unicode character,
237				// followed by a single byte.
238				// If we didn't break here it would be one byte longer
239				// (SCU followed by 2 unicode characters).
240				err = nil
241				break
242			}
243		}
244
245		// If we get here, the current character is only character
246		// left in the input or it is followed by a non-compressible
247		// character. In neither case do we gain by breaking the
248		// run, so we proceed to output the character.
249		if r < 0x10000 {
250			if r >= 0xE000 && r <= 0xF2FF {
251				// Characters in this range need to be escaped
252				e.out = append(e.out, UQU)
253			}
254			e.out = append(e.out, byte(r>>8), byte(r))
255		} else {
256			r1, r2 := utf16.EncodeRune(r)
257			e.out = append(e.out, byte(r1>>8), byte(r1), byte(r2>>8), byte(r2))
258		}
259		if n1 != 0 {
260			e.curRune, e.nextPos = r1, n1
261		} else {
262			e.nextRune()
263		}
264
265		if len(e.out)-e.scuPos > 3 {
266			err = e.flush()
267			e.scuPos = -1
268			if err != nil {
269				return
270			}
271		}
272	}
273
274	return
275}
276
277// redefine a window so it surrounds a given character value
278func (e *encoder) positionWindow(ch rune, fUnicodeMode bool) bool {
279	iWin := e.nextWindow % 8 // simple LRU
280	var iPosition uint16
281
282	// iPosition 0 is a reserved value
283	if ch < 0x80 {
284		panic("ch < 0x80")
285	}
286
287	// Check the fixed offsets
288	for i := 0; i < len(fixedOffset); i++ {
289		if offset := fixedOffset[i]; ch >= offset && ch < offset+0x80 {
290			iPosition = uint16(i)
291			break
292		}
293	}
294
295	extended := false
296	if iPosition != 0 {
297		e.dynamicOffset[iWin] = fixedOffset[iPosition]
298		iPosition += fixedThreshold
299	} else if ch < 0x3400 {
300		// calculate a window position command and set the offset
301		iPosition = uint16(ch >> 7)
302		e.dynamicOffset[iWin] = ch & 0xFF80
303	} else if ch < 0xE000 {
304		// attempt to place a window where none can go
305		return false
306	} else if ch <= 0xFFFF {
307		// calculate a window position command, accounting
308		// for the gap in position values, and set the offset
309		iPosition = uint16((ch - gapOffset) >> 7)
310		e.dynamicOffset[iWin] = ch & 0xFF80
311	} else {
312		// if we get here, the character is in the extended range.
313
314		iPosition = uint16((ch - 0x10000) >> 7)
315
316		iPosition |= uint16(iWin << 13)
317		e.dynamicOffset[iWin] = ch & 0x1FFF80
318		extended = true
319	}
320
321	if !extended {
322		// Outputting window definition command for the general cases
323		var b byte
324		if fUnicodeMode {
325			b = UD0
326		} else {
327			b = SD0
328		}
329		e.out = append(e.out, b+byte(iWin), byte(iPosition&0xFF))
330	} else {
331		// Output an extended window definition command
332		var b byte
333		if fUnicodeMode {
334			b = UDX
335		} else {
336			b = SDX
337		}
338		e.out = append(e.out, b, byte(iPosition>>8), byte(iPosition))
339	}
340	e.window = iWin
341	e.nextWindow++
342	return true
343}
344
345// Note, e.curRune must be compressible
346func (e *encoder) chooseWindow() error {
347	curCh, nextPos := e.curRune, e.nextPos
348	var err error
349
350	// Find a nearest compressible non-ASCII character which will be used
351	// to select a window.
352	// If we encounter a non-quotable unicode sequence (i.e. a single
353	// extended incompressible character or two BMP incompressible characters
354	// we stop and use the next character as it doesn't matter which window to
355	// select: all the characters will be ASCII and then we'll have to switch to
356	// unicode mode.
357	windowDecider := curCh
358	prevIncompressible := false
359	for c, p := curCh, nextPos; ; {
360		if c >= 0x80 {
361			if !isCompressible(c) {
362				if c >= 0x10000 || prevIncompressible {
363					break
364				}
365				prevIncompressible = true
366			} else {
367				windowDecider = c
368				break
369			}
370		} else {
371			prevIncompressible = false
372		}
373		c, p, err = e.src.RuneAt(p)
374		if err != nil {
375			if err == io.EOF {
376				err = nil
377				break
378			}
379			return err
380		}
381	}
382	prevWindow := e.window
383
384	// try to locate a dynamic window
385	if windowDecider < 0x80 || e.locateWindow(windowDecider, e.dynamicOffset[:]) {
386		// lookahead to use SQn instead of SCn for single
387		// character interruptions of runs in current window
388		if !e.unicodeMode {
389			ch2, n2, err := e.src.RuneAt(nextPos)
390			if err != nil && err != io.EOF {
391				return err
392			}
393			if err == nil && ch2 >= e.dynamicOffset[prevWindow] &&
394				ch2 < e.dynamicOffset[prevWindow]+0x80 {
395				err = e.quoteSingleByte(curCh)
396				if err != nil {
397					return err
398				}
399				e.curRune, e.nextPos = ch2, n2
400				e.window = prevWindow
401				return nil
402			}
403		}
404		var b byte
405		if e.unicodeMode {
406			b = UC0
407		} else {
408			b = SC0
409		}
410		e.out = append(e.out, b+byte(e.window))
411		e.unicodeMode = false
412		return nil
413	} else
414	// try to locate a static window
415	if !e.unicodeMode && e.locateWindow(windowDecider, staticOffset[:]) {
416		// static windows are not accessible from Unicode mode
417		err = e.quoteSingleByte(curCh)
418		if err != nil {
419			return err
420		}
421		e.nextRune()
422		e.window = prevWindow // restore current Window settings
423		return nil
424	} else // try to define a window around windowDecider
425	if e.positionWindow(windowDecider, e.unicodeMode) {
426		e.unicodeMode = false
427		return nil
428	}
429	return errors.New("could not select window. Internal Compressor Error")
430}
431
432func (e *encoder) encode(src RuneSource) error {
433	var err error
434	e.src, e.written, e.nextPos = src, 0, 0
435	e.nextRune()
436
437	for {
438		if e.unicodeMode {
439			err = e.outputUnicodeRun()
440			if err != nil {
441				break
442			}
443			if e.scuPos != -1 && len(e.out)-e.scuPos == 3 {
444				// for single character Unicode runs use quote
445				// go back and fix up the SCU to an SQU instead
446				e.out[e.scuPos] = SQU
447				e.scuPos = -1
448				err = e.flush()
449				if err != nil {
450					break
451				}
452				e.unicodeMode = false
453				continue
454			} else {
455				e.scuPos = -1
456				err = e.flush()
457				if err != nil {
458					break
459				}
460			}
461		} else {
462			err = e.outputSingleByteRun()
463			if err != nil {
464				break
465			}
466		}
467		if e.curErr != nil {
468			err = e.curErr
469			break
470		}
471		// note, if we were in unicode mode the character must be compressible
472		if isCompressible(e.curRune) {
473			err = e.chooseWindow()
474			if err != nil {
475				break
476			}
477		} else {
478			// switching to unicode
479			e.scuPos = len(e.out)
480			e.out = append(e.out, SCU)
481			e.unicodeMode = true
482		}
483	}
484
485	if errors.Is(err, io.EOF) {
486		err = e.flush()
487	}
488
489	e.src = nil // do not hold the reference
490
491	return err
492}
493
494// WriteString encodes the given string and writes the binary representation
495// into the writer. Invalid UTF-8 sequences are replaced with utf8.RuneError.
496// Returns the number of bytes written and an error (if any).
497func (w *Writer) WriteString(in string) (int, error) {
498	return w.WriteRunes(StringRuneSource(in))
499}
500
501// WriteRune encodes the given rune and writes the binary representation
502// into the writer.
503// Returns the number of bytes written and an error (if any).
504func (w *Writer) WriteRune(r rune) (int, error) {
505	return w.WriteRunes(SingleRuneSource(r))
506}
507
508func (w *Writer) WriteRunes(src RuneSource) (int, error) {
509	err := w.encode(src)
510	return w.written, err
511}
512
513func (e *encoder) flush() error {
514	if e.wr != nil && len(e.out) > 0 {
515		n, err := e.wr.Write(e.out)
516		e.written += n
517		e.out = e.out[:0]
518		return err
519	}
520
521	return nil
522}
523
524// Reset discards the encoder's state and makes it equivalent to the result of NewEncoder
525// called with w allowing to re-use the instance.
526func (w *Writer) Reset(out io.Writer) {
527	w.wr = out
528	w.out = w.out[:0]
529	w.reset()
530	w.init()
531}
532
533// Encode the given RuneSource and append to dst. If dst does not have enough capacity
534// it will be re-allocated. It can be nil.
535// Not goroutine-safe. The instance can be re-used after.
536func (e *Encoder) Encode(src RuneSource, dst []byte) ([]byte, error) {
537	e.reset()
538	e.init()
539	e.out = dst
540	err := e.encode(src)
541	out := e.out
542	e.out = nil
543	return out, err
544}
545
546// Encode src and append to dst. If dst does not have enough capacity
547// it will be re-allocated. It can be nil.
548func Encode(src string, dst []byte) ([]byte, error) {
549	var e Encoder
550	return e.Encode(StringRuneSource(src), dst)
551}
552
553// EncodeStrict is the same as Encode, however it stops and returns ErrInvalidUTF8
554// if an invalid UTF-8 sequence is encountered rather than replacing it with
555// utf8.RuneError.
556func EncodeStrict(src string, dst []byte) ([]byte, error) {
557	var e Encoder
558	return e.Encode(StrictStringRuneSource(src), dst)
559}
560