1package scsu 2 3import ( 4 "errors" 5 "fmt" 6 "io" 7 "unicode/utf16" 8 "unicode/utf8" 9) 10 11// A RuneSource represents a sequence of runes with look-behind support. 12// 13// RuneAt returns a rune at a given position. 14// The position starts at zero and is not guaranteed to be sequential, therefore 15// the only valid arguments are 0 or one of the previously returned as nextPos. 16// Supplying anything else results in an unspecified behaviour. 17// Returns io.EOF when there are no more runes left. 18// If a rune was read err must be nil (i.e. (rune, EOF) combination is not possible) 19type RuneSource interface { 20 RuneAt(pos int) (r rune, nextPos int, err error) 21} 22 23// StrictStringRuneSource does not tolerate invalid UTF-8 sequences. 24type StrictStringRuneSource string 25 26// StringRuneSource represents an UTF-8 string. Invalid sequences are replaced with 27// utf8.RuneError. 28type StringRuneSource string 29 30// SingleRuneSource that contains a single rune. 31type SingleRuneSource rune 32 33// RuneSlice is a RuneSource backed by []rune. 34type RuneSlice []rune 35 36type encoder struct { 37 scsu 38 wr io.Writer // nil when encoding into a slice 39 out []byte // a buffer so that we can go back and replace SCU to SQU. In streaming mode does not need more than 5 bytes 40 written int 41 42 src RuneSource 43 curRune rune 44 curErr error 45 nextPos int 46 scuPos int 47 48 nextWindow int 49} 50 51// Encoder can be used to encode a string into []byte. 52// Zero value is ready to use. 53type Encoder struct { 54 encoder 55} 56 57type Writer struct { 58 encoder 59} 60 61var ( 62 ErrInvalidUTF8 = errors.New("invalid UTF-8") 63) 64 65func (s StrictStringRuneSource) RuneAt(pos int) (rune, int, error) { 66 if pos < len(s) { 67 r, size := utf8.DecodeRuneInString(string(s)[pos:]) 68 if r == utf8.RuneError && size == 1 { 69 return 0, 0, ErrInvalidUTF8 70 } 71 return r, pos + size, nil 72 } 73 return 0, 0, io.EOF 74} 75 76func (s StringRuneSource) RuneAt(pos int) (rune, int, error) { 77 if pos < len(s) { 78 r, size := utf8.DecodeRuneInString(string(s)[pos:]) 79 return r, pos + size, nil 80 } 81 return 0, 0, io.EOF 82} 83 84func (s RuneSlice) RuneAt(pos int) (rune, int, error) { 85 if pos < len(s) { 86 return s[pos], pos + 1, nil 87 } 88 return 0, 0, io.EOF 89} 90 91func (r SingleRuneSource) RuneAt(pos int) (rune, int, error) { 92 if pos == 0 { 93 return rune(r), 1, nil 94 } 95 return 0, 0, io.EOF 96} 97 98func NewWriter(wr io.Writer) *Writer { 99 e := new(Writer) 100 e.wr = wr 101 e.init() 102 return e 103} 104 105func (e *encoder) init() { 106 e.scsu.init() 107 e.nextWindow = 3 108 e.scuPos = -1 109} 110 111func (e *encoder) nextRune() { 112 e.curRune, e.nextPos, e.curErr = e.src.RuneAt(e.nextPos) 113} 114 115/** locate a window for a character given a table of offsets 116 @param ch - character 117 @param offsetTable - table of window offsets 118 @return true if the character fits a window from the table of windows */ 119func (e *encoder) locateWindow(ch rune, offsetTable []int32) bool { 120 // always try the current window first 121 // if the character fits the current window 122 // just use the current window 123 if win := e.window; win != -1 { 124 if offset := offsetTable[win]; ch >= offset && ch < offset+0x80 { 125 return true 126 } 127 } 128 129 // try all windows in order 130 for win, offset := range offsetTable { 131 if ch >= offset && ch < offset+0x80 { 132 e.window = win 133 return true 134 } 135 } 136 // none found 137 return false 138} 139 140/** returns true if the character is ASCII, but not a control other than CR, LF and TAB */ 141func isAsciiCrLfOrTab(ch rune) bool { 142 return (ch >= 0x20 && ch <= 0x7F) || // ASCII 143 ch == 0x09 || ch == 0x0A || ch == 0x0D // CR/LF or TAB 144} 145 146/** output a run of characters in single byte mode 147 In single byte mode pass through characters in the ASCII range, but 148 quote characters overlapping with compression command codes. Runs 149 of characters fitting the current window are output as runs of bytes 150 in the range 0x80-0xFF. 151**/ 152func (e *encoder) outputSingleByteRun() error { 153 win := e.window 154 for e.curErr == nil { 155 ch := e.curRune 156 // ASCII Letter, NUL, CR, LF and TAB are always passed through 157 if isAsciiCrLfOrTab(ch) || ch == 0 { 158 // pass through directly 159 e.out = append(e.out, byte(ch&0x7F)) 160 } else if ch < 0x20 { 161 // All other control codes must be quoted 162 e.out = append(e.out, SQ0, byte(ch)) 163 } else if dOffset := e.dynamicOffset[win]; ch >= dOffset && ch < dOffset+0x80 { 164 // Letters that fit the current dynamic window 165 ch -= dOffset 166 e.out = append(e.out, byte(ch|0x80)) 167 } else { 168 // need to use some other compression mode for this 169 // character so we terminate this loop 170 break 171 } 172 err := e.flush() 173 if err != nil { 174 return err 175 } 176 e.nextRune() 177 } 178 179 return nil 180} 181 182/** quote a single character in single byte mode 183 Quoting a character (aka 'non-locking shift') gives efficient access 184 to characters that occur in isolation--usually punctuation characters. 185 When quoting a character from a dynamic window use 0x80 - 0xFF, when 186 quoting a character from a static window use 0x00-0x7f. 187 **/ 188func (e *encoder) quoteSingleByte(ch rune) error { 189 // Output command byte followed by... 190 e.out = append(e.out, byte(SQ0+e.window)) 191 if offset := e.dynamicOffset[e.window]; ch >= offset && ch < offset+0x80 { 192 // ... letter that fits the current dynamic window 193 ch -= offset 194 e.out = append(e.out, byte(ch|0x80)) 195 } else if offset := staticOffset[e.window]; ch >= offset && ch < offset+0x80 { 196 // ... letter that fits the current static window 197 ch -= offset 198 e.out = append(e.out, byte(ch)) 199 } else { 200 return fmt.Errorf("ch = %d not valid in quoteSingleByte. Internal Compressor Error", ch) 201 } 202 203 err := e.flush() 204 if err != nil { 205 return err 206 } 207 return nil 208} 209 210/** output a run of characters in Unicode mode 211 A run of Unicode mode consists of characters which are all in the 212 range of non-compressible characters or isolated occurrence 213 of any other characters. Characters in the range 0xE00-0xF2FF must 214 be quoted to avoid overlap with the Unicode mode compression command codes. 215 **/ 216func (e *encoder) outputUnicodeRun() (err error) { 217 for e.curErr == nil { 218 r, n := e.curRune, e.nextPos 219 var r1 rune 220 var n1 int 221 if isCompressible(r) { 222 r1, n1, err = e.src.RuneAt(n) 223 if err != nil && err != io.EOF { 224 return 225 } 226 if err == nil && isCompressible(r1) { 227 // at least 2 characters are compressible 228 // break the run 229 break 230 } 231 if err != nil && e.scuPos != -1 && (r == 0 || isAsciiCrLfOrTab(r)) { 232 // The current character is the last one, it is a pass-though 233 // character (i.e. can be encoded with one byte without 234 // changing a window) and we have only produced one unicode 235 // character so far. 236 // The result will be an SQU followed by a unicode character, 237 // followed by a single byte. 238 // If we didn't break here it would be one byte longer 239 // (SCU followed by 2 unicode characters). 240 err = nil 241 break 242 } 243 } 244 245 // If we get here, the current character is only character 246 // left in the input or it is followed by a non-compressible 247 // character. In neither case do we gain by breaking the 248 // run, so we proceed to output the character. 249 if r < 0x10000 { 250 if r >= 0xE000 && r <= 0xF2FF { 251 // Characters in this range need to be escaped 252 e.out = append(e.out, UQU) 253 } 254 e.out = append(e.out, byte(r>>8), byte(r)) 255 } else { 256 r1, r2 := utf16.EncodeRune(r) 257 e.out = append(e.out, byte(r1>>8), byte(r1), byte(r2>>8), byte(r2)) 258 } 259 if n1 != 0 { 260 e.curRune, e.nextPos = r1, n1 261 } else { 262 e.nextRune() 263 } 264 265 if len(e.out)-e.scuPos > 3 { 266 err = e.flush() 267 e.scuPos = -1 268 if err != nil { 269 return 270 } 271 } 272 } 273 274 return 275} 276 277// redefine a window so it surrounds a given character value 278func (e *encoder) positionWindow(ch rune, fUnicodeMode bool) bool { 279 iWin := e.nextWindow % 8 // simple LRU 280 var iPosition uint16 281 282 // iPosition 0 is a reserved value 283 if ch < 0x80 { 284 panic("ch < 0x80") 285 } 286 287 // Check the fixed offsets 288 for i := 0; i < len(fixedOffset); i++ { 289 if offset := fixedOffset[i]; ch >= offset && ch < offset+0x80 { 290 iPosition = uint16(i) 291 break 292 } 293 } 294 295 extended := false 296 if iPosition != 0 { 297 e.dynamicOffset[iWin] = fixedOffset[iPosition] 298 iPosition += fixedThreshold 299 } else if ch < 0x3400 { 300 // calculate a window position command and set the offset 301 iPosition = uint16(ch >> 7) 302 e.dynamicOffset[iWin] = ch & 0xFF80 303 } else if ch < 0xE000 { 304 // attempt to place a window where none can go 305 return false 306 } else if ch <= 0xFFFF { 307 // calculate a window position command, accounting 308 // for the gap in position values, and set the offset 309 iPosition = uint16((ch - gapOffset) >> 7) 310 e.dynamicOffset[iWin] = ch & 0xFF80 311 } else { 312 // if we get here, the character is in the extended range. 313 314 iPosition = uint16((ch - 0x10000) >> 7) 315 316 iPosition |= uint16(iWin << 13) 317 e.dynamicOffset[iWin] = ch & 0x1FFF80 318 extended = true 319 } 320 321 if !extended { 322 // Outputting window definition command for the general cases 323 var b byte 324 if fUnicodeMode { 325 b = UD0 326 } else { 327 b = SD0 328 } 329 e.out = append(e.out, b+byte(iWin), byte(iPosition&0xFF)) 330 } else { 331 // Output an extended window definition command 332 var b byte 333 if fUnicodeMode { 334 b = UDX 335 } else { 336 b = SDX 337 } 338 e.out = append(e.out, b, byte(iPosition>>8), byte(iPosition)) 339 } 340 e.window = iWin 341 e.nextWindow++ 342 return true 343} 344 345// Note, e.curRune must be compressible 346func (e *encoder) chooseWindow() error { 347 curCh, nextPos := e.curRune, e.nextPos 348 var err error 349 350 // Find a nearest compressible non-ASCII character which will be used 351 // to select a window. 352 // If we encounter a non-quotable unicode sequence (i.e. a single 353 // extended incompressible character or two BMP incompressible characters 354 // we stop and use the next character as it doesn't matter which window to 355 // select: all the characters will be ASCII and then we'll have to switch to 356 // unicode mode. 357 windowDecider := curCh 358 prevIncompressible := false 359 for c, p := curCh, nextPos; ; { 360 if c >= 0x80 { 361 if !isCompressible(c) { 362 if c >= 0x10000 || prevIncompressible { 363 break 364 } 365 prevIncompressible = true 366 } else { 367 windowDecider = c 368 break 369 } 370 } else { 371 prevIncompressible = false 372 } 373 c, p, err = e.src.RuneAt(p) 374 if err != nil { 375 if err == io.EOF { 376 err = nil 377 break 378 } 379 return err 380 } 381 } 382 prevWindow := e.window 383 384 // try to locate a dynamic window 385 if windowDecider < 0x80 || e.locateWindow(windowDecider, e.dynamicOffset[:]) { 386 // lookahead to use SQn instead of SCn for single 387 // character interruptions of runs in current window 388 if !e.unicodeMode { 389 ch2, n2, err := e.src.RuneAt(nextPos) 390 if err != nil && err != io.EOF { 391 return err 392 } 393 if err == nil && ch2 >= e.dynamicOffset[prevWindow] && 394 ch2 < e.dynamicOffset[prevWindow]+0x80 { 395 err = e.quoteSingleByte(curCh) 396 if err != nil { 397 return err 398 } 399 e.curRune, e.nextPos = ch2, n2 400 e.window = prevWindow 401 return nil 402 } 403 } 404 var b byte 405 if e.unicodeMode { 406 b = UC0 407 } else { 408 b = SC0 409 } 410 e.out = append(e.out, b+byte(e.window)) 411 e.unicodeMode = false 412 return nil 413 } else 414 // try to locate a static window 415 if !e.unicodeMode && e.locateWindow(windowDecider, staticOffset[:]) { 416 // static windows are not accessible from Unicode mode 417 err = e.quoteSingleByte(curCh) 418 if err != nil { 419 return err 420 } 421 e.nextRune() 422 e.window = prevWindow // restore current Window settings 423 return nil 424 } else // try to define a window around windowDecider 425 if e.positionWindow(windowDecider, e.unicodeMode) { 426 e.unicodeMode = false 427 return nil 428 } 429 return errors.New("could not select window. Internal Compressor Error") 430} 431 432func (e *encoder) encode(src RuneSource) error { 433 var err error 434 e.src, e.written, e.nextPos = src, 0, 0 435 e.nextRune() 436 437 for { 438 if e.unicodeMode { 439 err = e.outputUnicodeRun() 440 if err != nil { 441 break 442 } 443 if e.scuPos != -1 && len(e.out)-e.scuPos == 3 { 444 // for single character Unicode runs use quote 445 // go back and fix up the SCU to an SQU instead 446 e.out[e.scuPos] = SQU 447 e.scuPos = -1 448 err = e.flush() 449 if err != nil { 450 break 451 } 452 e.unicodeMode = false 453 continue 454 } else { 455 e.scuPos = -1 456 err = e.flush() 457 if err != nil { 458 break 459 } 460 } 461 } else { 462 err = e.outputSingleByteRun() 463 if err != nil { 464 break 465 } 466 } 467 if e.curErr != nil { 468 err = e.curErr 469 break 470 } 471 // note, if we were in unicode mode the character must be compressible 472 if isCompressible(e.curRune) { 473 err = e.chooseWindow() 474 if err != nil { 475 break 476 } 477 } else { 478 // switching to unicode 479 e.scuPos = len(e.out) 480 e.out = append(e.out, SCU) 481 e.unicodeMode = true 482 } 483 } 484 485 if errors.Is(err, io.EOF) { 486 err = e.flush() 487 } 488 489 e.src = nil // do not hold the reference 490 491 return err 492} 493 494// WriteString encodes the given string and writes the binary representation 495// into the writer. Invalid UTF-8 sequences are replaced with utf8.RuneError. 496// Returns the number of bytes written and an error (if any). 497func (w *Writer) WriteString(in string) (int, error) { 498 return w.WriteRunes(StringRuneSource(in)) 499} 500 501// WriteRune encodes the given rune and writes the binary representation 502// into the writer. 503// Returns the number of bytes written and an error (if any). 504func (w *Writer) WriteRune(r rune) (int, error) { 505 return w.WriteRunes(SingleRuneSource(r)) 506} 507 508func (w *Writer) WriteRunes(src RuneSource) (int, error) { 509 err := w.encode(src) 510 return w.written, err 511} 512 513func (e *encoder) flush() error { 514 if e.wr != nil && len(e.out) > 0 { 515 n, err := e.wr.Write(e.out) 516 e.written += n 517 e.out = e.out[:0] 518 return err 519 } 520 521 return nil 522} 523 524// Reset discards the encoder's state and makes it equivalent to the result of NewEncoder 525// called with w allowing to re-use the instance. 526func (w *Writer) Reset(out io.Writer) { 527 w.wr = out 528 w.out = w.out[:0] 529 w.reset() 530 w.init() 531} 532 533// Encode the given RuneSource and append to dst. If dst does not have enough capacity 534// it will be re-allocated. It can be nil. 535// Not goroutine-safe. The instance can be re-used after. 536func (e *Encoder) Encode(src RuneSource, dst []byte) ([]byte, error) { 537 e.reset() 538 e.init() 539 e.out = dst 540 err := e.encode(src) 541 out := e.out 542 e.out = nil 543 return out, err 544} 545 546// Encode src and append to dst. If dst does not have enough capacity 547// it will be re-allocated. It can be nil. 548func Encode(src string, dst []byte) ([]byte, error) { 549 var e Encoder 550 return e.Encode(StringRuneSource(src), dst) 551} 552 553// EncodeStrict is the same as Encode, however it stops and returns ErrInvalidUTF8 554// if an invalid UTF-8 sequence is encountered rather than replacing it with 555// utf8.RuneError. 556func EncodeStrict(src string, dst []byte) ([]byte, error) { 557 var e Encoder 558 return e.Encode(StrictStringRuneSource(src), dst) 559} 560