1// Copyright 2015, Joe Tsai. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE.md file. 4 5package bzip2 6 7import "github.com/dsnet/compress/bzip2/internal/sais" 8 9// The Burrows-Wheeler Transform implementation used here is based on the 10// Suffix Array by Induced Sorting (SA-IS) methodology by Nong, Zhang, and Chan. 11// This implementation uses the sais algorithm originally written by Yuta Mori. 12// 13// The SA-IS algorithm runs in O(n) and outputs a Suffix Array. There is a 14// mathematical relationship between Suffix Arrays and the Burrows-Wheeler 15// Transform, such that a SA can be converted to a BWT in O(n) time. 16// 17// References: 18// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf 19// https://github.com/cscott/compressjs/blob/master/lib/BWT.js 20// https://www.quora.com/How-can-I-optimize-burrows-wheeler-transform-and-inverse-transform-to-work-in-O-n-time-O-n-space 21type burrowsWheelerTransform struct { 22 buf []byte 23 sa []int 24 perm []uint32 25} 26 27func (bwt *burrowsWheelerTransform) Encode(buf []byte) (ptr int) { 28 if len(buf) == 0 { 29 return -1 30 } 31 32 // TODO(dsnet): Find a way to avoid the duplicate input string method. 33 // We only need to do this because suffix arrays (by definition) only 34 // operate non-wrapped suffixes of a string. On the other hand, 35 // the BWT specifically used in bzip2 operate on a strings that wrap-around 36 // when being sorted. 37 38 // Step 1: Concatenate the input string to itself so that we can use the 39 // suffix array algorithm for bzip2's variant of BWT. 40 n := len(buf) 41 bwt.buf = append(append(bwt.buf[:0], buf...), buf...) 42 if cap(bwt.sa) < 2*n { 43 bwt.sa = make([]int, 2*n) 44 } 45 t := bwt.buf[:2*n] 46 sa := bwt.sa[:2*n] 47 48 // Step 2: Compute the suffix array (SA). The input string, t, will not be 49 // modified, while the results will be written to the output, sa. 50 sais.ComputeSA(t, sa) 51 52 // Step 3: Convert the SA to a BWT. Since ComputeSA does not mutate the 53 // input, we have two copies of the input; in buf and buf2. Thus, we write 54 // the transformation to buf, while using buf2. 55 var j int 56 buf2 := t[n:] 57 for _, i := range sa { 58 if i < n { 59 if i == 0 { 60 ptr = j 61 i = n 62 } 63 buf[j] = buf2[i-1] 64 j++ 65 } 66 } 67 return ptr 68} 69 70func (bwt *burrowsWheelerTransform) Decode(buf []byte, ptr int) { 71 if len(buf) == 0 { 72 return 73 } 74 75 // Step 1: Compute cumm, where cumm[ch] reports the total number of 76 // characters that precede the character ch in the alphabet. 77 var cumm [256]int 78 for _, v := range buf { 79 cumm[v]++ 80 } 81 var sum int 82 for i, v := range cumm { 83 cumm[i] = sum 84 sum += v 85 } 86 87 // Step 2: Compute perm, where perm[ptr] contains a pointer to the next 88 // byte in buf and the next pointer in perm itself. 89 if cap(bwt.perm) < len(buf) { 90 bwt.perm = make([]uint32, len(buf)) 91 } 92 perm := bwt.perm[:len(buf)] 93 for i, b := range buf { 94 perm[cumm[b]] = uint32(i) 95 cumm[b]++ 96 } 97 98 // Step 3: Follow each pointer in perm to the next byte, starting with the 99 // origin pointer. 100 if cap(bwt.buf) < len(buf) { 101 bwt.buf = make([]byte, len(buf)) 102 } 103 buf2 := bwt.buf[:len(buf)] 104 i := perm[ptr] 105 for j := range buf2 { 106 buf2[j] = buf[i] 107 i = perm[i] 108 } 109 copy(buf, buf2) 110} 111