1 // Copyright 2014 Wouter van Oortmerssen. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Wouter's Entropy Coder: apparently I re-invented adaptive Shannon-Fano.
16 //
17 // similar compression performance to huffman, and absolutely tiny code, one function for both
18 // compression and decompression. Adaptive, so should work well even for tiny buffers.
19 // One of the vectors passed in is the input, the other the output (and exactly one of the two
20 // should be empty initially).
21 // not the fastest possible implementation (only 40MB/sec for either compression or decompression
22 // on a modern pc), but should be sufficient for many uses
23 //
24 // uses std::string and std::swap as its only external dependencies for simplicity, but could made
25 // to not rely on them relatively easily.
26 
WEntropyCoder(const unsigned char * in,size_t inlen,size_t origlen,string & out)27 template<bool compress> void WEntropyCoder(const unsigned char *in,
28                                            size_t inlen,    // Size of input.
29                                            size_t origlen,  // Uncompressed size.
30                                            string &out) {
31     const int NSYM = 256;     // This depends on the fact we're reading from unsigned chars.
32     int symbol[NSYM];         // The symbol in this slot. Adaptively sorted by frequency.
33     size_t freq[NSYM];        // Its frequency.
34     int sym_idx[NSYM];        // Lookup symbol -> index into the above two arrays.
35     for (int i = 0; i < NSYM; i++) { freq[i] = 1; symbol[i] = sym_idx[i] = i; }
36     size_t compr_idx = 0;
37     unsigned char bits = 0;
38     int nbits = 0;
39     for (size_t i = 0; i < origlen; i++) {
40         int start = 0, range = NSYM;
41         size_t total_freq = i + NSYM;
42         while (range > 1) {
43             size_t acc_freq = 0;
44             int j = start;
45             do acc_freq += freq[j++]; while (acc_freq + freq[j] / 2 < total_freq / 2);
46             unsigned char bit = 0;
47             if (compress) {
48                 if (sym_idx[in[i]] < j) bit = 1;
49                 bits |= bit << nbits;
50                 if (++nbits == 8) { out.push_back(bits); bits = 0; nbits = 0; }
51             } else {
52                 if (!nbits) { assert(compr_idx < inlen); bits = in[compr_idx++]; nbits = 8; }
53                 bit = bits & 1;
54                 bits >>= 1;
55                 nbits--;
56             }
57             if (bit) {
58                 total_freq = acc_freq; assert(j - start < range); range = j - start;
59             } else {
60                 total_freq -= acc_freq; range -= j - start; start = j;
61             }
62         }
63         if (!compress) out.push_back((unsigned char)symbol[start]);
64         assert(range == 1 && (!compress || in[i] == symbol[start]));
65         freq[start]++;
66         while (start && freq[start - 1] < freq[start]) {
67             swap(sym_idx[symbol[start - 1]], sym_idx[symbol[start]]);
68             swap(freq[start - 1], freq[start]);
69             swap(symbol[start - 1], symbol[start]);
70             start--;
71         }
72     }
73     if (compress) { if (nbits) out.push_back(bits); } else { assert(compr_idx == inlen); }
74     (void)inlen;
75 }
76 
77