1 #include "draco/compression/entropy/shannon_entropy.h"
2
3 #include <cmath>
4 #include <vector>
5
6 #include "draco/compression/entropy/rans_symbol_coding.h"
7
8 namespace draco {
9
ComputeShannonEntropy(const uint32_t * symbols,int num_symbols,int max_value,int * out_num_unique_symbols)10 int64_t ComputeShannonEntropy(const uint32_t *symbols, int num_symbols,
11 int max_value, int *out_num_unique_symbols) {
12 // First find frequency of all unique symbols in the input array.
13 int num_unique_symbols = 0;
14 std::vector<int> symbol_frequencies(max_value + 1, 0);
15 for (int i = 0; i < num_symbols; ++i) {
16 ++symbol_frequencies[symbols[i]];
17 }
18 double total_bits = 0;
19 double num_symbols_d = num_symbols;
20 for (int i = 0; i < max_value + 1; ++i) {
21 if (symbol_frequencies[i] > 0) {
22 ++num_unique_symbols;
23 // Compute Shannon entropy for the symbol.
24 // We don't want to use std::log2 here for Android build.
25 total_bits +=
26 symbol_frequencies[i] *
27 log2(static_cast<double>(symbol_frequencies[i]) / num_symbols_d);
28 }
29 }
30 if (out_num_unique_symbols) {
31 *out_num_unique_symbols = num_unique_symbols;
32 }
33 // Entropy is always negative.
34 return static_cast<int64_t>(-total_bits);
35 }
36
ComputeBinaryShannonEntropy(uint32_t num_values,uint32_t num_true_values)37 double ComputeBinaryShannonEntropy(uint32_t num_values,
38 uint32_t num_true_values) {
39 if (num_values == 0) {
40 return 0;
41 }
42
43 // We can exit early if the data set has 0 entropy.
44 if (num_true_values == 0 || num_values == num_true_values) {
45 return 0;
46 }
47 const double true_freq =
48 static_cast<double>(num_true_values) / static_cast<double>(num_values);
49 const double false_freq = 1.0 - true_freq;
50 return -(true_freq * std::log2(true_freq) +
51 false_freq * std::log2(false_freq));
52 }
53
ShannonEntropyTracker()54 ShannonEntropyTracker::ShannonEntropyTracker() {}
55
Peek(const uint32_t * symbols,int num_symbols)56 ShannonEntropyTracker::EntropyData ShannonEntropyTracker::Peek(
57 const uint32_t *symbols, int num_symbols) {
58 return UpdateSymbols(symbols, num_symbols, false);
59 }
60
Push(const uint32_t * symbols,int num_symbols)61 ShannonEntropyTracker::EntropyData ShannonEntropyTracker::Push(
62 const uint32_t *symbols, int num_symbols) {
63 return UpdateSymbols(symbols, num_symbols, true);
64 }
65
UpdateSymbols(const uint32_t * symbols,int num_symbols,bool push_changes)66 ShannonEntropyTracker::EntropyData ShannonEntropyTracker::UpdateSymbols(
67 const uint32_t *symbols, int num_symbols, bool push_changes) {
68 EntropyData ret_data = entropy_data_;
69 ret_data.num_values += num_symbols;
70 for (int i = 0; i < num_symbols; ++i) {
71 const uint32_t symbol = symbols[i];
72 if (frequencies_.size() <= symbol) {
73 frequencies_.resize(symbol + 1, 0);
74 }
75
76 // Update the entropy of the stream. Note that entropy of |N| values
77 // represented by |S| unique symbols is defined as:
78 //
79 // entropy = -sum_over_S(symbol_frequency / N * log2(symbol_frequency / N))
80 //
81 // To avoid the need to recompute the entire sum when new values are added,
82 // we can instead update a so called entropy norm that is defined as:
83 //
84 // entropy_norm = sum_over_S(symbol_frequency * log2(symbol_frequency))
85 //
86 // In this case, all we need to do is update entries on the symbols where
87 // the frequency actually changed.
88 //
89 // Note that entropy_norm and entropy can be easily transformed to the
90 // actual entropy as:
91 //
92 // entropy = log2(N) - entropy_norm / N
93 //
94 double old_symbol_entropy_norm = 0;
95 int &frequency = frequencies_[symbol];
96 if (frequency > 1) {
97 old_symbol_entropy_norm = frequency * std::log2(frequency);
98 } else if (frequency == 0) {
99 ret_data.num_unique_symbols++;
100 if (symbol > static_cast<uint32_t>(ret_data.max_symbol)) {
101 ret_data.max_symbol = symbol;
102 }
103 }
104 frequency++;
105 const double new_symbol_entropy_norm = frequency * std::log2(frequency);
106
107 // Update the final entropy.
108 ret_data.entropy_norm += new_symbol_entropy_norm - old_symbol_entropy_norm;
109 }
110 if (push_changes) {
111 // Update entropy data of the stream.
112 entropy_data_ = ret_data;
113 } else {
114 // We are only peeking so do not update the stream.
115 // Revert changes in the frequency table.
116 for (int i = 0; i < num_symbols; ++i) {
117 const uint32_t symbol = symbols[i];
118 frequencies_[symbol]--;
119 }
120 }
121 return ret_data;
122 }
123
GetNumberOfDataBits(const EntropyData & entropy_data)124 int64_t ShannonEntropyTracker::GetNumberOfDataBits(
125 const EntropyData &entropy_data) {
126 if (entropy_data.num_values < 2) {
127 return 0;
128 }
129 // We need to compute the number of bits required to represent the stream
130 // using the entropy norm. Note that:
131 //
132 // entropy = log2(num_values) - entropy_norm / num_values
133 //
134 // and number of bits required for the entropy is: num_values * entropy
135 //
136 return static_cast<int64_t>(
137 ceil(entropy_data.num_values * std::log2(entropy_data.num_values) -
138 entropy_data.entropy_norm));
139 }
140
GetNumberOfRAnsTableBits(const EntropyData & entropy_data)141 int64_t ShannonEntropyTracker::GetNumberOfRAnsTableBits(
142 const EntropyData &entropy_data) {
143 return ApproximateRAnsFrequencyTableBits(entropy_data.max_symbol + 1,
144 entropy_data.num_unique_symbols);
145 }
146
147 } // namespace draco
148