1 #include "draco/compression/entropy/shannon_entropy.h"
2 
3 #include <cmath>
4 #include <vector>
5 
6 #include "draco/compression/entropy/rans_symbol_coding.h"
7 
8 namespace draco {
9 
ComputeShannonEntropy(const uint32_t * symbols,int num_symbols,int max_value,int * out_num_unique_symbols)10 int64_t ComputeShannonEntropy(const uint32_t *symbols, int num_symbols,
11                               int max_value, int *out_num_unique_symbols) {
12   // First find frequency of all unique symbols in the input array.
13   int num_unique_symbols = 0;
14   std::vector<int> symbol_frequencies(max_value + 1, 0);
15   for (int i = 0; i < num_symbols; ++i) {
16     ++symbol_frequencies[symbols[i]];
17   }
18   double total_bits = 0;
19   double num_symbols_d = num_symbols;
20   for (int i = 0; i < max_value + 1; ++i) {
21     if (symbol_frequencies[i] > 0) {
22       ++num_unique_symbols;
23       // Compute Shannon entropy for the symbol.
24       // We don't want to use std::log2 here for Android build.
25       total_bits +=
26           symbol_frequencies[i] *
27           log2(static_cast<double>(symbol_frequencies[i]) / num_symbols_d);
28     }
29   }
30   if (out_num_unique_symbols) {
31     *out_num_unique_symbols = num_unique_symbols;
32   }
33   // Entropy is always negative.
34   return static_cast<int64_t>(-total_bits);
35 }
36 
ComputeBinaryShannonEntropy(uint32_t num_values,uint32_t num_true_values)37 double ComputeBinaryShannonEntropy(uint32_t num_values,
38                                    uint32_t num_true_values) {
39   if (num_values == 0) {
40     return 0;
41   }
42 
43   // We can exit early if the data set has 0 entropy.
44   if (num_true_values == 0 || num_values == num_true_values) {
45     return 0;
46   }
47   const double true_freq =
48       static_cast<double>(num_true_values) / static_cast<double>(num_values);
49   const double false_freq = 1.0 - true_freq;
50   return -(true_freq * std::log2(true_freq) +
51            false_freq * std::log2(false_freq));
52 }
53 
ShannonEntropyTracker()54 ShannonEntropyTracker::ShannonEntropyTracker() {}
55 
Peek(const uint32_t * symbols,int num_symbols)56 ShannonEntropyTracker::EntropyData ShannonEntropyTracker::Peek(
57     const uint32_t *symbols, int num_symbols) {
58   return UpdateSymbols(symbols, num_symbols, false);
59 }
60 
Push(const uint32_t * symbols,int num_symbols)61 ShannonEntropyTracker::EntropyData ShannonEntropyTracker::Push(
62     const uint32_t *symbols, int num_symbols) {
63   return UpdateSymbols(symbols, num_symbols, true);
64 }
65 
UpdateSymbols(const uint32_t * symbols,int num_symbols,bool push_changes)66 ShannonEntropyTracker::EntropyData ShannonEntropyTracker::UpdateSymbols(
67     const uint32_t *symbols, int num_symbols, bool push_changes) {
68   EntropyData ret_data = entropy_data_;
69   ret_data.num_values += num_symbols;
70   for (int i = 0; i < num_symbols; ++i) {
71     const uint32_t symbol = symbols[i];
72     if (frequencies_.size() <= symbol) {
73       frequencies_.resize(symbol + 1, 0);
74     }
75 
76     // Update the entropy of the stream. Note that entropy of |N| values
77     // represented by |S| unique symbols is defined as:
78     //
79     //  entropy = -sum_over_S(symbol_frequency / N * log2(symbol_frequency / N))
80     //
81     // To avoid the need to recompute the entire sum when new values are added,
82     // we can instead update a so called entropy norm that is defined as:
83     //
84     //  entropy_norm = sum_over_S(symbol_frequency * log2(symbol_frequency))
85     //
86     // In this case, all we need to do is update entries on the symbols where
87     // the frequency actually changed.
88     //
89     // Note that entropy_norm and entropy can be easily transformed to the
90     // actual entropy as:
91     //
92     //  entropy = log2(N) - entropy_norm / N
93     //
94     double old_symbol_entropy_norm = 0;
95     int &frequency = frequencies_[symbol];
96     if (frequency > 1) {
97       old_symbol_entropy_norm = frequency * std::log2(frequency);
98     } else if (frequency == 0) {
99       ret_data.num_unique_symbols++;
100       if (symbol > static_cast<uint32_t>(ret_data.max_symbol)) {
101         ret_data.max_symbol = symbol;
102       }
103     }
104     frequency++;
105     const double new_symbol_entropy_norm = frequency * std::log2(frequency);
106 
107     // Update the final entropy.
108     ret_data.entropy_norm += new_symbol_entropy_norm - old_symbol_entropy_norm;
109   }
110   if (push_changes) {
111     // Update entropy data of the stream.
112     entropy_data_ = ret_data;
113   } else {
114     // We are only peeking so do not update the stream.
115     // Revert changes in the frequency table.
116     for (int i = 0; i < num_symbols; ++i) {
117       const uint32_t symbol = symbols[i];
118       frequencies_[symbol]--;
119     }
120   }
121   return ret_data;
122 }
123 
GetNumberOfDataBits(const EntropyData & entropy_data)124 int64_t ShannonEntropyTracker::GetNumberOfDataBits(
125     const EntropyData &entropy_data) {
126   if (entropy_data.num_values < 2) {
127     return 0;
128   }
129   // We need to compute the number of bits required to represent the stream
130   // using the entropy norm. Note that:
131   //
132   //   entropy = log2(num_values) - entropy_norm / num_values
133   //
134   // and number of bits required for the entropy is: num_values * entropy
135   //
136   return static_cast<int64_t>(
137       ceil(entropy_data.num_values * std::log2(entropy_data.num_values) -
138            entropy_data.entropy_norm));
139 }
140 
GetNumberOfRAnsTableBits(const EntropyData & entropy_data)141 int64_t ShannonEntropyTracker::GetNumberOfRAnsTableBits(
142     const EntropyData &entropy_data) {
143   return ApproximateRAnsFrequencyTableBits(entropy_data.max_symbol + 1,
144                                            entropy_data.num_unique_symbols);
145 }
146 
147 }  // namespace draco
148