1 //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Lempel–Ziv–Welch encoding/decoding
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef SANITIZER_LZW_H
14 #define SANITIZER_LZW_H
15 
16 #include "sanitizer_dense_map.h"
17 
18 namespace __sanitizer {
19 
20 using LzwCodeType = u32;
21 
22 template <class T, class ItIn, class ItOut>
23 ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24   using Substring =
25       detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
26 
27   // Sentinel value for substrings of len 1.
28   static constexpr LzwCodeType kNoPrefix =
29       Min(DenseMapInfo<Substring>::getEmptyKey().first,
30           DenseMapInfo<Substring>::getTombstoneKey().first) -
31       1;
32   DenseMap<Substring, LzwCodeType> prefix_to_code;
33   {
34     // Add all substring of len 1 as initial dictionary.
35     InternalMmapVector<T> dict_len1;
36     for (auto it = begin; it != end; ++it)
37       if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
38         dict_len1.push_back(*it);
39 
40     // Slightly helps with later delta encoding.
41     Sort(dict_len1.data(), dict_len1.size());
42 
43     // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
44     // just generate them.
45     *out = dict_len1.size();
46     ++out;
47 
48     for (uptr i = 0; i != dict_len1.size(); ++i) {
49       // Remap after the Sort.
50       prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
51       *out = dict_len1[i];
52       ++out;
53     }
54     CHECK_EQ(prefix_to_code.size(), dict_len1.size());
55   }
56 
57   if (begin == end)
58     return out;
59 
60   // Main LZW encoding loop.
61   LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
62   ++begin;
63   for (auto it = begin; it != end; ++it) {
64     // Extend match with the new item.
65     auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
66     if (ins.second) {
67       // This is a new substring, but emit the code for the current match
68       // (before extend). This allows LZW decoder to recover the dictionary.
69       *out = match;
70       ++out;
71       // Reset the match to a single item, which must be already in the map.
72       match = prefix_to_code.find({kNoPrefix, *it})->second;
73     } else {
74       // Already known, use as the current match.
75       match = ins.first->second;
76     }
77   }
78 
79   *out = match;
80   ++out;
81 
82   return out;
83 }
84 
85 template <class T, class ItIn, class ItOut>
86 ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
87   if (begin == end)
88     return out;
89 
90   // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
91   InternalMmapVector<T> dict_len1(*begin);
92   ++begin;
93 
94   if (begin == end)
95     return out;
96 
97   for (auto& v : dict_len1) {
98     v = *begin;
99     ++begin;
100   }
101 
102   // Substrings of len 2 and up. Indexes are shifted because [0,
103   // dict_len1.size()) stored in dict_len1. Substings get here after being
104   // emitted to the output, so we can use output position.
105   InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
106       code_to_substr;
107 
108   // Copies already emitted substrings into the output again.
109   auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
110     if (code < dict_len1.size()) {
111       *out = dict_len1[code];
112       ++out;
113       return out;
114     }
115     const auto& s = code_to_substr[code - dict_len1.size()];
116 
117     for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
118     return out;
119   };
120 
121   // Returns lens of the substring with the given code.
122   auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
123     if (code < dict_len1.size())
124       return 1;
125     const auto& s = code_to_substr[code - dict_len1.size()];
126     return s.second - s.first;
127   };
128 
129   // Main LZW decoding loop.
130   LzwCodeType prev_code = *begin;
131   ++begin;
132   out = copy(prev_code, out);
133   for (auto it = begin; it != end; ++it) {
134     LzwCodeType code = *it;
135     auto start = out;
136     if (code == dict_len1.size() + code_to_substr.size()) {
137       // Special LZW case. The code is not in the dictionary yet. This is
138       // possible only when the new substring is the same as previous one plus
139       // the first item of the previous substring. We can emit that in two
140       // steps.
141       out = copy(prev_code, out);
142       *out = *start;
143       ++out;
144     } else {
145       out = copy(code, out);
146     }
147 
148     // Every time encoded emits the code, it also creates substing of len + 1
149     // including the first item of the just emmited substring. Do the same here.
150     uptr len = code_to_len(prev_code);
151     code_to_substr.push_back({start - len, start + 1});
152 
153     prev_code = code;
154   }
155   return out;
156 }
157 
158 }  // namespace __sanitizer
159 #endif
160