1 #ifndef _LIBZIM_COMPRESSION_
2 #define _LIBZIM_COMPRESSION_
3
4 #include <vector>
5 #include "string.h"
6
7 #include "file_reader.h"
8 #include <zim/error.h>
9
10 #include "config.h"
11
12 #include <lzma.h>
13 #include <zstd.h>
14
15 #include "zim_types.h"
16
// Debug tracing hook used throughout this header.
// The active definition expands to nothing, so the argument X is discarded by
// the preprocessor and never evaluated. Swap in the commented-out definition
// below to print the calling function name followed by X on std::cerr.
//#define DEB(X) std::cerr << __func__ << " " << X << std::endl ;
#define DEB(X)
19
/// Mode passed to INFO::stream_run_encode() / INFO::stream_run_decode().
enum class CompStep {
  /// Regular step; this is the default used by feed().
  STEP = 0,
  /// Final step; get_data() runs one last feed() in this mode.
  FINISH = 1
};
24
/// Codec-independent result of a single stream_run_encode()/stream_run_decode() call.
enum class CompStatus {
  /// The call completed; the stream is not finished yet.
  OK = 0,
  /// The end of the stream has been reached.
  STREAM_END = 1,
  /// The codec could not make progress with the current input/output buffers.
  BUF_ERROR = 2,
};
30
/// Result reported by Compressor::feed() and Uncompressor::feed().
enum class RunnerStatus {
  /// The stream is complete.
  OK = 0,
  /// All the given input has been consumed; the caller may feed more.
  NEED_MORE = 1,
  /// The codec returned a status the runner does not know how to handle.
  ERROR = 2
};
36
37 struct LZMA_INFO {
38 typedef lzma_stream stream_t;
39 static const std::string name;
40 static void init_stream_decoder(stream_t* stream, char* raw_data);
41 static void init_stream_encoder(stream_t* stream, char* raw_data);
42 static CompStatus stream_run_encode(stream_t* stream, CompStep step);
43 static CompStatus stream_run_decode(stream_t* stream, CompStep step);
44 static CompStatus stream_run(stream_t* stream, CompStep step);
45 static void stream_end_encode(stream_t* stream);
46 static void stream_end_decode(stream_t* stream);
47 };
48
49
50 struct ZSTD_INFO {
51 struct stream_t
52 {
53 const unsigned char* next_in;
54 size_t avail_in;
55 unsigned char* next_out;
56 size_t avail_out;
57 size_t total_out;
58
59 ::ZSTD_CStream* encoder_stream;
60 ::ZSTD_DStream* decoder_stream;
61
62 stream_t();
63 ~stream_t();
64 private:
65 stream_t(const stream_t& t) = delete;
66 void operator=(const stream_t& t) = delete;
67 };
68
69 static const std::string name;
70 static void init_stream_decoder(stream_t* stream, char* raw_data);
71 static void init_stream_encoder(stream_t* stream, char* raw_data);
72 static CompStatus stream_run_encode(stream_t* stream, CompStep step);
73 static CompStatus stream_run_decode(stream_t* stream, CompStep step);
74 static void stream_end_encode(stream_t* stream);
75 static void stream_end_decode(stream_t* stream);
76 };
77
78
79 namespace zim {
80
81 template<typename INFO>
82 class Uncompressor
83 {
84 public:
85 Uncompressor(size_t initial_size=1024*1024) :
ret_data(new char[initial_size])86 ret_data(new char[initial_size]),
87 data_size(initial_size)
88 {}
89 ~Uncompressor() = default;
90
init(char * data)91 void init(char* data) {
92 INFO::init_stream_decoder(&stream, data);
93 stream.next_out = (uint8_t*)ret_data.get();
94 stream.avail_out = data_size;
95 }
96
97 RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) {
98 stream.next_in = (unsigned char*)data;
99 stream.avail_in = size;
100 while (true) {
101 auto errcode = INFO::stream_run_decode(&stream, step);
102 DEB((int)errcode)
103 switch (errcode) {
104 case CompStatus::BUF_ERROR:
105 if (stream.avail_in == 0 && stream.avail_out != 0) {
106 // End of input stream.
107 // compressor hasn't recognize the end of the input stream but there is
108 // no more input.
109 return RunnerStatus::NEED_MORE;
110 } else {
111 // Not enought output size.
112 // Allocate more memory and continue the loop.
113 DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out)
114 data_size *= 2;
115 std::unique_ptr<char[]> new_ret_data(new char[data_size]);
116 memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
117 stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
118 stream.avail_out = data_size - stream.total_out;
119 DEB(data_size << " " << stream.avail_out << " " << stream.avail_in)
120 ret_data = std::move(new_ret_data);
121 }
122 break;
123 case CompStatus::OK:
124 // On first call where lzma cannot progress (no output size).
125 // Lzma return OK. If we return NEED_MORE, then we will try to compress
126 // with new input data, but we should not as current one is not processed.
127 // We must do a second step to have te BUF_ERROR and handle thing correctly.
128 // If we have no more input, then we must ask for more.
129 if (stream.avail_in == 0) {
130 return RunnerStatus::NEED_MORE;
131 }
132 break;
133 case CompStatus::STREAM_END:
134 // End of compressed stream. Everything is ok.
135 return RunnerStatus::OK;
136 default:
137 // unreachable
138 return RunnerStatus::ERROR;
139 }
140 };
141 // unreachable
142 return RunnerStatus::NEED_MORE;
143 }
144
get_data(zim::zsize_t * size)145 std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
146 feed(nullptr, 0, CompStep::FINISH);
147 size->v = stream.total_out;
148 INFO::stream_end_decode(&stream);
149 return std::move(ret_data);
150 }
151
152 private:
153 std::unique_ptr<char[]> ret_data;
154 size_type data_size;
155 typename INFO::stream_t stream;
156 };
157
158 #define CHUNCK_SIZE ((zim::size_type)(1024))
159 /**
160 * Uncompress data of the reader at startOffset.
161 *
162 * @param reader The reader where the data is.
163 * @param startOffset The offset where the data is in the reader.
164 * @param dest_size[out] The size of the uncompressed data.
165 * @return A pointer to the uncompressed data. This must be deleted (delete[])
166 */
167 template<typename INFO>
uncompress(const zim::Reader * reader,zim::offset_t startOffset,zim::zsize_t * dest_size)168 std::unique_ptr<char[]> uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) {
169 // Use a compressor to compress the data.
170 // As we don't know the result size, neither the compressed size,
171 // we have to do chunk by chunk until decompressor is happy.
172 // Let's assume it will be something like the minChunkSize used at creation
173 Uncompressor<INFO> runner(1024*1024);
174 // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk
175 // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE.
176 std::vector<char> raw_data(CHUNCK_SIZE);
177
178 DEB("Init")
179 runner.init(raw_data.data());
180
181 zim::size_type availableSize = reader->size().v - startOffset.v;
182 auto ret = RunnerStatus::NEED_MORE;
183 while(ret != RunnerStatus::OK) {
184 if (ret == RunnerStatus::NEED_MORE and availableSize) {
185 zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE);
186 reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize));
187 startOffset.v += inputSize;
188 availableSize -= inputSize;
189 DEB("Step " << startOffset.v)
190 ret = runner.feed(raw_data.data(), inputSize);
191 DEB("Ret " << (int)ret)
192 }
193 if (ret == RunnerStatus::ERROR) {
194 throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name
195 + std::string(" stream for cluster."));
196 }
197 }
198
199 DEB("Finish")
200 return runner.get_data(dest_size);
201 }
202
203 template<typename INFO>
204 class Compressor
205 {
206 public:
207 Compressor(size_t initial_size=1024*1024) :
ret_data(new char[initial_size])208 ret_data(new char[initial_size]),
209 ret_size(initial_size)
210 {}
211
212 ~Compressor() = default;
213
init(char * data)214 void init(char* data) {
215 INFO::init_stream_encoder(&stream, data);
216 stream.next_out = (uint8_t*)ret_data.get();
217 stream.avail_out = ret_size;
218 }
219
220 RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) {
221 stream.next_in = (unsigned char*)data;
222 stream.avail_in = size;
223 while (true) {
224 auto errcode = INFO::stream_run_encode(&stream, step);
225 switch (errcode) {
226 case CompStatus::OK:
227 if (stream.avail_out == 0) {
228 // lzma return a OK return status the first time it runs out of output memory.
229 // The BUF_ERROR is returned only the second time we call a lzma_code.
230 continue;
231 } else {
232 return RunnerStatus::NEED_MORE;
233 }
234 case CompStatus::STREAM_END:
235 return RunnerStatus::NEED_MORE;
236 case CompStatus::BUF_ERROR: {
237 //Not enought output size
238 ret_size *= 2;
239 std::unique_ptr<char[]> new_ret_data(new char[ret_size]);
240 memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
241 stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
242 stream.avail_out = ret_size - stream.total_out;
243 ret_data = std::move(new_ret_data);
244 continue;
245 }
246 break;
247 default:
248 // unreachable
249 return RunnerStatus::ERROR;
250 };
251 };
252 // urreachable
253 return RunnerStatus::NEED_MORE;
254 }
255
get_data(zim::zsize_t * size)256 std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
257 feed(nullptr, 0, CompStep::FINISH);
258 INFO::stream_end_encode(&stream);
259 size->v = stream.total_out;
260 return std::move(ret_data);
261 }
262
263 private:
264 std::unique_ptr<char[]> ret_data;
265 size_t ret_size;
266 typename INFO::stream_t stream;
267 };
268
269 } // namespace zim
270
271 #endif // _LIBZIM_COMPRESSION_
272