1 #ifndef _LIBZIM_COMPRESSION_
2 #define _LIBZIM_COMPRESSION_
3 
4 #include <vector>
5 #include "string.h"
6 
7 #include "file_reader.h"
8 #include <zim/error.h>
9 
10 #include "config.h"
11 
12 #include <lzma.h>
13 #include <zstd.h>
14 
15 #include "zim_types.h"
16 
17 //#define DEB(X) std::cerr << __func__ << " " << X << std::endl ;
18 #define DEB(X)
19 
/// Tells a stream run whether more input may follow or the stream must be finished.
enum class CompStep {
  /// Regular run; this is the default used by `feed()`.
  STEP,
  /// Final run; `get_data()` passes this together with an empty input.
  FINISH,
};
24 
/// Result of one low-level `stream_run_encode` / `stream_run_decode` call.
enum class CompStatus {
  OK,          ///< The run completed; the end of the stream has not been reached.
  STREAM_END,  ///< The end of the stream has been reached.
  BUF_ERROR,   ///< No progress possible: input exhausted or output buffer too small.
};
30 
/// Result reported by `Compressor::feed()` / `Uncompressor::feed()`.
enum class RunnerStatus {
  /// The uncompressor reached the end of the compressed stream.
  OK,
  /// The supplied input has been consumed; the caller may feed more.
  NEED_MORE,
  /// The underlying stream reported a status the runner does not handle.
  ERROR,
};
36 
37 struct LZMA_INFO {
38   typedef lzma_stream stream_t;
39   static const std::string name;
40   static void init_stream_decoder(stream_t* stream, char* raw_data);
41   static void init_stream_encoder(stream_t* stream, char* raw_data);
42   static CompStatus stream_run_encode(stream_t* stream, CompStep step);
43   static CompStatus stream_run_decode(stream_t* stream, CompStep step);
44   static CompStatus stream_run(stream_t* stream, CompStep step);
45   static void stream_end_encode(stream_t* stream);
46   static void stream_end_decode(stream_t* stream);
47 };
48 
49 
50 struct ZSTD_INFO {
51   struct stream_t
52   {
53     const unsigned char* next_in;
54     size_t avail_in;
55     unsigned char* next_out;
56     size_t avail_out;
57     size_t total_out;
58 
59     ::ZSTD_CStream* encoder_stream;
60     ::ZSTD_DStream* decoder_stream;
61 
62     stream_t();
63     ~stream_t();
64   private:
65     stream_t(const stream_t& t) = delete;
66     void operator=(const stream_t& t) = delete;
67   };
68 
69   static const std::string name;
70   static void init_stream_decoder(stream_t* stream, char* raw_data);
71   static void init_stream_encoder(stream_t* stream, char* raw_data);
72   static CompStatus stream_run_encode(stream_t* stream, CompStep step);
73   static CompStatus stream_run_decode(stream_t* stream, CompStep step);
74   static void stream_end_encode(stream_t* stream);
75   static void stream_end_decode(stream_t* stream);
76 };
77 
78 
79 namespace zim {
80 
81 template<typename INFO>
82 class Uncompressor
83 {
84   public:
85     Uncompressor(size_t initial_size=1024*1024) :
ret_data(new char[initial_size])86       ret_data(new char[initial_size]),
87       data_size(initial_size)
88     {}
89     ~Uncompressor() = default;
90 
init(char * data)91     void init(char* data) {
92       INFO::init_stream_decoder(&stream, data);
93       stream.next_out = (uint8_t*)ret_data.get();
94       stream.avail_out = data_size;
95     }
96 
97     RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) {
98       stream.next_in = (unsigned char*)data;
99       stream.avail_in = size;
100       while (true) {
101         auto errcode = INFO::stream_run_decode(&stream, step);
102         DEB((int)errcode)
103         switch (errcode) {
104           case CompStatus::BUF_ERROR:
105             if (stream.avail_in == 0 && stream.avail_out != 0)  {
106               // End of input stream.
107               // compressor hasn't recognize the end of the input stream but there is
108               // no more input.
109               return RunnerStatus::NEED_MORE;
110             } else {
111               // Not enought output size.
112               // Allocate more memory and continue the loop.
113               DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out)
114               data_size *= 2;
115               std::unique_ptr<char[]> new_ret_data(new char[data_size]);
116               memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
117               stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
118               stream.avail_out = data_size - stream.total_out;
119               DEB(data_size << " " << stream.avail_out << " " << stream.avail_in)
120               ret_data = std::move(new_ret_data);
121             }
122             break;
123           case CompStatus::OK:
124             // On first call where lzma cannot progress (no output size).
125             // Lzma return OK. If we return NEED_MORE, then we will try to compress
126             // with new input data, but we should not as current one is not processed.
127             // We must do a second step to have te BUF_ERROR and handle thing correctly.
128             // If we have no more input, then we must ask for more.
129             if (stream.avail_in == 0) {
130               return RunnerStatus::NEED_MORE;
131             }
132             break;
133           case CompStatus::STREAM_END:
134             // End of compressed stream. Everything is ok.
135             return RunnerStatus::OK;
136           default:
137             // unreachable
138             return RunnerStatus::ERROR;
139         }
140       };
141       // unreachable
142       return RunnerStatus::NEED_MORE;
143     }
144 
get_data(zim::zsize_t * size)145     std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
146       feed(nullptr, 0, CompStep::FINISH);
147       size->v = stream.total_out;
148       INFO::stream_end_decode(&stream);
149       return std::move(ret_data);
150     }
151 
152   private:
153     std::unique_ptr<char[]> ret_data;
154     size_type data_size;
155     typename INFO::stream_t stream;
156 };
157 
158 #define CHUNCK_SIZE ((zim::size_type)(1024))
159 /**
160  * Uncompress data of the reader at startOffset.
161  *
162  * @param reader         The reader where the data is.
163  * @param startOffset    The offset where the data is in the reader.
164  * @param dest_size[out] The size of the uncompressed data.
165  * @return A pointer to the uncompressed data. This must be deleted (delete[])
166 */
167 template<typename INFO>
uncompress(const zim::Reader * reader,zim::offset_t startOffset,zim::zsize_t * dest_size)168 std::unique_ptr<char[]> uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) {
169   // Use a compressor to compress the data.
170   // As we don't know the result size, neither the compressed size,
171   // we have to do chunk by chunk until decompressor is happy.
172   // Let's assume it will be something like the minChunkSize used at creation
173   Uncompressor<INFO> runner(1024*1024);
174   // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk
175   // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE.
176   std::vector<char> raw_data(CHUNCK_SIZE);
177 
178   DEB("Init")
179   runner.init(raw_data.data());
180 
181   zim::size_type availableSize = reader->size().v - startOffset.v;
182   auto ret = RunnerStatus::NEED_MORE;
183   while(ret != RunnerStatus::OK) {
184     if (ret == RunnerStatus::NEED_MORE and availableSize) {
185       zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE);
186       reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize));
187       startOffset.v += inputSize;
188       availableSize -= inputSize;
189       DEB("Step " << startOffset.v)
190       ret = runner.feed(raw_data.data(), inputSize);
191       DEB("Ret " << (int)ret)
192     }
193     if (ret == RunnerStatus::ERROR) {
194       throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name
195                                + std::string(" stream for cluster."));
196     }
197   }
198 
199   DEB("Finish")
200   return runner.get_data(dest_size);
201 }
202 
203 template<typename INFO>
204 class Compressor
205 {
206   public:
207     Compressor(size_t initial_size=1024*1024) :
ret_data(new char[initial_size])208       ret_data(new char[initial_size]),
209       ret_size(initial_size)
210     {}
211 
212     ~Compressor() = default;
213 
init(char * data)214     void init(char* data) {
215       INFO::init_stream_encoder(&stream, data);
216       stream.next_out = (uint8_t*)ret_data.get();
217       stream.avail_out = ret_size;
218     }
219 
220     RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) {
221       stream.next_in = (unsigned char*)data;
222       stream.avail_in = size;
223       while (true) {
224         auto errcode = INFO::stream_run_encode(&stream, step);
225         switch (errcode) {
226           case CompStatus::OK:
227             if (stream.avail_out == 0) {
228               // lzma return a OK return status the first time it runs out of output memory.
229               // The BUF_ERROR is returned only the second time we call a lzma_code.
230               continue;
231             } else {
232               return RunnerStatus::NEED_MORE;
233             }
234           case CompStatus::STREAM_END:
235             return RunnerStatus::NEED_MORE;
236           case CompStatus::BUF_ERROR: {
237             //Not enought output size
238             ret_size *= 2;
239             std::unique_ptr<char[]> new_ret_data(new char[ret_size]);
240             memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
241             stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
242             stream.avail_out = ret_size - stream.total_out;
243             ret_data = std::move(new_ret_data);
244             continue;
245           }
246           break;
247           default:
248             // unreachable
249             return RunnerStatus::ERROR;
250         };
251       };
252       // urreachable
253       return RunnerStatus::NEED_MORE;
254     }
255 
get_data(zim::zsize_t * size)256     std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
257       feed(nullptr, 0, CompStep::FINISH);
258       INFO::stream_end_encode(&stream);
259       size->v = stream.total_out;
260       return std::move(ret_data);
261     }
262 
263   private:
264     std::unique_ptr<char[]> ret_data;
265     size_t ret_size;
266     typename INFO::stream_t stream;
267 };
268 
269 } // namespace zim
270 
271 #endif // _LIBZIM_COMPRESSION_
272