1 /*
2  * Copyright 2018, 2020 Jonathan Dieter <jdieter@gmail.com>
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are met:
6  *
7  *  1. Redistributions of source code must retain the above copyright notice,
8  *     this list of conditions and the following disclaimer.
9  *
10  *  2. Redistributions in binary form must reproduce the above copyright notice,
11  *     this list of conditions and the following disclaimer in the documentation
12  *     and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
18  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24  * POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #define _GNU_SOURCE
28 
29 #include <assert.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdint.h>
34 #include <stdbool.h>
35 #include <sys/types.h>
36 #include <sys/stat.h>
37 #include <sys/wait.h>
38 #include <fcntl.h>
39 #include <libgen.h>
40 #include <dirent.h>
41 #include <unistd.h>
42 #include <argp.h>
43 #include <zck.h>
44 
/*
 * <stdio.h> is allowed to provide stdout as a macro.  Drop any such macro so
 * that "stdout" can be used as an ordinary member name in struct arguments.
 */
#ifdef stdout
#undef stdout
#endif
48 
49 #include "util_common.h"
50 
/* Program description; passed to argp as the "doc" member of struct argp */
static char doc[] =
    "zck_gen_zdict - Generate a zdict for a zchunk file";

/* Synopsis of the single positional argument; passed to argp as "args_doc" */
static char args_doc[] =
    "<file>";
54 
/*
 * Command-line options understood by parse_opt().  The table is terminated
 * by an all-zero entry.
 */
static struct argp_option options[] = {
    {
        .name = "verbose",
        .key  = 'v',
        .doc  = "Increase verbosity (can be specified more than once for debugging)",
    },
    /* A "stdout" option ('c', "Direct output to stdout") is currently disabled */
    {
        .name = "dir",
        .key  = 'd',
        .arg  = "DIRECTORY",
        .doc  = "Write individual chunks to DIRECTORY (defaults to temporary directory)",
    },
    {
        .name = "version",
        .key  = 'V',
        .doc  = "Show program version",
    },
    { 0 }
};
64 
65 struct arguments {
66   char *args[1];
67   char *dir;
68   zck_log_type log_level;
69   bool stdout;
70   bool exit;
71 };
72 
parse_opt(int key,char * arg,struct argp_state * state)73 static error_t parse_opt (int key, char *arg, struct argp_state *state) {
74     struct arguments *arguments = state->input;
75 
76     if(arguments->exit)
77         return 0;
78 
79     switch (key) {
80         case 'v':
81             arguments->log_level--;
82             if(arguments->log_level < ZCK_LOG_DDEBUG)
83                 arguments->log_level = ZCK_LOG_DDEBUG;
84             break;
85         /*case 'c':
86             arguments->stdout = true;
87             break;*/
88         case 'd':
89             arguments->dir = arg;
90             break;
91         case 'V':
92             version();
93             arguments->exit = true;
94             break;
95         case ARGP_KEY_ARG:
96             if (state->arg_num >= 1) {
97                 argp_usage (state);
98                 return EINVAL;
99             }
100             arguments->args[state->arg_num] = arg;
101 
102             break;
103 
104         case ARGP_KEY_END:
105             if (state->arg_num < 1) {
106                 argp_usage (state);
107                 return EINVAL;
108             }
109             break;
110 
111         default:
112             return ARGP_ERR_UNKNOWN;
113     }
114     return 0;
115 }
116 
117 static struct argp argp = {options, parse_opt, args_doc, doc};
118 
/*
 * Return a newly allocated path of the directory chunk files are written to.
 *
 * old_dir - directory requested by the user, or NULL
 *
 * When old_dir is non-NULL the result is simply a heap copy of it; nothing is
 * created on disk.  When old_dir is NULL a fresh directory is created with
 * mkdtemp() as "<TMPDIR>/zcktempXXXXXX" and its path is returned; "/tmp" is
 * used when TMPDIR is unset or empty.
 *
 * Returns NULL, after printing a message, if TMPDIR is longer than 1024 bytes
 * or the directory cannot be created.  The caller owns the returned string
 * and must free() it.
 */
char *get_tmp_dir(char *old_dir) {
    if(old_dir != NULL) {
        size_t copy_size = strlen(old_dir) + 1;
        char *copy = calloc(copy_size, 1);
        assert(copy);
        memcpy(copy, old_dir, copy_size);
        return copy;
    }

    const char *template = "zcktempXXXXXX";
    const char *tmpdir = getenv("TMPDIR");

    if(tmpdir == NULL || tmpdir[0] == '\0') {
        /* An empty TMPDIR would otherwise put the directory in "/" */
        tmpdir = "/tmp";
    } else if(strlen(tmpdir) > 1024) {
        printf("TMPDIR environmental variable is > 1024 bytes\n");
        return NULL;
    }

    /* tmpdir + '/' + template + terminating NUL */
    size_t path_size = strlen(tmpdir) + 1 + strlen(template) + 1;
    char *base_dir = calloc(path_size, 1);
    assert(base_dir);
    snprintf(base_dir, path_size, "%s/%s", tmpdir, template);

    /* mkdtemp() rewrites the XXXXXX part of base_dir in place */
    if(mkdtemp(base_dir) == NULL) {
        /* Report first: free() is allowed to change errno */
        perror("ERROR: ");
        free(base_dir);
        return NULL;
    }
    return base_dir;
}
159 
main(int argc,char * argv[])160 int main (int argc, char *argv[]) {
161     struct arguments arguments = {0};
162 
163     /* Defaults */
164     arguments.log_level = ZCK_LOG_ERROR;
165 
166     int retval = argp_parse (&argp, argc, argv, 0, 0, &arguments);
167     if(retval || arguments.exit)
168         exit(retval);
169 
170     zck_set_log_level(arguments.log_level);
171 
172     int src_fd = open(arguments.args[0], O_RDONLY);
173     if(src_fd < 0) {
174         dprintf(STDERR_FILENO, "Unable to open %s\n", arguments.args[0]);
175         perror("");
176         exit(1);
177     }
178     char *base_name = basename(arguments.args[0]);
179     // len .zck -> .zdict = +2 + \0 = +3
180     char *out_name = calloc(strlen(base_name) + 3, 1);
181     assert(out_name);
182     snprintf(out_name, strlen(base_name) - 3, "%s", base_name); //Strip off .zck
183 
184     char *dir = get_tmp_dir(arguments.dir);
185     if(dir == NULL) {
186         free(out_name);
187         exit(1);
188     }
189     bool good_exit = false;
190 
191     char *data = NULL;
192     zckCtx *zck = zck_create();
193     if(!zck_init_read(zck, src_fd)) {
194         dprintf(STDERR_FILENO, "%s", zck_get_error(zck));
195         goto error2;
196     }
197 
198     int ret = zck_validate_data_checksum(zck);
199     if(ret < 1) {
200         if(ret == -1)
201             dprintf(STDERR_FILENO, "Data checksum failed verification\n");
202         goto error2;
203     }
204 
205     for(zckChunk *idx=zck_get_first_chunk(zck); idx!=NULL;
206         idx=zck_get_next_chunk(idx)) {
207         // Skip dictionary
208         if(idx == zck_get_first_chunk(zck))
209             continue;
210         ssize_t chunk_size = zck_get_chunk_size(idx);
211         if(chunk_size < 0) {
212             dprintf(STDERR_FILENO, "%s", zck_get_error(zck));
213             goto error2;
214         }
215         data = calloc(chunk_size, 1);
216         assert(data);
217         ssize_t read_size = zck_get_chunk_data(idx, data, chunk_size);
218         if(read_size != chunk_size) {
219             if(read_size < 0)
220                 dprintf(STDERR_FILENO, "%s", zck_get_error(zck));
221             else
222                 dprintf(STDERR_FILENO,
223                         "Chunk %li size doesn't match expected size: %li != %li\n",
224                         zck_get_chunk_number(idx), read_size, chunk_size);
225             goto error2;
226         }
227 
228         char *dict_block = calloc(strlen(dir) + strlen(out_name) + 12, 1);
229         assert(dict_block);
230         snprintf(dict_block, strlen(dir) + strlen(out_name) + 12, "%s/%s.%li",
231                  dir, out_name, zck_get_chunk_number(idx));
232         int dst_fd = open(dict_block, O_TRUNC | O_WRONLY | O_CREAT, 0666);
233         if(dst_fd < 0) {
234             dprintf(STDERR_FILENO, "Unable to open %s", dict_block);
235             perror("");
236             free(dict_block);
237             goto error2;
238         }
239         if(write(dst_fd, data, chunk_size) != chunk_size) {
240             dprintf(STDERR_FILENO, "Error writing to %s\n", dict_block);
241             free(dict_block);
242             goto error2;
243         }
244         free(data);
245         close(dst_fd);
246         free(dict_block);
247     }
248     snprintf(out_name + strlen(base_name) - 4, 7, ".zdict");
249 
250     if(!zck_close(zck)) {
251         dprintf(STDERR_FILENO, "%s", zck_get_error(zck));
252         goto error2;
253     }
254 
255     /* Create dictionary */
256     int pid = fork();
257     if(pid == 0) {
258         execl("/usr/bin/zstd", "zstd", "--train", dir, "-r", "-o", out_name, NULL);
259         dprintf(STDERR_FILENO, "Unable to find /usr/bin/zstd\n");
260         exit(1);
261     }
262     int wstatus = 0;
263     int w = waitpid(pid, &wstatus, 0);
264     if (w == -1) {
265         dprintf(STDERR_FILENO, "Error waiting for zstd\n");
266         perror("");
267         goto error2;
268     }
269     if(WEXITSTATUS(wstatus) != 0) {
270         dprintf(STDERR_FILENO, "Error generating dict\n");
271         goto error2;
272     }
273 
274     /* Clean up temporary directory */
275     if(!arguments.dir) {
276         struct dirent *dp;
277         DIR *dfd;
278 
279         if ((dfd = opendir(dir)) == NULL) {
280             dprintf(STDERR_FILENO, "Unable to read %s\n", dir);
281             goto error2;
282         }
283 
284         bool err = false;
285         while((dp = readdir(dfd)) != NULL) {
286             if(dp->d_name[0] == '.')
287                 continue;
288             char *full_path = calloc(strlen(dir) + strlen(dp->d_name) + 2, 1);
289             snprintf(full_path, strlen(dir) + strlen(dp->d_name) + 2, "%s/%s",
290                      dir, dp->d_name);
291             if(unlink(full_path) != 0) {
292                 dprintf(STDERR_FILENO, "Unable to remove %s\n", full_path);
293                 perror("");
294                 err = true;
295             } else {
296                 if(arguments.log_level <= ZCK_LOG_INFO)
297                     dprintf(STDERR_FILENO, "Removed %s\n", full_path);
298             }
299             free(full_path);
300         }
301         closedir(dfd);
302         if(!err) {
303             if(rmdir(dir) != 0) {
304                 dprintf(STDERR_FILENO, "Unable to remove %s\n", dir);
305                 perror("");
306             }
307         } else {
308             dprintf(STDERR_FILENO, "Errors encountered, not removing %s\n",
309                     dir);
310         }
311     }
312     good_exit = true;
313 error2:
314     free(dir);
315     zck_free(&zck);
316     if(!good_exit)
317         unlink(out_name);
318     free(out_name);
319     close(src_fd);
320     if(!good_exit)
321         exit(1);
322     exit(0);
323 }
324