1 /*
2  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #include "data.h"
12 
13 #include <assert.h>
14 #include <errno.h>
15 #include <stdio.h>
16 #include <string.h>
17 
18 #include <sys/stat.h>
19 
20 #include <curl/curl.h>
21 
22 #include "mem.h"
23 #include "util.h"
24 #define XXH_STATIC_LINKING_ONLY
25 #include "xxhash.h"
26 
27 /**
28  * Data objects
29  */
30 
31 #define REGRESSION_RELEASE(x) \
32     "https://github.com/facebook/zstd/releases/download/regression-data/" x
33 
34 data_t silesia = {
35     .name = "silesia",
36     .type = data_type_dir,
37     .data =
38         {
39             .url = REGRESSION_RELEASE("silesia.tar.zst"),
40             .xxhash64 = 0x48a199f92f93e977LL,
41         },
42 };
43 
44 data_t silesia_tar = {
45     .name = "silesia.tar",
46     .type = data_type_file,
47     .data =
48         {
49             .url = REGRESSION_RELEASE("silesia.tar.zst"),
50             .xxhash64 = 0x48a199f92f93e977LL,
51         },
52 };
53 
54 data_t github = {
55     .name = "github",
56     .type = data_type_dir,
57     .data =
58         {
59             .url = REGRESSION_RELEASE("github.tar.zst"),
60             .xxhash64 = 0xa9b1b44b020df292LL,
61         },
62     .dict =
63         {
64             .url = REGRESSION_RELEASE("github.dict.zst"),
65             .xxhash64 = 0x1eddc6f737d3cb53LL,
66 
67         },
68 };
69 
70 static data_t* g_data[] = {
71     &silesia,
72     &silesia_tar,
73     &github,
74     NULL,
75 };
76 
77 data_t const* const* data = (data_t const* const*)g_data;
78 
79 /**
80  * data helpers.
81  */
82 
/**
 * Reports whether the data object has a dictionary.
 *
 * @returns 1 when `data->dict.url` is set, 0 when it is NULL.
 */
int data_has_dict(data_t const* data) {
    if (data->dict.url == NULL)
        return 0;
    return 1;
}
86 
87 /**
88  * data buffer helper functions (documented in header).
89  */
90 
data_buffer_create(size_t const capacity)91 data_buffer_t data_buffer_create(size_t const capacity) {
92     data_buffer_t buffer = {};
93 
94     buffer.data = (uint8_t*)malloc(capacity);
95     if (buffer.data == NULL)
96         return buffer;
97     buffer.capacity = capacity;
98     return buffer;
99 }
100 
data_buffer_read(char const * filename)101 data_buffer_t data_buffer_read(char const* filename) {
102     data_buffer_t buffer = {};
103 
104     uint64_t const size = UTIL_getFileSize(filename);
105     if (size == UTIL_FILESIZE_UNKNOWN) {
106         fprintf(stderr, "unknown size for %s\n", filename);
107         return buffer;
108     }
109 
110     buffer.data = (uint8_t*)malloc(size);
111     if (buffer.data == NULL) {
112         fprintf(stderr, "malloc failed\n");
113         return buffer;
114     }
115     buffer.capacity = size;
116 
117     FILE* file = fopen(filename, "rb");
118     if (file == NULL) {
119         fprintf(stderr, "file null\n");
120         goto err;
121     }
122     buffer.size = fread(buffer.data, 1, buffer.capacity, file);
123     fclose(file);
124     if (buffer.size != buffer.capacity) {
125         fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
126         goto err;
127     }
128 
129     return buffer;
130 err:
131     free(buffer.data);
132     memset(&buffer, 0, sizeof(buffer));
133     return buffer;
134 }
135 
data_buffer_get_data(data_t const * data)136 data_buffer_t data_buffer_get_data(data_t const* data) {
137     data_buffer_t const kEmptyBuffer = {};
138 
139     if (data->type != data_type_file)
140         return kEmptyBuffer;
141 
142     return data_buffer_read(data->data.path);
143 }
144 
data_buffer_get_dict(data_t const * data)145 data_buffer_t data_buffer_get_dict(data_t const* data) {
146     data_buffer_t const kEmptyBuffer = {};
147 
148     if (!data_has_dict(data))
149         return kEmptyBuffer;
150 
151     return data_buffer_read(data->dict.path);
152 }
153 
/**
 * Orders two buffers bytewise, shorter-is-smaller on a shared prefix.
 *
 * @returns The memcmp() result over the common prefix when it is non-zero.
 *          Otherwise -1, 0 or 1 as buffer1 is shorter than, the same length
 *          as, or longer than buffer2.
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const common =
        buffer1.size < buffer2.size ? buffer1.size : buffer2.size;

    /* An empty buffer may have a NULL data pointer, and handing NULL to
     * memcmp() is undefined even for a zero length, so skip the call. */
    if (common != 0) {
        int const cmp = memcmp(buffer1.data, buffer2.data, common);
        if (cmp != 0)
            return cmp;
    }

    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size > buffer2.size)
        return 1;
    return 0;
}
167 
/** Releases the storage referenced by `buffer.data`. */
void data_buffer_free(data_buffer_t buffer) {
    void* const storage = buffer.data;
    free(storage);
}
171 
172 /**
173  * data filenames helpers.
174  */
175 
data_filenames_get(data_t const * data)176 data_filenames_t data_filenames_get(data_t const* data) {
177     data_filenames_t filenames = {.buffer = NULL, .size = 0};
178     char const* path = data->data.path;
179 
180     filenames.filenames = UTIL_createFileList(
181         &path,
182         1,
183         &filenames.buffer,
184         &filenames.size,
185         /* followLinks */ 0);
186     return filenames;
187 }
188 
/** Hands the filename table and its backing buffer to UTIL_freeFileList(). */
void data_filenames_free(data_filenames_t filenames) {
    UTIL_freeFileList(
        filenames.filenames,
        filenames.buffer);
}
192 
193 /**
194  * data buffers helpers.
195  */
196 
data_buffers_get(data_t const * data)197 data_buffers_t data_buffers_get(data_t const* data) {
198     data_buffers_t buffers = {.size = 0};
199     data_filenames_t filenames = data_filenames_get(data);
200     if (filenames.size == 0)
201         return buffers;
202 
203     data_buffer_t* buffersPtr =
204         (data_buffer_t*)malloc(filenames.size * sizeof(data_buffer_t));
205     if (buffersPtr == NULL)
206         return buffers;
207     buffers.buffers = (data_buffer_t const*)buffersPtr;
208     buffers.size = filenames.size;
209 
210     for (size_t i = 0; i < filenames.size; ++i) {
211         buffersPtr[i] = data_buffer_read(filenames.filenames[i]);
212         if (buffersPtr[i].data == NULL) {
213             data_buffers_t const kEmptyBuffer = {};
214             data_buffers_free(buffers);
215             return kEmptyBuffer;
216         }
217     }
218 
219     return buffers;
220 }
221 
222 /**
223  * Frees the data buffers.
224  */
data_buffers_free(data_buffers_t buffers)225 void data_buffers_free(data_buffers_t buffers) {
226     free((data_buffer_t*)buffers.buffers);
227 }
228 
229 /**
230  * Initialization and download functions.
231  */
232 
/* Heap copy of the cache directory path: set by data_init(), freed and reset
 * to NULL by data_finish(). */
static char* g_data_dir = NULL;
234 
/**
 * Creates `indir` and each missing parent directory, like `mkdir -p`.
 *
 * Path components that already exist as directories are accepted. New
 * directories are created with mode S_IRWXU.
 *
 * @returns 0 on success, EINVAL for an empty path, ENOMEM when the path
 *          cannot be copied, otherwise the errno left by the failing
 *          mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    /* An empty path names nothing to create. Rejecting it also keeps the
     * scan below from stepping past the string terminator. */
    if (indir[0] == '\0')
        return EINVAL;

    char* const dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    int ret = 0;
    char* end = dir;
    do {
        /* Find the next directory level. The scan starts one character past
         * the current position, so the separator we stopped on (or a leading
         * '/') is not matched again. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        int const failed = mkdir(dir, S_IRWXU) != 0;
        /* Capture errno right away, before anything else can change it. */
        int const mkdir_errno = errno;
        *end = save;
        /* It's okay if the directory already exists. */
        if (failed && !(mkdir_errno == EEXIST && isdir)) {
            ret = mkdir_errno;
            fprintf(stderr, "mkdir() failed\n");
            break;
        }
    } while (*end != '\0');

    free(dir);
    return ret;
}
267 
/**
 * Joins up to three strings into one newly allocated string.
 *
 * `str3` may be NULL, in which case only `str1` and `str2` are joined.
 *
 * @returns The concatenation, to be released with free(), or NULL when the
 *          allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;

    char* const out = (char*)malloc(total + 1);
    if (out == NULL)
        return NULL;

    /* Append each piece at a moving cursor, then terminate once. */
    char* cursor = out;
    memcpy(cursor, str1, len1);
    cursor += len1;
    memcpy(cursor, str2, len2);
    cursor += len2;
    if (len3 != 0) {
        memcpy(cursor, str3, len3);
        cursor += len3;
    }
    *cursor = '\0';

    assert(strlen(out) == total);
    return out;
}
284 
/** Joins two strings into a new buffer by calling cat3() with no third part. */
static char* cat2(char const* str1, char const* str2) {
    char const* const no_suffix = NULL;
    return cat3(str1, str2, no_suffix);
}
288 
/**
 * State needed by the curl callback.
 * It takes data from curl, hashes it, and writes it to the file.
 */
typedef struct {
    /* Stream opened with popen(); the downloaded bytes are written to it. */
    FILE* file;
    /* Running XXH64 state over the bytes written to `file`. */
    XXH64_state_t xxhash64;
    /* 0, or an errno value recorded while the state was being created. */
    int error;
} curl_data_t;
298 
299 /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)300 static curl_data_t curl_data_create(
301     data_resource_t const* resource,
302     data_type_t type) {
303     curl_data_t cdata = {};
304 
305     XXH64_reset(&cdata.xxhash64, 0);
306 
307     assert(UTIL_isDirectory(g_data_dir));
308 
309     if (type == data_type_file) {
310         /* Decompress the resource and store to the path. */
311         char* cmd = cat3("zstd -dqfo '", resource->path, "'");
312         if (cmd == NULL) {
313             cdata.error = ENOMEM;
314             return cdata;
315         }
316         cdata.file = popen(cmd, "w");
317         free(cmd);
318     } else {
319         /* Decompress and extract the resource to the cache directory. */
320         char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
321         if (cmd == NULL) {
322             cdata.error = ENOMEM;
323             return cdata;
324         }
325         cdata.file = popen(cmd, "w");
326         free(cmd);
327     }
328     if (cdata.file == NULL) {
329         cdata.error = errno;
330     }
331 
332     return cdata;
333 }
334 
335 /** Free the curl state. */
curl_data_free(curl_data_t cdata)336 static int curl_data_free(curl_data_t cdata) {
337     return pclose(cdata.file);
338 }
339 
340 /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)341 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
342     curl_data_t* cdata = (curl_data_t*)ptr;
343     size_t const written = fwrite(data, size, count, cdata->file);
344     XXH64_update(&cdata->xxhash64, data, written * size);
345     return written;
346 }
347 
/**
 * Downloads one resource through `curl`, piping it to the consumer created
 * by curl_data_create(), then verifies the output and the checksum.
 *
 * @returns 0 on success. EINVAL when a curl option cannot be set or the
 *          checksum differs, the curl_data_create() error when the consumer
 *          cannot be started, and EIO when the transfer, the consumer, or
 *          the output check fails.
 */
static int curl_download_resource(
    CURL* curl,
    data_resource_t const* resource,
    data_type_t type) {
    curl_data_t state;

    /* Point the transfer at the resource and at our callback state. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &state) != 0)
        return EINVAL;

    state = curl_data_create(resource, type);
    if (state.error != 0)
        return state.error;

    /* The stream is closed whether or not the transfer succeeded. */
    int const transfer_failed = curl_easy_perform(curl);
    int const pipe_failed = curl_data_free(state);

    if (transfer_failed) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (pipe_failed) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* The consumer must have produced the expected kind of output. */
    if (type == data_type_file) {
        if (!UTIL_isRegularFile(resource->path)) {
            fprintf(
                stderr, "output file '%s' does not exist\n", resource->path);
            return EIO;
        }
    } else if (type == data_type_dir) {
        if (!UTIL_isDirectory(resource->path)) {
            fprintf(
                stderr,
                "output directory '%s' does not exist\n",
                resource->path);
            return EIO;
        }
    }

    /* The hash of the downloaded bytes must match the recorded one. */
    XXH64_hash_t const actual = XXH64_digest(&state.xxhash64);
    if (actual != resource->xxhash64) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            (unsigned long long)actual,
            (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
397 
398 /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)399 static int curl_download_datum(CURL* curl, data_t const* data) {
400     int ret;
401     ret = curl_download_resource(curl, &data->data, data->type);
402     if (ret != 0)
403         return ret;
404     if (data_has_dict(data)) {
405         ret = curl_download_resource(curl, &data->dict, data_type_file);
406         if (ret != 0)
407             return ret;
408     }
409     return ret;
410 }
411 
412 /** Download all the data. */
curl_download_data(data_t const * const * data)413 static int curl_download_data(data_t const* const* data) {
414     if (curl_global_init(CURL_GLOBAL_ALL) != 0)
415         return EFAULT;
416 
417     curl_data_t cdata = {};
418     CURL* curl = curl_easy_init();
419     int err = EFAULT;
420 
421     if (curl == NULL)
422         return EFAULT;
423 
424     if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
425         goto out;
426     if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
427         goto out;
428     if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
429         goto out;
430 
431     assert(data != NULL);
432     for (; *data != NULL; ++data) {
433         if (curl_download_datum(curl, *data) != 0)
434             goto out;
435     }
436 
437     err = 0;
438 out:
439     curl_easy_cleanup(curl);
440     curl_global_cleanup();
441     return err;
442 }
443 
444 /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)445 static int data_create_paths(data_t* const* data, char const* dir) {
446     size_t const dirlen = strlen(dir);
447     assert(data != NULL);
448     for (; *data != NULL; ++data) {
449         data_t* const datum = *data;
450         datum->data.path = cat3(dir, "/", datum->name);
451         if (datum->data.path == NULL)
452             return ENOMEM;
453         if (data_has_dict(datum)) {
454             datum->dict.path = cat2(datum->data.path, ".dict");
455             if (datum->dict.path == NULL)
456                 return ENOMEM;
457         }
458     }
459     return 0;
460 }
461 
462 /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)463 static void data_free_paths(data_t* const* data) {
464     assert(data != NULL);
465     for (; *data != NULL; ++data) {
466         data_t* datum = *data;
467         free((void*)datum->data.path);
468         free((void*)datum->dict.path);
469         datum->data.path = NULL;
470         datum->dict.path = NULL;
471     }
472 }
473 
/* File name of the stamp that stamp_check() and stamp_write() look for
 * directly inside the cache directory. */
static char const kStampName[] = "STAMP";
475 
/**
 * Feeds a 64-bit value into the hash state, byte-swapping it first on hosts
 * that are not little-endian.
 */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
    XXH64_update(state, &le, sizeof(le));
}
481 
482 /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)483 static uint64_t stamp_hash(data_t const* const* data) {
484     XXH64_state_t state;
485 
486     XXH64_reset(&state, 0);
487     assert(data != NULL);
488     for (; *data != NULL; ++data) {
489         data_t const* datum = *data;
490         /* We don't care about the URL that we fetch from. */
491         /* The path is derived from the name. */
492         XXH64_update(&state, datum->name, strlen(datum->name));
493         xxh_update_le(&state, datum->data.xxhash64);
494         xxh_update_le(&state, datum->dict.xxhash64);
495         xxh_update_le(&state, datum->type);
496     }
497     return XXH64_digest(&state);
498 }
499 
500 /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)501 static int stamp_check(char const* dir, data_t const* const* data) {
502     char* stamp = cat3(dir, "/", kStampName);
503     uint64_t const expected = stamp_hash(data);
504     XXH64_canonical_t actual;
505     FILE* stampfile = NULL;
506     int matches = 0;
507 
508     if (stamp == NULL)
509         goto out;
510     if (!UTIL_isRegularFile(stamp)) {
511         fprintf(stderr, "stamp does not exist: recreating the data cache\n");
512         goto out;
513     }
514 
515     stampfile = fopen(stamp, "rb");
516     if (stampfile == NULL) {
517         fprintf(stderr, "could not open stamp: recreating the data cache\n");
518         goto out;
519     }
520 
521     size_t b;
522     if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
523         fprintf(stderr, "invalid stamp: recreating the data cache\n");
524         goto out;
525     }
526 
527     matches = (expected == XXH64_hashFromCanonical(&actual));
528     if (matches)
529         fprintf(stderr, "stamp matches: reusing the cached data\n");
530     else
531         fprintf(stderr, "stamp does not match: recreating the data cache\n");
532 
533 out:
534     free(stamp);
535     if (stampfile != NULL)
536         fclose(stampfile);
537     return matches;
538 }
539 
540 /** On success write a new stamp, on failure delete the old stamp. */
541 static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)542 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
543     char* stamp = cat3(dir, "/", kStampName);
544     FILE* stampfile = NULL;
545     int err = EIO;
546 
547     if (stamp == NULL)
548         return ENOMEM;
549 
550     if (data_err != 0) {
551         err = data_err;
552         goto out;
553     }
554     XXH64_canonical_t hash;
555 
556     XXH64_canonicalFromHash(&hash, stamp_hash(data));
557 
558     stampfile = fopen(stamp, "wb");
559     if (stampfile == NULL)
560         goto out;
561     if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
562         goto out;
563     err = 0;
564     fprintf(stderr, "stamped new data cache\n");
565 out:
566     if (err != 0)
567         /* Ignore errors. */
568         unlink(stamp);
569     free(stamp);
570     if (stampfile != NULL)
571         fclose(stampfile);
572     return err;
573 }
574 
data_init(char const * dir)575 int data_init(char const* dir) {
576     int err;
577 
578     if (dir == NULL)
579         return EINVAL;
580 
581     /* This must be first to simplify logic. */
582     err = ensure_directory_exists(dir);
583     if (err != 0)
584         return err;
585 
586     /* Save the cache directory. */
587     g_data_dir = strdup(dir);
588     if (g_data_dir == NULL)
589         return ENOMEM;
590 
591     err = data_create_paths(g_data, dir);
592     if (err != 0)
593         return err;
594 
595     /* If the stamp matches then we are good to go.
596      * This must be called before any modifications to the data cache.
597      * After this point, we MUST call stamp_write() to update the STAMP,
598      * since we've updated the data cache.
599      */
600     if (stamp_check(dir, data))
601         return 0;
602 
603     err = curl_download_data(data);
604     if (err != 0)
605         goto out;
606 
607 out:
608     /* This must be last, since it must know if data_init() succeeded. */
609     stamp_write(dir, data, err);
610     return err;
611 }
612 
data_finish(void)613 void data_finish(void) {
614     data_free_paths(g_data);
615     free(g_data_dir);
616     g_data_dir = NULL;
617 }
618