1 /*
2  * Copyright (c) Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #include "data.h"
12 
13 #include <assert.h>
14 #include <errno.h>
15 #include <stdio.h>
16 #include <string.h>
17 #include <stdlib.h>   /* free() */
18 
19 #include <sys/stat.h>
20 
21 #include <curl/curl.h>
22 
23 #include "mem.h"
24 #include "util.h"
25 #define XXH_STATIC_LINKING_ONLY
26 #include "xxhash.h"
27 
28 /**
29  * Data objects
30  */
31 
/* Builds the download URL of asset `x` in the zstd "regression-data" GitHub
 * release. `x` must be a string literal: it is appended by compile-time
 * string-literal concatenation. */
#define REGRESSION_RELEASE(x) \
    "https://github.com/facebook/zstd/releases/download/regression-data/" x
34 
/* "silesia" as a directory: for directory-type data, curl_data_create() pipes
 * the download through `zstd -dc | tar -x` into the cache directory.
 * xxhash64 is compared against the hash of the bytes received from the URL
 * (see curl_write() and curl_download_resource()). No dictionary. */
data_t silesia = {
    .name = "silesia",
    .type = data_type_dir,
    .data =
        {
            .url = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};
44 
/* "silesia.tar" as a single file: same URL and hash as `silesia`, but for
 * file-type data curl_data_create() decompresses the download with
 * `zstd -dqfo` to the object's path instead of extracting it. No dictionary. */
data_t silesia_tar = {
    .name = "silesia.tar",
    .type = data_type_file,
    .data =
        {
            .url = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};
54 
/* "github" as a directory, with a dictionary. Because .dict.url is set,
 * data_has_dict() is true: curl_download_datum() fetches the dictionary as a
 * file, and data_create_paths() places it at "<data path>.dict". Each resource
 * is verified against its own xxhash64. */
data_t github = {
    .name = "github",
    .type = data_type_dir,
    .data =
        {
            .url = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,

        },
};
70 
/* "github.tar" as a single file, with a dictionary: same URLs and hashes as
 * `github`, but the data resource is decompressed to one file at the object's
 * path rather than extracted into the cache directory. */
data_t github_tar = {
    .name = "github.tar",
    .type = data_type_file,
    .data =
        {
            .url = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,

        },
};
86 
/* NULL-terminated list of every data object defined in this file. It is kept
 * mutable because data_init() fills in the path members (via
 * data_create_paths()) and data_finish() releases them again. */
static data_t* g_data[] = {
    &silesia,
    &silesia_tar,
    &github,
    &github_tar,
    NULL,
};

/* Read-only view of g_data with external linkage; it points at the same
 * NULL-terminated array. */
data_t const* const* data = (data_t const* const*)g_data;
96 
97 /**
98  * data helpers.
99  */
100 
/** Returns 1 when the data object has a dictionary resource (its dict URL is
 *  set), and 0 otherwise. */
int data_has_dict(data_t const* data) {
    return (data->dict.url == NULL) ? 0 : 1;
}
104 
105 /**
106  * data buffer helper functions (documented in header).
107  */
108 
data_buffer_create(size_t const capacity)109 data_buffer_t data_buffer_create(size_t const capacity) {
110     data_buffer_t buffer = {};
111 
112     buffer.data = (uint8_t*)malloc(capacity);
113     if (buffer.data == NULL)
114         return buffer;
115     buffer.capacity = capacity;
116     return buffer;
117 }
118 
data_buffer_read(char const * filename)119 data_buffer_t data_buffer_read(char const* filename) {
120     data_buffer_t buffer = {};
121 
122     uint64_t const size = UTIL_getFileSize(filename);
123     if (size == UTIL_FILESIZE_UNKNOWN) {
124         fprintf(stderr, "unknown size for %s\n", filename);
125         return buffer;
126     }
127 
128     buffer.data = (uint8_t*)malloc(size);
129     if (buffer.data == NULL) {
130         fprintf(stderr, "malloc failed\n");
131         return buffer;
132     }
133     buffer.capacity = size;
134 
135     FILE* file = fopen(filename, "rb");
136     if (file == NULL) {
137         fprintf(stderr, "file null\n");
138         goto err;
139     }
140     buffer.size = fread(buffer.data, 1, buffer.capacity, file);
141     fclose(file);
142     if (buffer.size != buffer.capacity) {
143         fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
144         goto err;
145     }
146 
147     return buffer;
148 err:
149     free(buffer.data);
150     memset(&buffer, 0, sizeof(buffer));
151     return buffer;
152 }
153 
data_buffer_get_data(data_t const * data)154 data_buffer_t data_buffer_get_data(data_t const* data) {
155     data_buffer_t const kEmptyBuffer = {};
156 
157     if (data->type != data_type_file)
158         return kEmptyBuffer;
159 
160     return data_buffer_read(data->data.path);
161 }
162 
data_buffer_get_dict(data_t const * data)163 data_buffer_t data_buffer_get_dict(data_t const* data) {
164     data_buffer_t const kEmptyBuffer = {};
165 
166     if (!data_has_dict(data))
167         return kEmptyBuffer;
168 
169     return data_buffer_read(data->dict.path);
170 }
171 
/**
 * Orders two buffers by content.
 *
 * The common prefix is compared byte-wise; if it is equal, the shorter
 * buffer sorts first.
 *
 * @return the non-zero memcmp() result when the common prefix differs,
 *         otherwise -1, 0 or 1 according to the buffer sizes.
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const size =
        buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
    /* An empty buffer can carry data == NULL (e.g. the zero-initialized
     * buffers returned by the data_buffer_get_*() helpers), and passing a
     * null pointer to memcmp() is undefined even for a length of 0. */
    int const cmp = (size == 0) ? 0 : memcmp(buffer1.data, buffer2.data, size);

    if (cmp != 0)
        return cmp;
    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size > buffer2.size)
        return 1;
    return 0;
}
185 
/** Releases the storage held by `buffer`. A buffer whose data pointer is
 *  NULL is accepted, since free(NULL) does nothing. */
void data_buffer_free(data_buffer_t buffer) {
    void* const payload = buffer.data;
    free(payload);
}
189 
190 /**
191  * data filenames helpers.
192  */
193 
data_filenames_get(data_t const * data)194 FileNamesTable* data_filenames_get(data_t const* data)
195 {
196     char const* const path = data->data.path;
197     return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
198 }
199 
200 /**
201  * data buffers helpers.
202  */
203 
data_buffers_get(data_t const * data)204 data_buffers_t data_buffers_get(data_t const* data) {
205     data_buffers_t buffers = {.size = 0};
206     FileNamesTable* const filenames = data_filenames_get(data);
207     if (filenames == NULL) return buffers;
208     if (filenames->tableSize == 0) {
209         UTIL_freeFileNamesTable(filenames);
210         return buffers;
211     }
212 
213     data_buffer_t* buffersPtr =
214         (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
215     if (buffersPtr == NULL) {
216         UTIL_freeFileNamesTable(filenames);
217         return buffers;
218     }
219     buffers.buffers = (data_buffer_t const*)buffersPtr;
220     buffers.size = filenames->tableSize;
221 
222     for (size_t i = 0; i < filenames->tableSize; ++i) {
223         buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
224         if (buffersPtr[i].data == NULL) {
225             data_buffers_t const kEmptyBuffer = {};
226             data_buffers_free(buffers);
227             UTIL_freeFileNamesTable(filenames);
228             return kEmptyBuffer;
229         }
230     }
231 
232     UTIL_freeFileNamesTable(filenames);
233     return buffers;
234 }
235 
236 /**
237  * Frees the data buffers.
238  */
data_buffers_free(data_buffers_t buffers)239 void data_buffers_free(data_buffers_t buffers) {
240     free((data_buffer_t*)buffers.buffers);
241 }
242 
243 /**
244  * Initialization and download functions.
245  */
246 
/* Heap copy of the cache directory: set by data_init() (strdup of its
 * argument), then freed and reset to NULL by data_finish(). */
static char* g_data_dir = NULL;
248 
/**
 * Creates `indir` and every missing parent directory, like `mkdir -p`.
 * Directories are created with mode S_IRWXU.
 *
 * @param indir Path to create; must not be NULL.
 * @return 0 on success, EINVAL for an empty path, ENOMEM when the path
 *         cannot be duplicated, otherwise the errno of the failing mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    /* The scan below always steps over the first character, so an empty
     * path would be read past its terminator. Reject it up front. */
    if (indir[0] == '\0')
        return EINVAL;

    char* const dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    char* end = dir;
    int ret = 0;
    do {
        /* Find the next directory level. Starting at ++end keeps a leading
         * '/' attached to the first component. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        ret = mkdir(dir, S_IRWXU);
        int const mkdir_errno = errno; /* capture before anything can change it */
        *end = save;
        /* It's okay if the directory already exists. */
        if (ret == 0 || (mkdir_errno == EEXIST && isdir))
            continue;
        ret = mkdir_errno;
        fprintf(stderr, "mkdir() failed\n");
        goto out;
    } while (*end != '\0');

    ret = 0;
out:
    free(dir);
    return ret;
}
281 
/**
 * Joins str1, str2 and str3 (in that order) into a newly malloc()ed string.
 * str3 may be NULL, in which case it contributes nothing.
 * Returns NULL if the allocation fails; the caller frees the result.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    char const* const tail = (str3 != NULL) ? str3 : "";
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = strlen(tail);
    char* const joined = (char*)malloc(len1 + len2 + len3 + 1);
    char* cursor = joined;

    if (joined == NULL)
        return NULL;

    /* Each strcpy() overwrites the terminator left by the previous one. */
    strcpy(cursor, str1);
    cursor += len1;
    strcpy(cursor, str2);
    cursor += len2;
    strcpy(cursor, tail);

    assert(strlen(joined) == len1 + len2 + len3);
    return joined;
}
298 
/** Joins two strings into a newly allocated one by calling cat3() with a
 *  NULL third part. Returns NULL if the allocation fails. */
static char* cat2(char const* str1, char const* str2) {
    char* const joined = cat3(str1, str2, NULL);
    return joined;
}
302 
/**
 * State needed by the curl callback.
 * It takes data from curl, hashes it, and writes it to the file.
 */
typedef struct {
    FILE* file;             /* write end of the popen() pipe opened by curl_data_create() */
    XXH64_state_t xxhash64; /* running hash, updated by curl_write() with the bytes it wrote */
    int error;              /* 0, or an errno-style code set by curl_data_create() */
} curl_data_t;
312 
313 /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)314 static curl_data_t curl_data_create(
315     data_resource_t const* resource,
316     data_type_t type) {
317     curl_data_t cdata = {};
318 
319     XXH64_reset(&cdata.xxhash64, 0);
320 
321     assert(UTIL_isDirectory(g_data_dir));
322 
323     if (type == data_type_file) {
324         /* Decompress the resource and store to the path. */
325         char* cmd = cat3("zstd -dqfo '", resource->path, "'");
326         if (cmd == NULL) {
327             cdata.error = ENOMEM;
328             return cdata;
329         }
330         cdata.file = popen(cmd, "w");
331         free(cmd);
332     } else {
333         /* Decompress and extract the resource to the cache directory. */
334         char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
335         if (cmd == NULL) {
336             cdata.error = ENOMEM;
337             return cdata;
338         }
339         cdata.file = popen(cmd, "w");
340         free(cmd);
341     }
342     if (cdata.file == NULL) {
343         cdata.error = errno;
344     }
345 
346     return cdata;
347 }
348 
349 /** Free the curl state. */
curl_data_free(curl_data_t cdata)350 static int curl_data_free(curl_data_t cdata) {
351     return pclose(cdata.file);
352 }
353 
354 /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)355 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
356     curl_data_t* cdata = (curl_data_t*)ptr;
357     size_t const written = fwrite(data, size, count, cdata->file);
358     XXH64_update(&cdata->xxhash64, data, written * size);
359     return written;
360 }
361 
/**
 * Downloads one resource through `curl`, streaming it into the command
 * opened by curl_data_create(), then verifies the result.
 *
 * @return 0 on success; EINVAL if a curl option cannot be set or the
 *         checksum differs; the curl_data_create() error if the pipe cannot
 *         be opened; EIO if the transfer fails, the pipe reports failure, or
 *         the expected file/directory is missing afterwards.
 */
static int curl_download_resource(
    CURL* curl,
    data_resource_t const* resource,
    data_type_t type) {
    curl_data_t state;

    /* Point curl at the resource. Only the address of `state` is registered
     * here; it is filled in below, before the transfer starts. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &state) != 0)
        return EINVAL;

    state = curl_data_create(resource, type);
    if (state.error != 0)
        return state.error;

    /* Always close the pipe, even when the transfer itself failed. */
    int const transferErr = curl_easy_perform(curl);
    int const pipeErr = curl_data_free(state);

    if (transferErr != 0) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (pipeErr != 0) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* The consumer command must have produced the expected output. */
    if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
        fprintf(stderr, "output file '%s' does not exist\n", resource->path);
        return EIO;
    }
    if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
        fprintf(
            stderr, "output directory '%s' does not exist\n", resource->path);
        return EIO;
    }

    /* Compare the hash of the bytes that were written with the expected one. */
    XXH64_hash_t const actual = XXH64_digest(&state.xxhash64);
    if (actual != resource->xxhash64) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            (unsigned long long)actual,
            (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
411 
412 /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)413 static int curl_download_datum(CURL* curl, data_t const* data) {
414     int ret;
415     ret = curl_download_resource(curl, &data->data, data->type);
416     if (ret != 0)
417         return ret;
418     if (data_has_dict(data)) {
419         ret = curl_download_resource(curl, &data->dict, data_type_file);
420         if (ret != 0)
421             return ret;
422     }
423     return ret;
424 }
425 
426 /** Download all the data. */
curl_download_data(data_t const * const * data)427 static int curl_download_data(data_t const* const* data) {
428     if (curl_global_init(CURL_GLOBAL_ALL) != 0)
429         return EFAULT;
430 
431     curl_data_t cdata = {};
432     CURL* curl = curl_easy_init();
433     int err = EFAULT;
434 
435     if (curl == NULL)
436         return EFAULT;
437 
438     if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
439         goto out;
440     if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
441         goto out;
442     if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
443         goto out;
444 
445     assert(data != NULL);
446     for (; *data != NULL; ++data) {
447         if (curl_download_datum(curl, *data) != 0)
448             goto out;
449     }
450 
451     err = 0;
452 out:
453     curl_easy_cleanup(curl);
454     curl_global_cleanup();
455     return err;
456 }
457 
458 /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)459 static int data_create_paths(data_t* const* data, char const* dir) {
460     size_t const dirlen = strlen(dir);
461     assert(data != NULL);
462     for (; *data != NULL; ++data) {
463         data_t* const datum = *data;
464         datum->data.path = cat3(dir, "/", datum->name);
465         if (datum->data.path == NULL)
466             return ENOMEM;
467         if (data_has_dict(datum)) {
468             datum->dict.path = cat2(datum->data.path, ".dict");
469             if (datum->dict.path == NULL)
470                 return ENOMEM;
471         }
472     }
473     return 0;
474 }
475 
476 /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)477 static void data_free_paths(data_t* const* data) {
478     assert(data != NULL);
479     for (; *data != NULL; ++data) {
480         data_t* datum = *data;
481         free((void*)datum->data.path);
482         free((void*)datum->dict.path);
483         datum->data.path = NULL;
484         datum->dict.path = NULL;
485     }
486 }
487 
/* File name of the stamp inside the cache directory. The file holds the
 * canonical XXH64 of the data set description (see stamp_hash()). */
static char const kStampName[] = "STAMP";
489 
/** Feeds the 8 bytes of `data` into the hash in little-endian byte order,
 *  byte-swapping first on hosts that are not little-endian. */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
    XXH64_update(state, &le, sizeof(le));
}
495 
496 /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)497 static uint64_t stamp_hash(data_t const* const* data) {
498     XXH64_state_t state;
499 
500     XXH64_reset(&state, 0);
501     assert(data != NULL);
502     for (; *data != NULL; ++data) {
503         data_t const* datum = *data;
504         /* We don't care about the URL that we fetch from. */
505         /* The path is derived from the name. */
506         XXH64_update(&state, datum->name, strlen(datum->name));
507         xxh_update_le(&state, datum->data.xxhash64);
508         xxh_update_le(&state, datum->dict.xxhash64);
509         xxh_update_le(&state, datum->type);
510     }
511     return XXH64_digest(&state);
512 }
513 
514 /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)515 static int stamp_check(char const* dir, data_t const* const* data) {
516     char* stamp = cat3(dir, "/", kStampName);
517     uint64_t const expected = stamp_hash(data);
518     XXH64_canonical_t actual;
519     FILE* stampfile = NULL;
520     int matches = 0;
521 
522     if (stamp == NULL)
523         goto out;
524     if (!UTIL_isRegularFile(stamp)) {
525         fprintf(stderr, "stamp does not exist: recreating the data cache\n");
526         goto out;
527     }
528 
529     stampfile = fopen(stamp, "rb");
530     if (stampfile == NULL) {
531         fprintf(stderr, "could not open stamp: recreating the data cache\n");
532         goto out;
533     }
534 
535     size_t b;
536     if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
537         fprintf(stderr, "invalid stamp: recreating the data cache\n");
538         goto out;
539     }
540 
541     matches = (expected == XXH64_hashFromCanonical(&actual));
542     if (matches)
543         fprintf(stderr, "stamp matches: reusing the cached data\n");
544     else
545         fprintf(stderr, "stamp does not match: recreating the data cache\n");
546 
547 out:
548     free(stamp);
549     if (stampfile != NULL)
550         fclose(stampfile);
551     return matches;
552 }
553 
554 /** On success write a new stamp, on failure delete the old stamp. */
555 static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)556 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
557     char* stamp = cat3(dir, "/", kStampName);
558     FILE* stampfile = NULL;
559     int err = EIO;
560 
561     if (stamp == NULL)
562         return ENOMEM;
563 
564     if (data_err != 0) {
565         err = data_err;
566         goto out;
567     }
568     XXH64_canonical_t hash;
569 
570     XXH64_canonicalFromHash(&hash, stamp_hash(data));
571 
572     stampfile = fopen(stamp, "wb");
573     if (stampfile == NULL)
574         goto out;
575     if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
576         goto out;
577     err = 0;
578     fprintf(stderr, "stamped new data cache\n");
579 out:
580     if (err != 0)
581         /* Ignore errors. */
582         unlink(stamp);
583     free(stamp);
584     if (stampfile != NULL)
585         fclose(stampfile);
586     return err;
587 }
588 
data_init(char const * dir)589 int data_init(char const* dir) {
590     int err;
591 
592     if (dir == NULL)
593         return EINVAL;
594 
595     /* This must be first to simplify logic. */
596     err = ensure_directory_exists(dir);
597     if (err != 0)
598         return err;
599 
600     /* Save the cache directory. */
601     g_data_dir = strdup(dir);
602     if (g_data_dir == NULL)
603         return ENOMEM;
604 
605     err = data_create_paths(g_data, dir);
606     if (err != 0)
607         return err;
608 
609     /* If the stamp matches then we are good to go.
610      * This must be called before any modifications to the data cache.
611      * After this point, we MUST call stamp_write() to update the STAMP,
612      * since we've updated the data cache.
613      */
614     if (stamp_check(dir, data))
615         return 0;
616 
617     err = curl_download_data(data);
618     if (err != 0)
619         goto out;
620 
621 out:
622     /* This must be last, since it must know if data_init() succeeded. */
623     stamp_write(dir, data, err);
624     return err;
625 }
626 
data_finish(void)627 void data_finish(void) {
628     data_free_paths(g_data);
629     free(g_data_dir);
630     g_data_dir = NULL;
631 }
632