1 /*
2  * Copyright (c) 2016-2020, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #include "data.h"
12 
13 #include <assert.h>
14 #include <errno.h>
15 #include <stdio.h>
16 #include <string.h>
17 
18 #include <sys/stat.h>
19 
20 #include <curl/curl.h>
21 
22 #include "mem.h"
23 #include "util.h"
24 #define XXH_STATIC_LINKING_ONLY
25 #include "xxhash.h"
26 
27 /**
28  * Data objects
29  */
30 
/**
 * Builds the download URL of asset `x` in the "regression-data" release.
 * The expansion relies on adjacent string literal concatenation, so `x` must
 * be a string literal.
 */
#define REGRESSION_RELEASE(x) \
    "https://github.com/facebook/zstd/releases/download/regression-data/" x
33 
34 data_t silesia = {
35     .name = "silesia",
36     .type = data_type_dir,
37     .data =
38         {
39             .url = REGRESSION_RELEASE("silesia.tar.zst"),
40             .xxhash64 = 0x48a199f92f93e977LL,
41         },
42 };
43 
44 data_t silesia_tar = {
45     .name = "silesia.tar",
46     .type = data_type_file,
47     .data =
48         {
49             .url = REGRESSION_RELEASE("silesia.tar.zst"),
50             .xxhash64 = 0x48a199f92f93e977LL,
51         },
52 };
53 
54 data_t github = {
55     .name = "github",
56     .type = data_type_dir,
57     .data =
58         {
59             .url = REGRESSION_RELEASE("github.tar.zst"),
60             .xxhash64 = 0xa9b1b44b020df292LL,
61         },
62     .dict =
63         {
64             .url = REGRESSION_RELEASE("github.dict.zst"),
65             .xxhash64 = 0x1eddc6f737d3cb53LL,
66 
67         },
68 };
69 
/** NULL-terminated list of every data object defined in this file. */
static data_t* g_data[] = {
    &silesia,
    &silesia_tar,
    &github,
    NULL,
};

/** Non-static, const-qualified view of g_data. */
data_t const* const* data = (data_t const* const*)g_data;
78 
79 /**
80  * data helpers.
81  */
82 
/** Returns 1 when `data` has a dictionary resource (its dict URL is set), else 0. */
int data_has_dict(data_t const* data) {
    if (data->dict.url == NULL) {
        return 0;
    }
    return 1;
}
86 
87 /**
88  * data buffer helper functions (documented in header).
89  */
90 
data_buffer_create(size_t const capacity)91 data_buffer_t data_buffer_create(size_t const capacity) {
92     data_buffer_t buffer = {};
93 
94     buffer.data = (uint8_t*)malloc(capacity);
95     if (buffer.data == NULL)
96         return buffer;
97     buffer.capacity = capacity;
98     return buffer;
99 }
100 
data_buffer_read(char const * filename)101 data_buffer_t data_buffer_read(char const* filename) {
102     data_buffer_t buffer = {};
103 
104     uint64_t const size = UTIL_getFileSize(filename);
105     if (size == UTIL_FILESIZE_UNKNOWN) {
106         fprintf(stderr, "unknown size for %s\n", filename);
107         return buffer;
108     }
109 
110     buffer.data = (uint8_t*)malloc(size);
111     if (buffer.data == NULL) {
112         fprintf(stderr, "malloc failed\n");
113         return buffer;
114     }
115     buffer.capacity = size;
116 
117     FILE* file = fopen(filename, "rb");
118     if (file == NULL) {
119         fprintf(stderr, "file null\n");
120         goto err;
121     }
122     buffer.size = fread(buffer.data, 1, buffer.capacity, file);
123     fclose(file);
124     if (buffer.size != buffer.capacity) {
125         fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
126         goto err;
127     }
128 
129     return buffer;
130 err:
131     free(buffer.data);
132     memset(&buffer, 0, sizeof(buffer));
133     return buffer;
134 }
135 
data_buffer_get_data(data_t const * data)136 data_buffer_t data_buffer_get_data(data_t const* data) {
137     data_buffer_t const kEmptyBuffer = {};
138 
139     if (data->type != data_type_file)
140         return kEmptyBuffer;
141 
142     return data_buffer_read(data->data.path);
143 }
144 
data_buffer_get_dict(data_t const * data)145 data_buffer_t data_buffer_get_dict(data_t const* data) {
146     data_buffer_t const kEmptyBuffer = {};
147 
148     if (!data_has_dict(data))
149         return kEmptyBuffer;
150 
151     return data_buffer_read(data->dict.path);
152 }
153 
/**
 * Orders two buffers like memcmp() over their common prefix; when the prefix
 * is equal, the shorter buffer sorts first.
 *
 * @returns a negative value, 0, or a positive value when buffer1 is less
 *          than, equal to, or greater than buffer2.
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const size =
        buffer1.size < buffer2.size ? buffer1.size : buffer2.size;

    /* A zeroed buffer has data == NULL, and passing NULL to memcmp() is
     * undefined even for a length of 0, so only compare actual bytes. */
    if (size > 0) {
        int const cmp = memcmp(buffer1.data, buffer2.data, size);
        if (cmp != 0)
            return cmp;
    }
    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size > buffer2.size)
        return 1;
    return 0;
}
167 
/** Releases the memory held by `buffer`. A NULL data pointer is fine: free(NULL) is a no-op. */
void data_buffer_free(data_buffer_t buffer) {
    void* const storage = buffer.data;
    free(storage);
}
171 
172 /**
173  * data filenames helpers.
174  */
175 
data_filenames_get(data_t const * data)176 FileNamesTable* data_filenames_get(data_t const* data)
177 {
178     char const* const path = data->data.path;
179     return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
180 }
181 
182 /**
183  * data buffers helpers.
184  */
185 
data_buffers_get(data_t const * data)186 data_buffers_t data_buffers_get(data_t const* data) {
187     data_buffers_t buffers = {.size = 0};
188     FileNamesTable* const filenames = data_filenames_get(data);
189     if (filenames == NULL) return buffers;
190     if (filenames->tableSize == 0) {
191         UTIL_freeFileNamesTable(filenames);
192         return buffers;
193     }
194 
195     data_buffer_t* buffersPtr =
196         (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
197     if (buffersPtr == NULL) {
198         UTIL_freeFileNamesTable(filenames);
199         return buffers;
200     }
201     buffers.buffers = (data_buffer_t const*)buffersPtr;
202     buffers.size = filenames->tableSize;
203 
204     for (size_t i = 0; i < filenames->tableSize; ++i) {
205         buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
206         if (buffersPtr[i].data == NULL) {
207             data_buffers_t const kEmptyBuffer = {};
208             data_buffers_free(buffers);
209             UTIL_freeFileNamesTable(filenames);
210             return kEmptyBuffer;
211         }
212     }
213 
214     UTIL_freeFileNamesTable(filenames);
215     return buffers;
216 }
217 
218 /**
219  * Frees the data buffers.
220  */
data_buffers_free(data_buffers_t buffers)221 void data_buffers_free(data_buffers_t buffers) {
222     free((data_buffer_t*)buffers.buffers);
223 }
224 
225 /**
226  * Initialization and download functions.
227  */
228 
229 static char* g_data_dir = NULL;
230 
/**
 * Creates `indir` and every missing parent directory, like `mkdir -p`.
 * Directories are created with mode S_IRWXU.
 *
 * @returns 0 on success, EINVAL for a NULL or empty path, ENOMEM when the
 *          path cannot be duplicated, otherwise the errno left by the
 *          failing mkdir() call.
 */
static int ensure_directory_exists(char const* indir) {
    /* The scan below starts one character in, so an empty string would be
     * read past its terminator. */
    if (indir == NULL || indir[0] == '\0')
        return EINVAL;

    char* const dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    int ret = 0;
    char* end = dir;
    do {
        /* Find the next directory level. Starting at end + 1 skips the
         * separator the previous pass stopped on (or a leading '/'). */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        int const mkdir_ret = mkdir(dir, S_IRWXU);
        /* Capture errno right away, before anything else can change it. */
        int const mkdir_errno = errno;
        *end = save;
        /* It's okay if the directory already exists. */
        if (mkdir_ret != 0 && !(mkdir_errno == EEXIST && isdir)) {
            ret = mkdir_errno;
            fprintf(stderr, "mkdir() failed\n");
            break;
        }
    } while (*end != '\0');

    free(dir);
    return ret;
}
263 
/**
 * Joins up to three strings into one newly allocated string.
 * `str3` may be NULL, in which case only `str1` and `str2` are joined.
 *
 * @returns the malloc()ed result, or NULL if the allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;

    /* One extra byte for the terminator. */
    char* const out = (char*)malloc(total + 1);
    if (out == NULL)
        return NULL;

    char* cursor = out;
    memcpy(cursor, str1, len1);
    cursor += len1;
    memcpy(cursor, str2, len2);
    cursor += len2;
    if (str3 != NULL) {
        memcpy(cursor, str3, len3);
        cursor += len3;
    }
    *cursor = '\0';

    assert(strlen(out) == total);
    return out;
}
280 
/**
 * Concatenates two strings into a new buffer by calling cat3() with no third
 * string. Returns whatever cat3() returns.
 */
static char* cat2(char const* str1, char const* str2) {
    char const* const no_third = NULL;
    return cat3(str1, str2, no_third);
}
284 
/**
 * State needed by the curl callback.
 * It takes data from curl, hashes it, and writes it to the file.
 */
typedef struct {
    /* Stream opened with popen(); the callback writes received bytes to it. */
    FILE* file;
    /* Running hash of the bytes written so far. */
    XXH64_state_t xxhash64;
    /* 0, or an errno-style code set when creating the state failed. */
    int error;
} curl_data_t;
294 
295 /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)296 static curl_data_t curl_data_create(
297     data_resource_t const* resource,
298     data_type_t type) {
299     curl_data_t cdata = {};
300 
301     XXH64_reset(&cdata.xxhash64, 0);
302 
303     assert(UTIL_isDirectory(g_data_dir));
304 
305     if (type == data_type_file) {
306         /* Decompress the resource and store to the path. */
307         char* cmd = cat3("zstd -dqfo '", resource->path, "'");
308         if (cmd == NULL) {
309             cdata.error = ENOMEM;
310             return cdata;
311         }
312         cdata.file = popen(cmd, "w");
313         free(cmd);
314     } else {
315         /* Decompress and extract the resource to the cache directory. */
316         char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
317         if (cmd == NULL) {
318             cdata.error = ENOMEM;
319             return cdata;
320         }
321         cdata.file = popen(cmd, "w");
322         free(cmd);
323     }
324     if (cdata.file == NULL) {
325         cdata.error = errno;
326     }
327 
328     return cdata;
329 }
330 
331 /** Free the curl state. */
curl_data_free(curl_data_t cdata)332 static int curl_data_free(curl_data_t cdata) {
333     return pclose(cdata.file);
334 }
335 
336 /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)337 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
338     curl_data_t* cdata = (curl_data_t*)ptr;
339     size_t const written = fwrite(data, size, count, cdata->file);
340     XXH64_update(&cdata->xxhash64, data, written * size);
341     return written;
342 }
343 
/**
 * Downloads one resource through `curl`, pipes it into the command opened by
 * curl_data_create(), and verifies the result.
 *
 * @returns 0 on success; EINVAL if a curl option cannot be set or the
 *          checksum does not match; the error from curl_data_create(); EIO
 *          if the transfer fails, the pipe reports failure on close, or the
 *          expected file/directory does not exist afterwards.
 */
static int curl_download_resource(
    CURL* curl,
    data_resource_t const* resource,
    data_type_t type) {
    curl_data_t cdata;

    /* Only the address of cdata is registered here; it is filled in below,
     * before the transfer starts. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0 ||
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0) {
        return EINVAL;
    }

    cdata = curl_data_create(resource, type);
    if (cdata.error != 0) {
        return cdata.error;
    }

    /* The pipe is closed whether or not the transfer succeeded. */
    int const transfer_status = curl_easy_perform(curl);
    int const pipe_status = curl_data_free(cdata);

    if (transfer_status) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (pipe_status) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* The command must have produced the expected kind of output. */
    if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
        fprintf(stderr, "output file '%s' does not exist\n", resource->path);
        return EIO;
    }
    if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
        fprintf(
            stderr, "output directory '%s' does not exist\n", resource->path);
        return EIO;
    }

    /* Finally compare the hash of the downloaded bytes with the expected one. */
    unsigned long long const actual_hash =
        (unsigned long long)XXH64_digest(&cdata.xxhash64);
    unsigned long long const expected_hash =
        (unsigned long long)resource->xxhash64;
    if (actual_hash != expected_hash) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            actual_hash,
            expected_hash);
        return EINVAL;
    }

    return 0;
}
393 
394 /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)395 static int curl_download_datum(CURL* curl, data_t const* data) {
396     int ret;
397     ret = curl_download_resource(curl, &data->data, data->type);
398     if (ret != 0)
399         return ret;
400     if (data_has_dict(data)) {
401         ret = curl_download_resource(curl, &data->dict, data_type_file);
402         if (ret != 0)
403             return ret;
404     }
405     return ret;
406 }
407 
408 /** Download all the data. */
curl_download_data(data_t const * const * data)409 static int curl_download_data(data_t const* const* data) {
410     if (curl_global_init(CURL_GLOBAL_ALL) != 0)
411         return EFAULT;
412 
413     curl_data_t cdata = {};
414     CURL* curl = curl_easy_init();
415     int err = EFAULT;
416 
417     if (curl == NULL)
418         return EFAULT;
419 
420     if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
421         goto out;
422     if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
423         goto out;
424     if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
425         goto out;
426 
427     assert(data != NULL);
428     for (; *data != NULL; ++data) {
429         if (curl_download_datum(curl, *data) != 0)
430             goto out;
431     }
432 
433     err = 0;
434 out:
435     curl_easy_cleanup(curl);
436     curl_global_cleanup();
437     return err;
438 }
439 
440 /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)441 static int data_create_paths(data_t* const* data, char const* dir) {
442     size_t const dirlen = strlen(dir);
443     assert(data != NULL);
444     for (; *data != NULL; ++data) {
445         data_t* const datum = *data;
446         datum->data.path = cat3(dir, "/", datum->name);
447         if (datum->data.path == NULL)
448             return ENOMEM;
449         if (data_has_dict(datum)) {
450             datum->dict.path = cat2(datum->data.path, ".dict");
451             if (datum->dict.path == NULL)
452                 return ENOMEM;
453         }
454     }
455     return 0;
456 }
457 
458 /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)459 static void data_free_paths(data_t* const* data) {
460     assert(data != NULL);
461     for (; *data != NULL; ++data) {
462         data_t* datum = *data;
463         free((void*)datum->data.path);
464         free((void*)datum->dict.path);
465         datum->data.path = NULL;
466         datum->dict.path = NULL;
467     }
468 }
469 
/** Name of the stamp file, which lives directly inside the cache directory. */
static char const kStampName[] = "STAMP";
471 
/**
 * Feeds the 8 bytes of `data` into the hash in little-endian byte order:
 * the value is byte-swapped first on hosts that are not little-endian.
 */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
    XXH64_update(state, &le, sizeof(le));
}
477 
478 /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)479 static uint64_t stamp_hash(data_t const* const* data) {
480     XXH64_state_t state;
481 
482     XXH64_reset(&state, 0);
483     assert(data != NULL);
484     for (; *data != NULL; ++data) {
485         data_t const* datum = *data;
486         /* We don't care about the URL that we fetch from. */
487         /* The path is derived from the name. */
488         XXH64_update(&state, datum->name, strlen(datum->name));
489         xxh_update_le(&state, datum->data.xxhash64);
490         xxh_update_le(&state, datum->dict.xxhash64);
491         xxh_update_le(&state, datum->type);
492     }
493     return XXH64_digest(&state);
494 }
495 
496 /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)497 static int stamp_check(char const* dir, data_t const* const* data) {
498     char* stamp = cat3(dir, "/", kStampName);
499     uint64_t const expected = stamp_hash(data);
500     XXH64_canonical_t actual;
501     FILE* stampfile = NULL;
502     int matches = 0;
503 
504     if (stamp == NULL)
505         goto out;
506     if (!UTIL_isRegularFile(stamp)) {
507         fprintf(stderr, "stamp does not exist: recreating the data cache\n");
508         goto out;
509     }
510 
511     stampfile = fopen(stamp, "rb");
512     if (stampfile == NULL) {
513         fprintf(stderr, "could not open stamp: recreating the data cache\n");
514         goto out;
515     }
516 
517     size_t b;
518     if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
519         fprintf(stderr, "invalid stamp: recreating the data cache\n");
520         goto out;
521     }
522 
523     matches = (expected == XXH64_hashFromCanonical(&actual));
524     if (matches)
525         fprintf(stderr, "stamp matches: reusing the cached data\n");
526     else
527         fprintf(stderr, "stamp does not match: recreating the data cache\n");
528 
529 out:
530     free(stamp);
531     if (stampfile != NULL)
532         fclose(stampfile);
533     return matches;
534 }
535 
536 /** On success write a new stamp, on failure delete the old stamp. */
537 static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)538 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
539     char* stamp = cat3(dir, "/", kStampName);
540     FILE* stampfile = NULL;
541     int err = EIO;
542 
543     if (stamp == NULL)
544         return ENOMEM;
545 
546     if (data_err != 0) {
547         err = data_err;
548         goto out;
549     }
550     XXH64_canonical_t hash;
551 
552     XXH64_canonicalFromHash(&hash, stamp_hash(data));
553 
554     stampfile = fopen(stamp, "wb");
555     if (stampfile == NULL)
556         goto out;
557     if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
558         goto out;
559     err = 0;
560     fprintf(stderr, "stamped new data cache\n");
561 out:
562     if (err != 0)
563         /* Ignore errors. */
564         unlink(stamp);
565     free(stamp);
566     if (stampfile != NULL)
567         fclose(stampfile);
568     return err;
569 }
570 
data_init(char const * dir)571 int data_init(char const* dir) {
572     int err;
573 
574     if (dir == NULL)
575         return EINVAL;
576 
577     /* This must be first to simplify logic. */
578     err = ensure_directory_exists(dir);
579     if (err != 0)
580         return err;
581 
582     /* Save the cache directory. */
583     g_data_dir = strdup(dir);
584     if (g_data_dir == NULL)
585         return ENOMEM;
586 
587     err = data_create_paths(g_data, dir);
588     if (err != 0)
589         return err;
590 
591     /* If the stamp matches then we are good to go.
592      * This must be called before any modifications to the data cache.
593      * After this point, we MUST call stamp_write() to update the STAMP,
594      * since we've updated the data cache.
595      */
596     if (stamp_check(dir, data))
597         return 0;
598 
599     err = curl_download_data(data);
600     if (err != 0)
601         goto out;
602 
603 out:
604     /* This must be last, since it must know if data_init() succeeded. */
605     stamp_write(dir, data, err);
606     return err;
607 }
608 
data_finish(void)609 void data_finish(void) {
610     data_free_paths(g_data);
611     free(g_data_dir);
612     g_data_dir = NULL;
613 }
614