1 /*
2 * Copyright (c) Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11 #include "data.h"
12
13 #include <assert.h>
14 #include <errno.h>
15 #include <stdio.h>
16 #include <string.h>
17 #include <stdlib.h> /* free() */
18
19 #include <sys/stat.h>
20
21 #include <curl/curl.h>
22
23 #include "mem.h"
24 #include "util.h"
25 #define XXH_STATIC_LINKING_ONLY
26 #include "xxhash.h"
27
28 /**
29 * Data objects
30 */
31
/** Common URL prefix of every regression-data release asset. */
#define REGRESSION_RELEASE_BASE \
    "https://github.com/facebook/zstd/releases/download/regression-data/"

/**
 * Builds the download URL of release asset `x`.
 * `x` must be a string literal: the URL is formed by literal concatenation.
 */
#define REGRESSION_RELEASE(x) REGRESSION_RELEASE_BASE x
34
35 data_t silesia = {
36 .name = "silesia",
37 .type = data_type_dir,
38 .data =
39 {
40 .url = REGRESSION_RELEASE("silesia.tar.zst"),
41 .xxhash64 = 0x48a199f92f93e977LL,
42 },
43 };
44
45 data_t silesia_tar = {
46 .name = "silesia.tar",
47 .type = data_type_file,
48 .data =
49 {
50 .url = REGRESSION_RELEASE("silesia.tar.zst"),
51 .xxhash64 = 0x48a199f92f93e977LL,
52 },
53 };
54
55 data_t github = {
56 .name = "github",
57 .type = data_type_dir,
58 .data =
59 {
60 .url = REGRESSION_RELEASE("github.tar.zst"),
61 .xxhash64 = 0xa9b1b44b020df292LL,
62 },
63 .dict =
64 {
65 .url = REGRESSION_RELEASE("github.dict.zst"),
66 .xxhash64 = 0x1eddc6f737d3cb53LL,
67
68 },
69 };
70
71 data_t github_tar = {
72 .name = "github.tar",
73 .type = data_type_file,
74 .data =
75 {
76 .url = REGRESSION_RELEASE("github.tar.zst"),
77 .xxhash64 = 0xa9b1b44b020df292LL,
78 },
79 .dict =
80 {
81 .url = REGRESSION_RELEASE("github.dict.zst"),
82 .xxhash64 = 0x1eddc6f737d3cb53LL,
83
84 },
85 };
86
87 static data_t* g_data[] = {
88 &silesia,
89 &silesia_tar,
90 &github,
91 &github_tar,
92 NULL,
93 };
94
95 data_t const* const* data = (data_t const* const*)g_data;
96
97 /**
98 * data helpers.
99 */
100
/**
 * Reports whether `data` comes with a dictionary.
 *
 * @return 1 when the dictionary resource has a URL, 0 otherwise.
 */
int data_has_dict(data_t const* data) {
    if (data->dict.url == NULL)
        return 0;
    return 1;
}
104
105 /**
106 * data buffer helper functions (documented in header).
107 */
108
data_buffer_create(size_t const capacity)109 data_buffer_t data_buffer_create(size_t const capacity) {
110 data_buffer_t buffer = {};
111
112 buffer.data = (uint8_t*)malloc(capacity);
113 if (buffer.data == NULL)
114 return buffer;
115 buffer.capacity = capacity;
116 return buffer;
117 }
118
data_buffer_read(char const * filename)119 data_buffer_t data_buffer_read(char const* filename) {
120 data_buffer_t buffer = {};
121
122 uint64_t const size = UTIL_getFileSize(filename);
123 if (size == UTIL_FILESIZE_UNKNOWN) {
124 fprintf(stderr, "unknown size for %s\n", filename);
125 return buffer;
126 }
127
128 buffer.data = (uint8_t*)malloc(size);
129 if (buffer.data == NULL) {
130 fprintf(stderr, "malloc failed\n");
131 return buffer;
132 }
133 buffer.capacity = size;
134
135 FILE* file = fopen(filename, "rb");
136 if (file == NULL) {
137 fprintf(stderr, "file null\n");
138 goto err;
139 }
140 buffer.size = fread(buffer.data, 1, buffer.capacity, file);
141 fclose(file);
142 if (buffer.size != buffer.capacity) {
143 fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
144 goto err;
145 }
146
147 return buffer;
148 err:
149 free(buffer.data);
150 memset(&buffer, 0, sizeof(buffer));
151 return buffer;
152 }
153
data_buffer_get_data(data_t const * data)154 data_buffer_t data_buffer_get_data(data_t const* data) {
155 data_buffer_t const kEmptyBuffer = {};
156
157 if (data->type != data_type_file)
158 return kEmptyBuffer;
159
160 return data_buffer_read(data->data.path);
161 }
162
data_buffer_get_dict(data_t const * data)163 data_buffer_t data_buffer_get_dict(data_t const* data) {
164 data_buffer_t const kEmptyBuffer = {};
165
166 if (!data_has_dict(data))
167 return kEmptyBuffer;
168
169 return data_buffer_read(data->dict.path);
170 }
171
/**
 * Orders two buffers lexicographically by content, then by length.
 *
 * @return The memcmp() result of the common prefix when it differs;
 *         otherwise -1, 0 or 1 when buffer1 is shorter than, as long as, or
 *         longer than buffer2.
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const common =
            buffer1.size < buffer2.size ? buffer1.size : buffer2.size;

    /* An empty buffer may carry a NULL data pointer, and passing NULL to
     * memcmp() is undefined even for a zero length, so skip the call. */
    if (common != 0) {
        int const cmp = memcmp(buffer1.data, buffer2.data, common);
        if (cmp != 0)
            return cmp;
    }
    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size > buffer2.size)
        return 1;
    return 0;
}
185
/** Releases the storage owned by `buffer`. A NULL data pointer is fine. */
void data_buffer_free(data_buffer_t buffer) {
    void* const storage = buffer.data;
    free(storage);
}
189
190 /**
191 * data filenames helpers.
192 */
193
data_filenames_get(data_t const * data)194 FileNamesTable* data_filenames_get(data_t const* data)
195 {
196 char const* const path = data->data.path;
197 return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
198 }
199
200 /**
201 * data buffers helpers.
202 */
203
data_buffers_get(data_t const * data)204 data_buffers_t data_buffers_get(data_t const* data) {
205 data_buffers_t buffers = {.size = 0};
206 FileNamesTable* const filenames = data_filenames_get(data);
207 if (filenames == NULL) return buffers;
208 if (filenames->tableSize == 0) {
209 UTIL_freeFileNamesTable(filenames);
210 return buffers;
211 }
212
213 data_buffer_t* buffersPtr =
214 (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
215 if (buffersPtr == NULL) {
216 UTIL_freeFileNamesTable(filenames);
217 return buffers;
218 }
219 buffers.buffers = (data_buffer_t const*)buffersPtr;
220 buffers.size = filenames->tableSize;
221
222 for (size_t i = 0; i < filenames->tableSize; ++i) {
223 buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
224 if (buffersPtr[i].data == NULL) {
225 data_buffers_t const kEmptyBuffer = {};
226 data_buffers_free(buffers);
227 UTIL_freeFileNamesTable(filenames);
228 return kEmptyBuffer;
229 }
230 }
231
232 UTIL_freeFileNamesTable(filenames);
233 return buffers;
234 }
235
236 /**
237 * Frees the data buffers.
238 */
data_buffers_free(data_buffers_t buffers)239 void data_buffers_free(data_buffers_t buffers) {
240 free((data_buffer_t*)buffers.buffers);
241 }
242
243 /**
244 * Initialization and download functions.
245 */
246
/** Heap copy of the cache directory: set by data_init(), freed by data_finish(). */
static char* g_data_dir;
248
/**
 * mkdir -p: creates `indir` and every missing parent directory.
 *
 * Directories are created with mode S_IRWXU. A component that already exists
 * as a directory is accepted.
 *
 * @return 0 on success, EINVAL for a NULL or empty path, ENOMEM when the
 *         working copy of the path cannot be allocated, otherwise the errno
 *         reported by the failing mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    char* dir;
    char* end;
    int ret = 0;

    /* The scan below always steps past the first character, so an empty
     * string would be read beyond its terminator. Reject it up front. */
    if (indir == NULL || indir[0] == '\0')
        return EINVAL;

    dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    end = dir;
    do {
        /* Find the next directory level. Stepping first skips a leading '/'
         * as well as the separator left over from the previous round. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        int const rc = mkdir(dir, S_IRWXU);
        /* Capture errno before anything else has a chance to change it. */
        int const mkdir_errno = errno;
        *end = save;
        /* It's okay if the directory already exists. */
        if (rc != 0 && !(mkdir_errno == EEXIST && isdir)) {
            ret = mkdir_errno;
            fprintf(stderr, "mkdir() failed\n");
            break;
        }
    } while (*end != '\0');

    free(dir);
    return ret;
}
281
/**
 * Concatenate 3 strings into a new buffer.
 *
 * `str3` may be NULL, in which case it contributes nothing.
 *
 * @return A malloc()ed, NUL-terminated string the caller must free(), or
 *         NULL when the allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;
    char* const out = (char*)malloc(total + 1);
    char* cursor = out;

    if (out == NULL)
        return NULL;

    /* Append each piece at the cursor, then terminate once at the end. */
    memcpy(cursor, str1, len1);
    cursor += len1;
    memcpy(cursor, str2, len2);
    cursor += len2;
    if (str3 != NULL) {
        memcpy(cursor, str3, len3);
        cursor += len3;
    }
    *cursor = '\0';

    assert(strlen(out) == total);
    return out;
}
298
/**
 * Concatenate 2 strings into a new buffer.
 *
 * @return A malloc()ed string the caller must free(), or NULL when the
 *         allocation fails.
 */
static char* cat2(char const* str1, char const* str2) {
    /* cat3() ignores a NULL third piece. */
    char* const joined = cat3(str1, str2, NULL);
    return joined;
}
302
303 /**
304 * State needed by the curl callback.
305 * It takes data from curl, hashes it, and writes it to the file.
306 */
307 typedef struct {
308 FILE* file;
309 XXH64_state_t xxhash64;
310 int error;
311 } curl_data_t;
312
313 /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)314 static curl_data_t curl_data_create(
315 data_resource_t const* resource,
316 data_type_t type) {
317 curl_data_t cdata = {};
318
319 XXH64_reset(&cdata.xxhash64, 0);
320
321 assert(UTIL_isDirectory(g_data_dir));
322
323 if (type == data_type_file) {
324 /* Decompress the resource and store to the path. */
325 char* cmd = cat3("zstd -dqfo '", resource->path, "'");
326 if (cmd == NULL) {
327 cdata.error = ENOMEM;
328 return cdata;
329 }
330 cdata.file = popen(cmd, "w");
331 free(cmd);
332 } else {
333 /* Decompress and extract the resource to the cache directory. */
334 char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
335 if (cmd == NULL) {
336 cdata.error = ENOMEM;
337 return cdata;
338 }
339 cdata.file = popen(cmd, "w");
340 free(cmd);
341 }
342 if (cdata.file == NULL) {
343 cdata.error = errno;
344 }
345
346 return cdata;
347 }
348
349 /** Free the curl state. */
curl_data_free(curl_data_t cdata)350 static int curl_data_free(curl_data_t cdata) {
351 return pclose(cdata.file);
352 }
353
354 /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)355 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
356 curl_data_t* cdata = (curl_data_t*)ptr;
357 size_t const written = fwrite(data, size, count, cdata->file);
358 XXH64_update(&cdata->xxhash64, data, written * size);
359 return written;
360 }
361
/**
 * Downloads one resource through `curl`, feeding it to the decompression
 * pipe, then verifies the result.
 *
 * @param curl     Easy handle; its URL and write-data options are set here.
 * @param resource Resource to fetch (url, path and expected xxhash64).
 * @param type     Selects how the download is unpacked and which kind of
 *                 output (regular file or directory) must exist afterwards.
 * @return 0 on success; EINVAL when a curl option cannot be set or the
 *         checksum differs; the creation error of the pipe state; EIO when
 *         the transfer, the pipe, or the output check fails.
 */
static int curl_download_resource(
        CURL* curl,
        data_resource_t const* resource,
        data_type_t type) {
    curl_data_t state;
    int transfer_err;
    int pipe_err;
    unsigned long long digest;

    /* Download the data. The address of `state` is registered first. The
     * state itself is only filled in afterwards. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &state) != 0)
        return EINVAL;

    state = curl_data_create(resource, type);
    if (state.error != 0)
        return state.error;

    /* Always close the pipe, even when the transfer failed. */
    transfer_err = curl_easy_perform(curl);
    pipe_err = curl_data_free(state);

    if (transfer_err != 0) {
        fprintf(
                stderr,
                "downloading '%s' for '%s' failed\n",
                resource->url,
                resource->path);
        return EIO;
    }
    if (pipe_err != 0) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* Check that the expected output exists. */
    if (type == data_type_file) {
        if (!UTIL_isRegularFile(resource->path)) {
            fprintf(stderr, "output file '%s' does not exist\n", resource->path);
            return EIO;
        }
    } else if (type == data_type_dir) {
        if (!UTIL_isDirectory(resource->path)) {
            fprintf(
                    stderr, "output directory '%s' does not exist\n", resource->path);
            return EIO;
        }
    }

    /* Check that the hash matches. */
    digest = (unsigned long long)XXH64_digest(&state.xxhash64);
    if (digest != (unsigned long long)resource->xxhash64) {
        fprintf(
                stderr,
                "checksum does not match: 0x%llxLL != 0x%llxLL\n",
                digest,
                (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
411
412 /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)413 static int curl_download_datum(CURL* curl, data_t const* data) {
414 int ret;
415 ret = curl_download_resource(curl, &data->data, data->type);
416 if (ret != 0)
417 return ret;
418 if (data_has_dict(data)) {
419 ret = curl_download_resource(curl, &data->dict, data_type_file);
420 if (ret != 0)
421 return ret;
422 }
423 return ret;
424 }
425
426 /** Download all the data. */
curl_download_data(data_t const * const * data)427 static int curl_download_data(data_t const* const* data) {
428 if (curl_global_init(CURL_GLOBAL_ALL) != 0)
429 return EFAULT;
430
431 curl_data_t cdata = {};
432 CURL* curl = curl_easy_init();
433 int err = EFAULT;
434
435 if (curl == NULL)
436 return EFAULT;
437
438 if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
439 goto out;
440 if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
441 goto out;
442 if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
443 goto out;
444
445 assert(data != NULL);
446 for (; *data != NULL; ++data) {
447 if (curl_download_datum(curl, *data) != 0)
448 goto out;
449 }
450
451 err = 0;
452 out:
453 curl_easy_cleanup(curl);
454 curl_global_cleanup();
455 return err;
456 }
457
458 /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)459 static int data_create_paths(data_t* const* data, char const* dir) {
460 size_t const dirlen = strlen(dir);
461 assert(data != NULL);
462 for (; *data != NULL; ++data) {
463 data_t* const datum = *data;
464 datum->data.path = cat3(dir, "/", datum->name);
465 if (datum->data.path == NULL)
466 return ENOMEM;
467 if (data_has_dict(datum)) {
468 datum->dict.path = cat2(datum->data.path, ".dict");
469 if (datum->dict.path == NULL)
470 return ENOMEM;
471 }
472 }
473 return 0;
474 }
475
476 /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)477 static void data_free_paths(data_t* const* data) {
478 assert(data != NULL);
479 for (; *data != NULL; ++data) {
480 data_t* datum = *data;
481 free((void*)datum->data.path);
482 free((void*)datum->dict.path);
483 datum->data.path = NULL;
484 datum->dict.path = NULL;
485 }
486 }
487
/** File name of the stamp stored inside the cache directory. */
static const char kStampName[] = "STAMP";
489
/**
 * Feeds a 64-bit value into the hash in little-endian byte order, so the
 * digest does not depend on the host endianness.
 */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);

    XXH64_update(state, &le, sizeof(le));
}
495
496 /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)497 static uint64_t stamp_hash(data_t const* const* data) {
498 XXH64_state_t state;
499
500 XXH64_reset(&state, 0);
501 assert(data != NULL);
502 for (; *data != NULL; ++data) {
503 data_t const* datum = *data;
504 /* We don't care about the URL that we fetch from. */
505 /* The path is derived from the name. */
506 XXH64_update(&state, datum->name, strlen(datum->name));
507 xxh_update_le(&state, datum->data.xxhash64);
508 xxh_update_le(&state, datum->dict.xxhash64);
509 xxh_update_le(&state, datum->type);
510 }
511 return XXH64_digest(&state);
512 }
513
514 /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)515 static int stamp_check(char const* dir, data_t const* const* data) {
516 char* stamp = cat3(dir, "/", kStampName);
517 uint64_t const expected = stamp_hash(data);
518 XXH64_canonical_t actual;
519 FILE* stampfile = NULL;
520 int matches = 0;
521
522 if (stamp == NULL)
523 goto out;
524 if (!UTIL_isRegularFile(stamp)) {
525 fprintf(stderr, "stamp does not exist: recreating the data cache\n");
526 goto out;
527 }
528
529 stampfile = fopen(stamp, "rb");
530 if (stampfile == NULL) {
531 fprintf(stderr, "could not open stamp: recreating the data cache\n");
532 goto out;
533 }
534
535 size_t b;
536 if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
537 fprintf(stderr, "invalid stamp: recreating the data cache\n");
538 goto out;
539 }
540
541 matches = (expected == XXH64_hashFromCanonical(&actual));
542 if (matches)
543 fprintf(stderr, "stamp matches: reusing the cached data\n");
544 else
545 fprintf(stderr, "stamp does not match: recreating the data cache\n");
546
547 out:
548 free(stamp);
549 if (stampfile != NULL)
550 fclose(stampfile);
551 return matches;
552 }
553
554 /** On success write a new stamp, on failure delete the old stamp. */
555 static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)556 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
557 char* stamp = cat3(dir, "/", kStampName);
558 FILE* stampfile = NULL;
559 int err = EIO;
560
561 if (stamp == NULL)
562 return ENOMEM;
563
564 if (data_err != 0) {
565 err = data_err;
566 goto out;
567 }
568 XXH64_canonical_t hash;
569
570 XXH64_canonicalFromHash(&hash, stamp_hash(data));
571
572 stampfile = fopen(stamp, "wb");
573 if (stampfile == NULL)
574 goto out;
575 if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
576 goto out;
577 err = 0;
578 fprintf(stderr, "stamped new data cache\n");
579 out:
580 if (err != 0)
581 /* Ignore errors. */
582 unlink(stamp);
583 free(stamp);
584 if (stampfile != NULL)
585 fclose(stampfile);
586 return err;
587 }
588
data_init(char const * dir)589 int data_init(char const* dir) {
590 int err;
591
592 if (dir == NULL)
593 return EINVAL;
594
595 /* This must be first to simplify logic. */
596 err = ensure_directory_exists(dir);
597 if (err != 0)
598 return err;
599
600 /* Save the cache directory. */
601 g_data_dir = strdup(dir);
602 if (g_data_dir == NULL)
603 return ENOMEM;
604
605 err = data_create_paths(g_data, dir);
606 if (err != 0)
607 return err;
608
609 /* If the stamp matches then we are good to go.
610 * This must be called before any modifications to the data cache.
611 * After this point, we MUST call stamp_write() to update the STAMP,
612 * since we've updated the data cache.
613 */
614 if (stamp_check(dir, data))
615 return 0;
616
617 err = curl_download_data(data);
618 if (err != 0)
619 goto out;
620
621 out:
622 /* This must be last, since it must know if data_init() succeeded. */
623 stamp_write(dir, data, err);
624 return err;
625 }
626
data_finish(void)627 void data_finish(void) {
628 data_free_paths(g_data);
629 free(g_data_dir);
630 g_data_dir = NULL;
631 }
632