1 /*
2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11 #include "data.h"
12
13 #include <assert.h>
14 #include <errno.h>
15 #include <stdio.h>
16 #include <string.h>
17
18 #include <sys/stat.h>
19
20 #include <curl/curl.h>
21
22 #include "mem.h"
23 #include "util.h"
24 #define XXH_STATIC_LINKING_ONLY
25 #include "xxhash.h"
26
27 /**
28 * Data objects
29 */
30
/**
 * Expands to the URL of `asset` (a string literal) inside the
 * "regression-data" release, by string-literal concatenation.
 */
#define REGRESSION_RELEASE(asset) \
    "https://github.com/facebook/zstd/releases/download/regression-data/" asset
33
34 data_t silesia = {
35 .name = "silesia",
36 .type = data_type_dir,
37 .data =
38 {
39 .url = REGRESSION_RELEASE("silesia.tar.zst"),
40 .xxhash64 = 0x48a199f92f93e977LL,
41 },
42 };
43
44 data_t silesia_tar = {
45 .name = "silesia.tar",
46 .type = data_type_file,
47 .data =
48 {
49 .url = REGRESSION_RELEASE("silesia.tar.zst"),
50 .xxhash64 = 0x48a199f92f93e977LL,
51 },
52 };
53
54 data_t github = {
55 .name = "github",
56 .type = data_type_dir,
57 .data =
58 {
59 .url = REGRESSION_RELEASE("github.tar.zst"),
60 .xxhash64 = 0xa9b1b44b020df292LL,
61 },
62 .dict =
63 {
64 .url = REGRESSION_RELEASE("github.dict.zst"),
65 .xxhash64 = 0x1eddc6f737d3cb53LL,
66
67 },
68 };
69
70 static data_t* g_data[] = {
71 &silesia,
72 &silesia_tar,
73 &github,
74 NULL,
75 };
76
77 data_t const* const* data = (data_t const* const*)g_data;
78
79 /**
80 * data helpers.
81 */
82
/**
 * Reports whether `data` carries a dictionary resource.
 *
 * @returns 1 when the dictionary URL is set, 0 when it is NULL.
 */
int data_has_dict(data_t const* data) {
    if (data->dict.url == NULL)
        return 0;
    return 1;
}
86
87 /**
88 * data buffer helper functions (documented in header).
89 */
90
data_buffer_create(size_t const capacity)91 data_buffer_t data_buffer_create(size_t const capacity) {
92 data_buffer_t buffer = {};
93
94 buffer.data = (uint8_t*)malloc(capacity);
95 if (buffer.data == NULL)
96 return buffer;
97 buffer.capacity = capacity;
98 return buffer;
99 }
100
data_buffer_read(char const * filename)101 data_buffer_t data_buffer_read(char const* filename) {
102 data_buffer_t buffer = {};
103
104 uint64_t const size = UTIL_getFileSize(filename);
105 if (size == UTIL_FILESIZE_UNKNOWN) {
106 fprintf(stderr, "unknown size for %s\n", filename);
107 return buffer;
108 }
109
110 buffer.data = (uint8_t*)malloc(size);
111 if (buffer.data == NULL) {
112 fprintf(stderr, "malloc failed\n");
113 return buffer;
114 }
115 buffer.capacity = size;
116
117 FILE* file = fopen(filename, "rb");
118 if (file == NULL) {
119 fprintf(stderr, "file null\n");
120 goto err;
121 }
122 buffer.size = fread(buffer.data, 1, buffer.capacity, file);
123 fclose(file);
124 if (buffer.size != buffer.capacity) {
125 fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
126 goto err;
127 }
128
129 return buffer;
130 err:
131 free(buffer.data);
132 memset(&buffer, 0, sizeof(buffer));
133 return buffer;
134 }
135
data_buffer_get_data(data_t const * data)136 data_buffer_t data_buffer_get_data(data_t const* data) {
137 data_buffer_t const kEmptyBuffer = {};
138
139 if (data->type != data_type_file)
140 return kEmptyBuffer;
141
142 return data_buffer_read(data->data.path);
143 }
144
data_buffer_get_dict(data_t const * data)145 data_buffer_t data_buffer_get_dict(data_t const* data) {
146 data_buffer_t const kEmptyBuffer = {};
147
148 if (!data_has_dict(data))
149 return kEmptyBuffer;
150
151 return data_buffer_read(data->dict.path);
152 }
153
/**
 * Orders two buffers by content, then by length.
 *
 * @returns The memcmp() result over the common prefix when it is non-zero;
 *          otherwise -1, 0 or 1 when buffer1 is shorter than, as long as,
 *          or longer than buffer2.
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t common = buffer2.size;
    if (buffer1.size < buffer2.size)
        common = buffer1.size;

    int const order = memcmp(buffer1.data, buffer2.data, common);
    if (order != 0)
        return order;

    /* Identical prefixes: the sizes decide. */
    if (buffer1.size == buffer2.size)
        return 0;
    return buffer1.size < buffer2.size ? -1 : 1;
}
167
/** Releases the memory pointed to by buffer.data. */
void data_buffer_free(data_buffer_t buffer) {
    void* const storage = buffer.data;
    free(storage);
}
171
172 /**
173 * data filenames helpers.
174 */
175
data_filenames_get(data_t const * data)176 data_filenames_t data_filenames_get(data_t const* data) {
177 data_filenames_t filenames = {.buffer = NULL, .size = 0};
178 char const* path = data->data.path;
179
180 filenames.filenames = UTIL_createFileList(
181 &path,
182 1,
183 &filenames.buffer,
184 &filenames.size,
185 /* followLinks */ 0);
186 return filenames;
187 }
188
/** Hands the file list and its backing buffer to UTIL_freeFileList(). */
void data_filenames_free(data_filenames_t filenames) {
    UTIL_freeFileList(
        filenames.filenames,
        filenames.buffer);
}
192
193 /**
194 * data buffers helpers.
195 */
196
data_buffers_get(data_t const * data)197 data_buffers_t data_buffers_get(data_t const* data) {
198 data_buffers_t buffers = {.size = 0};
199 data_filenames_t filenames = data_filenames_get(data);
200 if (filenames.size == 0)
201 return buffers;
202
203 data_buffer_t* buffersPtr =
204 (data_buffer_t*)malloc(filenames.size * sizeof(data_buffer_t));
205 if (buffersPtr == NULL)
206 return buffers;
207 buffers.buffers = (data_buffer_t const*)buffersPtr;
208 buffers.size = filenames.size;
209
210 for (size_t i = 0; i < filenames.size; ++i) {
211 buffersPtr[i] = data_buffer_read(filenames.filenames[i]);
212 if (buffersPtr[i].data == NULL) {
213 data_buffers_t const kEmptyBuffer = {};
214 data_buffers_free(buffers);
215 return kEmptyBuffer;
216 }
217 }
218
219 return buffers;
220 }
221
222 /**
223 * Frees the data buffers.
224 */
data_buffers_free(data_buffers_t buffers)225 void data_buffers_free(data_buffers_t buffers) {
226 free((data_buffer_t*)buffers.buffers);
227 }
228
229 /**
230 * Initialization and download functions.
231 */
232
/* Cache directory: set by data_init(), released by data_finish().
 * Static storage starts out as NULL. */
static char* g_data_dir;
234
/**
 * Creates `indir` and every missing parent directory, like `mkdir -p`.
 * Directories are created with mode S_IRWXU.
 *
 * @returns 0 on success, ENOMEM when the path cannot be duplicated,
 *          EINVAL when the path is empty, otherwise the errno left by the
 *          mkdir() call that failed.
 */
static int ensure_directory_exists(char const* indir) {
    char* const dir = strdup(indir);
    char* end = dir;
    int ret = 0;

    if (dir == NULL)
        return ENOMEM;
    if (*dir == '\0') {
        /* Nothing to create, and the scan below starts at dir + 1, which
         * would step past the terminator of an empty string. */
        ret = EINVAL;
        goto out;
    }
    do {
        /* Find the next directory level. Starting at end + 1 skips a
         * leading '/' and the separator found by the previous pass. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the
         * string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        ret = mkdir(dir, S_IRWXU);
        *end = save;
        /* It's okay if the directory already exists. */
        if (ret == 0 || (errno == EEXIST && isdir))
            continue;
        ret = errno;
        fprintf(stderr, "mkdir() failed\n");
        goto out;
    } while (*end != '\0');

    ret = 0;
out:
    free(dir);
    return ret;
}
267
/**
 * Concatenate 3 strings into a new buffer.
 *
 * `str3` may be NULL, in which case only `str1` and `str2` are joined.
 *
 * @returns A malloc()ed NUL-terminated string, or NULL when the allocation
 *          fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;

    char* const out = (char*)malloc(len1 + len2 + len3 + 1);
    if (out == NULL)
        return NULL;

    /* Append each piece at a moving cursor, then terminate once. */
    char* cursor = out;
    memcpy(cursor, str1, len1);
    cursor += len1;
    memcpy(cursor, str2, len2);
    cursor += len2;
    if (str3 != NULL) {
        memcpy(cursor, str3, len3);
        cursor += len3;
    }
    *cursor = '\0';
    return out;
}
284
/** Concatenate 2 strings into a new buffer, by way of cat3(). */
static char* cat2(char const* str1, char const* str2) {
    char* const joined = cat3(str1, str2, NULL);
    return joined;
}
288
289 /**
290 * State needed by the curl callback.
291 * It takes data from curl, hashes it, and writes it to the file.
292 */
293 typedef struct {
294 FILE* file;
295 XXH64_state_t xxhash64;
296 int error;
297 } curl_data_t;
298
299 /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)300 static curl_data_t curl_data_create(
301 data_resource_t const* resource,
302 data_type_t type) {
303 curl_data_t cdata = {};
304
305 XXH64_reset(&cdata.xxhash64, 0);
306
307 assert(UTIL_isDirectory(g_data_dir));
308
309 if (type == data_type_file) {
310 /* Decompress the resource and store to the path. */
311 char* cmd = cat3("zstd -dqfo '", resource->path, "'");
312 if (cmd == NULL) {
313 cdata.error = ENOMEM;
314 return cdata;
315 }
316 cdata.file = popen(cmd, "w");
317 free(cmd);
318 } else {
319 /* Decompress and extract the resource to the cache directory. */
320 char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
321 if (cmd == NULL) {
322 cdata.error = ENOMEM;
323 return cdata;
324 }
325 cdata.file = popen(cmd, "w");
326 free(cmd);
327 }
328 if (cdata.file == NULL) {
329 cdata.error = errno;
330 }
331
332 return cdata;
333 }
334
335 /** Free the curl state. */
curl_data_free(curl_data_t cdata)336 static int curl_data_free(curl_data_t cdata) {
337 return pclose(cdata.file);
338 }
339
340 /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)341 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
342 curl_data_t* cdata = (curl_data_t*)ptr;
343 size_t const written = fwrite(data, size, count, cdata->file);
344 XXH64_update(&cdata->xxhash64, data, written * size);
345 return written;
346 }
347
/**
 * Downloads one resource through `curl`, then verifies the result.
 *
 * @returns 0 on success; EINVAL when a curl option cannot be set or the
 *          checksum does not match; the creation error when the curl state
 *          cannot be created; EIO when the transfer fails, the pipe reports
 *          a failure on close, or the expected output does not exist.
 */
static int curl_download_resource(
    CURL* curl,
    data_resource_t const* resource,
    data_type_t type) {
    curl_data_t state;

    /* Download the data. Only the address of `state` is registered here;
     * it is filled in just below, before the transfer starts. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0 ||
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &state) != 0)
        return EINVAL;

    state = curl_data_create(resource, type);
    if (state.error != 0)
        return state.error;

    /* Always close the pipe, even when the transfer failed. */
    int const download_failed = (curl_easy_perform(curl) != 0);
    int const close_failed = (curl_data_free(state) != 0);
    if (download_failed) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (close_failed) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* Check that the output exists. */
    if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
        fprintf(stderr, "output file '%s' does not exist\n", resource->path);
        return EIO;
    }
    if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
        fprintf(
            stderr, "output directory '%s' does not exist\n", resource->path);
        return EIO;
    }

    /* Check that the hash matches. */
    unsigned long long const actual =
        (unsigned long long)XXH64_digest(&state.xxhash64);
    if (actual != resource->xxhash64) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            actual,
            (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
397
398 /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)399 static int curl_download_datum(CURL* curl, data_t const* data) {
400 int ret;
401 ret = curl_download_resource(curl, &data->data, data->type);
402 if (ret != 0)
403 return ret;
404 if (data_has_dict(data)) {
405 ret = curl_download_resource(curl, &data->dict, data_type_file);
406 if (ret != 0)
407 return ret;
408 }
409 return ret;
410 }
411
412 /** Download all the data. */
curl_download_data(data_t const * const * data)413 static int curl_download_data(data_t const* const* data) {
414 if (curl_global_init(CURL_GLOBAL_ALL) != 0)
415 return EFAULT;
416
417 curl_data_t cdata = {};
418 CURL* curl = curl_easy_init();
419 int err = EFAULT;
420
421 if (curl == NULL)
422 return EFAULT;
423
424 if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
425 goto out;
426 if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
427 goto out;
428 if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
429 goto out;
430
431 assert(data != NULL);
432 for (; *data != NULL; ++data) {
433 if (curl_download_datum(curl, *data) != 0)
434 goto out;
435 }
436
437 err = 0;
438 out:
439 curl_easy_cleanup(curl);
440 curl_global_cleanup();
441 return err;
442 }
443
444 /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)445 static int data_create_paths(data_t* const* data, char const* dir) {
446 size_t const dirlen = strlen(dir);
447 assert(data != NULL);
448 for (; *data != NULL; ++data) {
449 data_t* const datum = *data;
450 datum->data.path = cat3(dir, "/", datum->name);
451 if (datum->data.path == NULL)
452 return ENOMEM;
453 if (data_has_dict(datum)) {
454 datum->dict.path = cat2(datum->data.path, ".dict");
455 if (datum->dict.path == NULL)
456 return ENOMEM;
457 }
458 }
459 return 0;
460 }
461
462 /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)463 static void data_free_paths(data_t* const* data) {
464 assert(data != NULL);
465 for (; *data != NULL; ++data) {
466 data_t* datum = *data;
467 free((void*)datum->data.path);
468 free((void*)datum->dict.path);
469 datum->data.path = NULL;
470 datum->dict.path = NULL;
471 }
472 }
473
/** File name of the stamp stored in the cache directory. */
static const char kStampName[] = "STAMP";
475
/**
 * Feeds the 8 bytes of `data` into `state`; the value is byte-swapped
 * first when MEM_isLittleEndian() reports a non-little-endian host.
 */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
    XXH64_update(state, &le, sizeof(le));
}
481
482 /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)483 static uint64_t stamp_hash(data_t const* const* data) {
484 XXH64_state_t state;
485
486 XXH64_reset(&state, 0);
487 assert(data != NULL);
488 for (; *data != NULL; ++data) {
489 data_t const* datum = *data;
490 /* We don't care about the URL that we fetch from. */
491 /* The path is derived from the name. */
492 XXH64_update(&state, datum->name, strlen(datum->name));
493 xxh_update_le(&state, datum->data.xxhash64);
494 xxh_update_le(&state, datum->dict.xxhash64);
495 xxh_update_le(&state, datum->type);
496 }
497 return XXH64_digest(&state);
498 }
499
500 /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)501 static int stamp_check(char const* dir, data_t const* const* data) {
502 char* stamp = cat3(dir, "/", kStampName);
503 uint64_t const expected = stamp_hash(data);
504 XXH64_canonical_t actual;
505 FILE* stampfile = NULL;
506 int matches = 0;
507
508 if (stamp == NULL)
509 goto out;
510 if (!UTIL_isRegularFile(stamp)) {
511 fprintf(stderr, "stamp does not exist: recreating the data cache\n");
512 goto out;
513 }
514
515 stampfile = fopen(stamp, "rb");
516 if (stampfile == NULL) {
517 fprintf(stderr, "could not open stamp: recreating the data cache\n");
518 goto out;
519 }
520
521 size_t b;
522 if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
523 fprintf(stderr, "invalid stamp: recreating the data cache\n");
524 goto out;
525 }
526
527 matches = (expected == XXH64_hashFromCanonical(&actual));
528 if (matches)
529 fprintf(stderr, "stamp matches: reusing the cached data\n");
530 else
531 fprintf(stderr, "stamp does not match: recreating the data cache\n");
532
533 out:
534 free(stamp);
535 if (stampfile != NULL)
536 fclose(stampfile);
537 return matches;
538 }
539
540 /** On success write a new stamp, on failure delete the old stamp. */
541 static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)542 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
543 char* stamp = cat3(dir, "/", kStampName);
544 FILE* stampfile = NULL;
545 int err = EIO;
546
547 if (stamp == NULL)
548 return ENOMEM;
549
550 if (data_err != 0) {
551 err = data_err;
552 goto out;
553 }
554 XXH64_canonical_t hash;
555
556 XXH64_canonicalFromHash(&hash, stamp_hash(data));
557
558 stampfile = fopen(stamp, "wb");
559 if (stampfile == NULL)
560 goto out;
561 if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
562 goto out;
563 err = 0;
564 fprintf(stderr, "stamped new data cache\n");
565 out:
566 if (err != 0)
567 /* Ignore errors. */
568 unlink(stamp);
569 free(stamp);
570 if (stampfile != NULL)
571 fclose(stampfile);
572 return err;
573 }
574
data_init(char const * dir)575 int data_init(char const* dir) {
576 int err;
577
578 if (dir == NULL)
579 return EINVAL;
580
581 /* This must be first to simplify logic. */
582 err = ensure_directory_exists(dir);
583 if (err != 0)
584 return err;
585
586 /* Save the cache directory. */
587 g_data_dir = strdup(dir);
588 if (g_data_dir == NULL)
589 return ENOMEM;
590
591 err = data_create_paths(g_data, dir);
592 if (err != 0)
593 return err;
594
595 /* If the stamp matches then we are good to go.
596 * This must be called before any modifications to the data cache.
597 * After this point, we MUST call stamp_write() to update the STAMP,
598 * since we've updated the data cache.
599 */
600 if (stamp_check(dir, data))
601 return 0;
602
603 err = curl_download_data(data);
604 if (err != 0)
605 goto out;
606
607 out:
608 /* This must be last, since it must know if data_init() succeeded. */
609 stamp_write(dir, data, err);
610 return err;
611 }
612
data_finish(void)613 void data_finish(void) {
614 data_free_paths(g_data);
615 free(g_data_dir);
616 g_data_dir = NULL;
617 }
618