1 /* Utility functions for writing WARC files.
2 Copyright (C) 2011-2012, 2015, 2018-2021 Free Software Foundation,
3 Inc.
4
5 This file is part of GNU Wget.
6
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
11
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Wget. If not, see <http://www.gnu.org/licenses/>.
19
20 Additional permission under GNU GPL version 3 section 7
21
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work. */
30
31 #include "wget.h"
32 #include "hash.h"
33 #include "utils.h"
34 #include "version.h"
35 #include "dirname.h"
36 #include "url.h"
37
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <time.h>
42 #include <tmpdir.h>
43 #include <sha1.h>
44 #include <base32.h>
45 #include <unistd.h>
46 #ifdef HAVE_LIBZ
47 #include <zlib.h>
48 #endif
49
50 #ifdef HAVE_LIBUUID
51 #include <uuid/uuid.h>
52 #elif HAVE_UUID_CREATE
53 #include <uuid.h>
54 #endif
55
56 #include "warc.h"
57 #include "exits.h"
58
59 #ifdef WINDOWS
60 /* we need this on Windows to have O_TEMPORARY defined */
61 # include <fcntl.h>
62 # include <rpc.h>
63 #endif
64
65 #ifndef O_TEMPORARY
66 #define O_TEMPORARY 0
67 #endif
68
69 #include "warc.h"
70 #include "exits.h"
71
72
/* Temporary file holding a copy of the wget log.  */
static FILE *warc_log_fp;

/* Temporary manifest file.  The warcinfo record id of each WARC file
   opened during this crawl is appended to it.  */
static FILE *warc_manifest_fp;

/* The WARC file currently being written (NULL if WARC is disabled).  */
static FILE *warc_current_file;

#ifdef HAVE_LIBZ
/* gzip stream layered over the current WARC file
   (NULL if WARC or gzip is disabled).  */
static gzFile warc_current_gzfile;

/* File offset at which the current gzip record begins.  */
static off_t warc_current_gzfile_offset;

/* Number of uncompressed bytes handed to the current gzip record
   so far.  */
static off_t warc_current_gzfile_uncompressed_size;
#endif

/* Stays true until one of the warc_write_* functions fails.  */
static bool warc_write_ok;

/* The CDX file currently being written (NULL if CDX is disabled).  */
static FILE *warc_current_cdx_file;

/* Record id of the warcinfo record of the current WARC file.  */
static char warc_current_warcinfo_uuid_str[48];

/* File name of the current WARC file.  */
static char *warc_current_filename;

/* Serial number of the current WARC file.  It is bumped every time a
   new file is opened and becomes part of that file's name.  */
static int warc_current_file_number;

/* CDX records loaded for deduplication, if deduplication is enabled.  */
static struct hash_table *warc_cdx_dedup_table;

static bool warc_start_new_file (bool meta);
117
118
119 struct warc_cdx_record
120 {
121 char *url;
122 char *uuid;
123 char digest[SHA1_DIGEST_SIZE];
124 };
125
/* Hashes the digest that KEY points to by reinterpreting its leading
   sizeof (unsigned long) bytes as the hash value.  */
static unsigned long
warc_hash_sha1_digest (const void *key)
{
  unsigned long hash = 0;

  memcpy (&hash, key, sizeof hash);
  return hash;
}
134
135 static int
warc_cmp_sha1_digest(const void * digest1,const void * digest2)136 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
137 {
138 return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
139 }
140
141
142
143 /* Writes SIZE bytes from BUFFER to the current WARC file,
144 through gzwrite if compression is enabled.
145 Returns the number of uncompressed bytes written. */
146 static size_t
warc_write_buffer(const char * buffer,size_t size)147 warc_write_buffer (const char *buffer, size_t size)
148 {
149 #ifdef HAVE_LIBZ
150 if (warc_current_gzfile)
151 {
152 warc_current_gzfile_uncompressed_size += size;
153 return gzwrite (warc_current_gzfile, buffer, size);
154 }
155 else
156 #endif
157 return fwrite (buffer, 1, size, warc_current_file);
158 }
159
160 /* Writes STR to the current WARC file.
161 Returns false and set warc_write_ok to false if there
162 is an error. */
163 static bool
warc_write_string(const char * str)164 warc_write_string (const char *str)
165 {
166 size_t n;
167
168 if (!warc_write_ok)
169 return false;
170
171 n = strlen (str);
172 if (n != warc_write_buffer (str, n))
173 warc_write_ok = false;
174
175 return warc_write_ok;
176 }
177
178
179 #define EXTRA_GZIP_HEADER_SIZE 14
180 #define GZIP_STATIC_HEADER_SIZE 10
181 #define FLG_FEXTRA 0x04
182 #define OFF_FLG 3
183
184 /* Starts a new WARC record. Writes the version header.
185 If opt.warc_maxsize is set and the current file is becoming
186 too large, this will open a new WARC file.
187
188 If compression is enabled, this will start a new
189 gzip stream in the current WARC file.
190
191 Returns false and set warc_write_ok to false if there
192 is an error. */
193 static bool
warc_write_start_record(void)194 warc_write_start_record (void)
195 {
196 if (!warc_write_ok)
197 return false;
198
199 fflush (warc_current_file);
200 if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
201 warc_start_new_file (false);
202
203 #ifdef HAVE_LIBZ
204 /* Start a GZIP stream, if required. */
205 if (opt.warc_compression_enabled)
206 {
207 int dup_fd;
208 /* Record the starting offset of the new record. */
209 warc_current_gzfile_offset = ftello (warc_current_file);
210
211 /* Reserve space for the extra GZIP header field.
212 In warc_write_end_record we will fill this space
213 with information about the uncompressed and
214 compressed size of the record. */
215 if (fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR) < 0)
216 {
217 logprintf (LOG_NOTQUIET, _("Error setting WARC file position.\n"));
218 warc_write_ok = false;
219 return false;
220 }
221
222 if (fflush (warc_current_file) != 0)
223 {
224 logprintf (LOG_NOTQUIET, _("Error flushing WARC file to disk.\n"));
225 warc_write_ok = false;
226 return false;
227 }
228
229 /* Start a new GZIP stream. */
230 dup_fd = dup (fileno (warc_current_file));
231 if (dup_fd < 0)
232 {
233 logprintf (LOG_NOTQUIET,
234 _("Error duplicating WARC file file descriptor.\n"));
235 warc_write_ok = false;
236 return false;
237 }
238
239 warc_current_gzfile = gzdopen (dup_fd, "wb9");
240 warc_current_gzfile_uncompressed_size = 0;
241
242 if (warc_current_gzfile == NULL)
243 {
244 logprintf (LOG_NOTQUIET,
245 _("Error opening GZIP stream to WARC file.\n"));
246 close (dup_fd);
247 warc_write_ok = false;
248 return false;
249 }
250 }
251 #endif
252
253 warc_write_string ("WARC/1.0\r\n");
254 return warc_write_ok;
255 }
256
257 /* Writes a WARC header to the current WARC record.
258 This method may be run after warc_write_start_record and
259 before warc_write_block_from_file. */
260 static bool
warc_write_header(const char * name,const char * value)261 warc_write_header (const char *name, const char *value)
262 {
263 if (value)
264 {
265 warc_write_string (name);
266 warc_write_string (": ");
267 warc_write_string (value);
268 warc_write_string ("\r\n");
269 }
270 return warc_write_ok;
271 }
272
273 /* Writes a WARC header with a URI as value to the current WARC record.
274 This method may be run after warc_write_start_record and
275 before warc_write_block_from_file. */
276 static bool
warc_write_header_uri(const char * name,const char * value)277 warc_write_header_uri (const char *name, const char *value)
278 {
279 if (value)
280 {
281 warc_write_string (name);
282 warc_write_string (": <");
283 warc_write_string (value);
284 warc_write_string (">\r\n");
285 }
286 return warc_write_ok;
287 }
288
289 /* Copies the contents of DATA_IN to the WARC record.
290 Adds a Content-Length header to the WARC record.
291 Run this method after warc_write_header,
292 then run warc_write_end_record. */
293 static bool
warc_write_block_from_file(FILE * data_in)294 warc_write_block_from_file (FILE *data_in)
295 {
296 /* Add the Content-Length header. */
297 char content_length[MAX_INT_TO_STRING_LEN(off_t)];
298 char buffer[BUFSIZ];
299 size_t s;
300
301 fseeko (data_in, 0L, SEEK_END);
302 number_to_string (content_length, ftello (data_in));
303 warc_write_header ("Content-Length", content_length);
304
305 /* End of the WARC header section. */
306 warc_write_string ("\r\n");
307
308 if (fseeko (data_in, 0L, SEEK_SET) != 0)
309 warc_write_ok = false;
310
311 /* Copy the data in the file to the WARC record. */
312 while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
313 {
314 if (warc_write_buffer (buffer, s) < s)
315 warc_write_ok = false;
316 }
317
318 return warc_write_ok;
319 }
320
321 /* Run this method to close the current WARC record.
322
323 If compression is enabled, this method closes the
324 current GZIP stream and fills the extra GZIP header
325 with the uncompressed and compressed length of the
326 record. */
327 static bool
warc_write_end_record(void)328 warc_write_end_record (void)
329 {
330 if (warc_write_buffer ("\r\n\r\n", 4) != 4)
331 {
332 warc_write_ok = false;
333 return false;
334 }
335
336 #ifdef HAVE_LIBZ
337 /* We start a new gzip stream for each record. */
338 if (warc_write_ok && warc_current_gzfile)
339 {
340 char extra_header[EXTRA_GZIP_HEADER_SIZE];
341 char static_header[GZIP_STATIC_HEADER_SIZE];
342 off_t current_offset, uncompressed_size, compressed_size;
343 size_t result;
344
345 if (gzclose (warc_current_gzfile) != Z_OK)
346 {
347 warc_write_ok = false;
348 return false;
349 }
350
351 fflush (warc_current_file);
352 fseeko (warc_current_file, 0, SEEK_END);
353
354 /* The WARC standard suggests that we add 'skip length' data in the
355 extra header field of the GZIP stream.
356
357 In warc_write_start_record we reserved space for this extra header.
358 This extra space starts at warc_current_gzfile_offset and fills
359 EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
360 warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
361
362 We need to do three things:
363 1. Move the static GZIP header to warc_current_gzfile_offset;
364 2. Set the FEXTRA flag in the GZIP header;
365 3. Write the extra GZIP header after the static header, that is,
366 starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
367 */
368
369 /* Calculate the uncompressed and compressed sizes. */
370 current_offset = ftello (warc_current_file);
371 uncompressed_size = current_offset - warc_current_gzfile_offset;
372 compressed_size = warc_current_gzfile_uncompressed_size;
373
374 /* Go back to the static GZIP header. */
375 result = fseeko (warc_current_file, warc_current_gzfile_offset
376 + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
377 if (result != 0)
378 {
379 warc_write_ok = false;
380 return false;
381 }
382
383 /* Read the header. */
384 result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
385 warc_current_file);
386 if (result != GZIP_STATIC_HEADER_SIZE)
387 {
388 warc_write_ok = false;
389 return false;
390 }
391
392 /* Set the FEXTRA flag in the flags byte of the header. */
393 static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
394
395 /* Write the header back to the file, but starting at
396 warc_current_gzfile_offset. */
397 fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
398 fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
399
400 /* Prepare the extra GZIP header. */
401 /* XLEN, the length of the extra header fields. */
402 extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
403 extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
404 /* The extra header field identifier for the WARC skip length. */
405 extra_header[2] = 's';
406 extra_header[3] = 'l';
407 /* The size of the field value (8 bytes). */
408 extra_header[4] = (8 & 255);
409 extra_header[5] = ((8 >> 8) & 255);
410 /* The size of the uncompressed record. */
411 extra_header[6] = (uncompressed_size & 255);
412 extra_header[7] = (uncompressed_size >> 8) & 255;
413 extra_header[8] = (uncompressed_size >> 16) & 255;
414 extra_header[9] = (uncompressed_size >> 24) & 255;
415 /* The size of the compressed record. */
416 extra_header[10] = (compressed_size & 255);
417 extra_header[11] = (compressed_size >> 8) & 255;
418 extra_header[12] = (compressed_size >> 16) & 255;
419 extra_header[13] = (compressed_size >> 24) & 255;
420
421 /* Write the extra header after the static header. */
422 fseeko (warc_current_file, warc_current_gzfile_offset
423 + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
424 fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
425
426 /* Done, move back to the end of the file. */
427 fflush (warc_current_file);
428 fseeko (warc_current_file, 0, SEEK_END);
429 }
430 #endif /* HAVE_LIBZ */
431
432 return warc_write_ok;
433 }
434
435
/* Writes the WARC-Date header to the current WARC record.  TIMESTAMP
   is used as the value if it is non-NULL; otherwise the current time
   as formatted by warc_timestamp is used.  Returns warc_write_ok.  */
static bool
warc_write_date_header (const char *timestamp)
{
  char now[21];

  if (timestamp == NULL)
    timestamp = warc_timestamp (now, sizeof (now));

  return warc_write_header ("WARC-Date", timestamp);
}
447
448 /* Writes the WARC-IP-Address header for the given IP to
449 the current WARC record. If IP is NULL, no header will
450 be written. */
451 static bool
warc_write_ip_header(const ip_address * ip)452 warc_write_ip_header (const ip_address *ip)
453 {
454 if (ip != NULL)
455 return warc_write_header ("WARC-IP-Address", print_address (ip));
456 else
457 return warc_write_ok;
458 }
459
460
461 /* warc_sha1_stream_with_payload is a modified copy of sha1_stream
462 from gnulib/sha1.c. This version calculates two digests in one go.
463
464 Compute SHA1 message digests for bytes read from STREAM. The
465 digest of the complete file will be written into the 16 bytes
466 beginning at RES_BLOCK.
467
468 If payload_offset >= 0, a second digest will be calculated of the
469 portion of the file starting at payload_offset and continuing to
470 the end of the file. The digest number will be written into the
471 16 bytes beginning ad RES_PAYLOAD. */
472 static int
warc_sha1_stream_with_payload(FILE * stream,void * res_block,void * res_payload,off_t payload_offset)473 warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
474 off_t payload_offset)
475 {
476 #define BLOCKSIZE 32768
477
478 struct sha1_ctx ctx_block;
479 struct sha1_ctx ctx_payload;
480 off_t pos;
481 off_t sum;
482
483 char *buffer = xmalloc (BLOCKSIZE + 72);
484
485 /* Initialize the computation context. */
486 sha1_init_ctx (&ctx_block);
487 if (payload_offset >= 0)
488 sha1_init_ctx (&ctx_payload);
489
490 pos = 0;
491
492 /* Iterate over full file contents. */
493 while (1)
494 {
495 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
496 computation function processes the whole buffer so that with the
497 next round of the loop another block can be read. */
498 off_t n;
499 sum = 0;
500
501 /* Read block. Take care for partial reads. */
502 while (1)
503 {
504 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
505
506 sum += n;
507 pos += n;
508
509 if (sum == BLOCKSIZE)
510 break;
511
512 if (n == 0)
513 {
514 /* Check for the error flag IF N == 0, so that we don't
515 exit the loop after a partial read due to e.g., EAGAIN
516 or EWOULDBLOCK. */
517 if (ferror (stream))
518 {
519 xfree (buffer);
520 return 1;
521 }
522 goto process_partial_block;
523 }
524
525 /* We've read at least one byte, so ignore errors. But always
526 check for EOF, since feof may be true even though N > 0.
527 Otherwise, we could end up calling fread after EOF. */
528 if (feof (stream))
529 goto process_partial_block;
530 }
531
532 /* Process buffer with BLOCKSIZE bytes. Note that
533 BLOCKSIZE % 64 == 0
534 */
535 sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
536 if (payload_offset >= 0 && payload_offset < pos)
537 {
538 /* At least part of the buffer contains data from payload. */
539 off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
540 if (start_of_payload <= 0)
541 /* All bytes in the buffer belong to the payload. */
542 start_of_payload = 0;
543
544 /* Process the payload part of the buffer.
545 Note: we can't use sha1_process_block here even if we
546 process the complete buffer. Because the payload doesn't
547 have to start with a full block, there may still be some
548 bytes left from the previous buffer. Therefore, we need
549 to continue with sha1_process_bytes. */
550 sha1_process_bytes (buffer + start_of_payload,
551 BLOCKSIZE - start_of_payload, &ctx_payload);
552 }
553 }
554
555 process_partial_block:;
556
557 /* Process any remaining bytes. */
558 if (sum > 0)
559 {
560 sha1_process_bytes (buffer, sum, &ctx_block);
561 if (payload_offset >= 0 && payload_offset < pos)
562 {
563 /* At least part of the buffer contains data from payload. */
564 off_t start_of_payload = payload_offset - (pos - sum);
565 if (start_of_payload <= 0)
566 /* All bytes in the buffer belong to the payload. */
567 start_of_payload = 0;
568
569 /* Process the payload part of the buffer. */
570 sha1_process_bytes (buffer + start_of_payload,
571 sum - start_of_payload, &ctx_payload);
572 }
573 }
574
575 /* Construct result in desired memory. */
576 sha1_finish_ctx (&ctx_block, res_block);
577 if (payload_offset >= 0)
578 sha1_finish_ctx (&ctx_payload, res_payload);
579 xfree (buffer);
580 return 0;
581
582 #undef BLOCKSIZE
583 }
584
585 /* Converts the SHA1 digest to a base32-encoded string.
586 "sha1:DIGEST\0" (Allocates a new string for the response.) */
587 static char *
warc_base32_sha1_digest(const char * sha1_digest,char * sha1_base32,size_t sha1_base32_size)588 warc_base32_sha1_digest (const char *sha1_digest, char *sha1_base32, size_t sha1_base32_size)
589 {
590 if (sha1_base32_size >= BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5 + 1)
591 {
592 memcpy (sha1_base32, "sha1:", 5);
593 base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
594 sha1_base32_size - 5);
595 }
596 else
597 *sha1_base32 = 0;
598
599 return sha1_base32;
600 }
601
602
603 /* Sets the digest headers of the record.
604 This method will calculate the block digest and, if payload_offset >= 0,
605 will also calculate the payload digest of the payload starting at the
606 provided offset. */
607 static void
warc_write_digest_headers(FILE * file,long payload_offset)608 warc_write_digest_headers (FILE *file, long payload_offset)
609 {
610 if (opt.warc_digests_enabled)
611 {
612 /* Calculate the block and payload digests. */
613 char sha1_res_block[SHA1_DIGEST_SIZE];
614 char sha1_res_payload[SHA1_DIGEST_SIZE];
615
616 rewind (file);
617 if (warc_sha1_stream_with_payload (file, sha1_res_block,
618 sha1_res_payload, payload_offset) == 0)
619 {
620 char digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
621
622 warc_write_header ("WARC-Block-Digest",
623 warc_base32_sha1_digest (sha1_res_block, digest, sizeof(digest)));
624
625 if (payload_offset >= 0)
626 warc_write_header ("WARC-Payload-Digest",
627 warc_base32_sha1_digest (sha1_res_payload, digest, sizeof(digest)));
628 }
629 }
630 }
631
632
/* Fills TIMESTAMP (a buffer of TIMESTAMP_SIZE bytes) with the current
   UTC date and time formatted as "YYYY-MM-DDThh:mm:ssZ" (ISO 8601),
   as required for the WARC-Date header.

   The formatted text is 20 characters long, so the buffer must hold
   at least 21 bytes including the terminating NUL.  If the time
   cannot be converted or the buffer is too small, TIMESTAMP is set to
   the empty string (provided TIMESTAMP_SIZE > 0).

   Returns TIMESTAMP.  */
char *
warc_timestamp (char *timestamp, size_t timestamp_size)
{
  time_t rawtime = time (NULL);
  struct tm *timeinfo = gmtime (&rawtime);
  size_t written = 0;

  /* gmtime returns NULL when the time cannot be represented; passing
     that on to strftime would be undefined behavior. */
  if (timeinfo != NULL)
    written = strftime (timestamp, timestamp_size, "%Y-%m-%dT%H:%M:%SZ",
                        timeinfo);

  /* strftime leaves the buffer contents unspecified on failure. */
  if (written == 0 && timestamp_size > 0)
    *timestamp = 0;

  return timestamp;
}
648
/* Fills URN_STR (a buffer of URN_SIZE bytes) with a freshly generated
   UUID in the form "<urn:uuid:...>", as used for the WARC-Record-Id
   header.  With a 36-character UUID the string is 47 characters
   long.  */
#if HAVE_LIBUUID
void
warc_uuid_str (char *urn_str, size_t urn_size)
{
  uuid_t id;
  char id_text[37];

  uuid_generate (id);
  uuid_unparse (id, id_text);

  snprintf (urn_str, urn_size, "<urn:uuid:%s>", id_text);
}
#elif HAVE_UUID_CREATE
void
warc_uuid_str (char *urn_str, size_t urn_size)
{
  uuid_t id;
  char *id_text;

  uuid_create (&id, NULL);
  uuid_to_string (&id, &id_text, NULL);

  snprintf (urn_str, urn_size, "<urn:uuid:%s>", id_text);
  xfree (id_text);
}
#else
# ifdef WINDOWS

typedef RPC_STATUS (RPC_ENTRY * UuidCreate_proc) (UUID *);
typedef RPC_STATUS (RPC_ENTRY * UuidToString_proc) (UUID *, unsigned char **);
typedef RPC_STATUS (RPC_ENTRY * RpcStringFree_proc) (unsigned char **);

/* Tries to generate the UUID string through Rpcrt4.dll, which is
   looked up at run time on the first call.  Returns 1 if URN_STR was
   filled in, 0 if the library or one of its functions is unavailable
   or a call failed.  */
static int
windows_uuid_str (char *urn_str, size_t urn_size)
{
  static UuidCreate_proc pfn_UuidCreate = NULL;
  static UuidToString_proc pfn_UuidToString = NULL;
  static RpcStringFree_proc pfn_RpcStringFree = NULL;
  static int rpc_uuid_avail = -1;   /* -1: not probed yet. */
  BYTE *id_text;
  UUID id;

  /* Rpcrt4.dll is not available on older versions of Windows, so we
     need to test its availability at run time. */
  if (rpc_uuid_avail == -1)
    {
      HMODULE hm_rpcrt4 = LoadLibrary ("Rpcrt4.dll");

      rpc_uuid_avail = 0;
      if (hm_rpcrt4)
        {
          pfn_UuidCreate =
            (UuidCreate_proc) GetProcAddress (hm_rpcrt4, "UuidCreate");
          pfn_UuidToString =
            (UuidToString_proc) GetProcAddress (hm_rpcrt4, "UuidToStringA");
          pfn_RpcStringFree =
            (RpcStringFree_proc) GetProcAddress (hm_rpcrt4, "RpcStringFreeA");

          if (pfn_UuidCreate && pfn_UuidToString && pfn_RpcStringFree)
            rpc_uuid_avail = 1;
        }
    }

  if (!rpc_uuid_avail)
    return 0;

  if (pfn_UuidCreate (&id) != RPC_S_OK)
    return 0;

  if (pfn_UuidToString (&id, &id_text) != RPC_S_OK)
    return 0;

  snprintf (urn_str, urn_size, "<urn:uuid:%s>", id_text);
  pfn_RpcStringFree (&id_text);
  return 1;
}
# endif
/* Fallback: builds the UUID from random numbers
   (see RFC 4122, UUID version 4).

   Note: this is a fallback method, it is much better to use the
   methods provided by libuuid.

   The string will be 47 characters long.  */
void
warc_uuid_str (char *urn_str, size_t urn_size)
{
  unsigned char bytes[16];
  int k;

# ifdef WINDOWS
  /* If the native method fails (expected on older Windows versions),
     use the fallback below. */
  if (windows_uuid_str (urn_str, urn_size))
    return;
# endif

  for (k = 0; k < 16; k++)
    bytes[k] = random_number (255);

  /* Put the 4-bit version number (4) into the four most significant
     bits of the time_hi_and_version field. */
  bytes[6] = (bytes[6] & 0x0F) | 0x40;

  /* Set bit 7 and clear bit 6 of clock_seq_hi_and_reserved. */
  bytes[8] = (bytes[8] & 0xBF) | 0x80;

  snprintf (urn_str, urn_size,
    "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
    bytes[0], bytes[1], bytes[2], bytes[3],
    bytes[4], bytes[5],
    bytes[6], bytes[7],
    bytes[8], bytes[9],
    bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15]);
}
#endif
775
776 /* Write a warcinfo record to the current file.
777 Updates warc_current_warcinfo_uuid_str. */
778 static bool
warc_write_warcinfo_record(const char * filename)779 warc_write_warcinfo_record (const char *filename)
780 {
781 FILE *warc_tmp;
782 char timestamp[22];
783 char *filename_basename;
784
785 /* Write warc-info record as the first record of the file. */
786 /* We add the record id of this info record to the other records in the
787 file. */
788 warc_uuid_str (warc_current_warcinfo_uuid_str, sizeof (warc_current_warcinfo_uuid_str));
789
790 warc_timestamp (timestamp, sizeof(timestamp));
791
792 filename_basename = base_name (filename);
793
794 warc_write_start_record ();
795 warc_write_header ("WARC-Type", "warcinfo");
796 warc_write_header ("Content-Type", "application/warc-fields");
797 warc_write_header ("WARC-Date", timestamp);
798 warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
799 warc_write_header ("WARC-Filename", filename_basename);
800
801 xfree (filename_basename);
802
803 /* Create content. */
804 warc_tmp = warc_tempfile ();
805 if (warc_tmp == NULL)
806 {
807 return false;
808 }
809
810 fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
811 fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
812 fprintf (warc_tmp,
813 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
814 fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
815 fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
816 /* Add the user headers, if any. */
817 if (opt.warc_user_headers)
818 {
819 int i;
820 for (i = 0; opt.warc_user_headers[i]; i++)
821 fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
822 }
823 fprintf(warc_tmp, "\r\n");
824
825 warc_write_digest_headers (warc_tmp, -1);
826 warc_write_block_from_file (warc_tmp);
827 warc_write_end_record ();
828
829 if (! warc_write_ok)
830 logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
831
832 fclose (warc_tmp);
833 return warc_write_ok;
834 }
835
836 /* Opens a new WARC file.
837 If META is true, generates a filename ending with 'meta.warc.gz'.
838
839 This method will:
840 1. close the current WARC file (if there is one);
841 2. increment warc_current_file_number;
842 3. open a new WARC file;
843 4. write the initial warcinfo record.
844
845 Returns true on success, false otherwise.
846 */
847 static bool
warc_start_new_file(bool meta)848 warc_start_new_file (bool meta)
849 {
850 #ifdef __VMS
851 # define WARC_GZ "warc-gz"
852 #else /* def __VMS */
853 # define WARC_GZ "warc.gz"
854 #endif /* def __VMS [else] */
855
856 #ifdef HAVE_LIBZ
857 const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
858 #else
859 const char *extension = "warc";
860 #endif
861
862 int base_filename_length;
863 char *new_filename;
864
865 if (opt.warc_filename == NULL)
866 return false;
867
868 if (warc_current_file != NULL)
869 fclose (warc_current_file);
870
871 *warc_current_warcinfo_uuid_str = 0;
872 xfree (warc_current_filename);
873
874 warc_current_file_number++;
875
876 base_filename_length = strlen (opt.warc_filename);
877 /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
878 new_filename = xmalloc (base_filename_length + 1 + 5 + 8 + 1);
879
880 warc_current_filename = new_filename;
881
882 /* If max size is enabled, we add a serial number to the file names. */
883 if (meta)
884 sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
885 else if (opt.warc_maxsize > 0)
886 {
887 sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
888 warc_current_file_number, extension);
889 }
890 else
891 sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
892
893 logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
894
895 /* Open the WARC file. */
896 warc_current_file = fopen (new_filename, "wb+");
897 if (warc_current_file == NULL)
898 {
899 logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
900 quote (new_filename));
901 return false;
902 }
903
904 if (! warc_write_warcinfo_record (new_filename))
905 return false;
906
907 /* Add warcinfo uuid to manifest. */
908 if (warc_manifest_fp)
909 fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
910
911 return true;
912 }
913
914 /* Opens the CDX file for output. */
915 static bool
warc_start_cdx_file(void)916 warc_start_cdx_file (void)
917 {
918 char *cdx_filename = aprintf("%s.cdx", opt.warc_filename);
919 warc_current_cdx_file = fopen (cdx_filename, "a+");
920 free(cdx_filename);
921
922 if (warc_current_cdx_file == NULL)
923 return false;
924
925 /* Print the CDX header.
926 *
927 * a - original url
928 * b - date
929 * m - mime type
930 * s - response code
931 * k - new style checksum
932 * r - redirect
933 * M - meta tags
934 * V - compressed arc file offset
935 * g - file name
936 * u - record-id
937 */
938 fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
939 fflush (warc_current_cdx_file);
940
941 return true;
942 }
943
/* Characters that separate the fields of a CDX line.  */
#define CDX_FIELDSEP " \t\r\n"

/* Parses the CDX header line in LINEPTR (which is modified in place)
   and stores the zero-based positions of the original url ('a'),
   checksum ('k') and record id ('u') columns in the three output
   parameters.  A position is -1 if its column is absent or the line
   does not begin with the "CDX" token.  If a letter occurs more than
   once, the last occurrence wins.

   Returns true if all three columns were found.  */
static bool
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
                       int *field_num_checksum, int *field_num_record_id)
{
  char *state;
  char *word;
  int column;

  *field_num_original_url = -1;
  *field_num_checksum = -1;
  *field_num_record_id = -1;

  word = strtok_r (lineptr, CDX_FIELDSEP, &state);
  if (word != NULL && strcmp (word, "CDX") == 0)
    {
      /* Every token after "CDX" names one column; only its first
         character is looked at. */
      for (column = 0;
           (word = strtok_r (NULL, CDX_FIELDSEP, &state)) != NULL;
           column++)
        {
          if (word[0] == 'a')
            *field_num_original_url = column;
          else if (word[0] == 'k')
            *field_num_checksum = column;
          else if (word[0] == 'u')
            *field_num_record_id = column;
        }
    }

  return !(*field_num_original_url == -1
           || *field_num_checksum == -1
           || *field_num_record_id == -1);
}
990
991 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
992 static void
warc_process_cdx_line(char * lineptr,int field_num_original_url,int field_num_checksum,int field_num_record_id)993 warc_process_cdx_line (char *lineptr, int field_num_original_url,
994 int field_num_checksum, int field_num_record_id)
995 {
996 char *original_url = NULL;
997 char *checksum = NULL;
998 char *record_id = NULL;
999 char *token;
1000 char *save_ptr;
1001 int field_num = 0;
1002
1003 /* Read this line to get the fields we need. */
1004 token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
1005 while (token != NULL)
1006 {
1007 char **val;
1008 if (field_num == field_num_original_url)
1009 val = &original_url;
1010 else if (field_num == field_num_checksum)
1011 val = &checksum;
1012 else if (field_num == field_num_record_id)
1013 val = &record_id;
1014 else
1015 val = NULL;
1016
1017 if (val != NULL)
1018 *val = strdup (token);
1019
1020 token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
1021 field_num++;
1022 }
1023
1024 if (original_url != NULL && checksum != NULL && record_id != NULL)
1025 {
1026 /* For some extra efficiency, we decode the base32 encoded
1027 checksum value. This should produce exactly SHA1_DIGEST_SIZE
1028 bytes. */
1029 size_t checksum_l;
1030 char * checksum_v;
1031 base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
1032 &checksum_l);
1033 xfree (checksum);
1034
1035 if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
1036 {
1037 /* This is a valid line with a valid checksum. */
1038 struct warc_cdx_record *rec;
1039 rec = xmalloc (sizeof (struct warc_cdx_record));
1040 rec->url = original_url;
1041 rec->uuid = record_id;
1042 memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
1043 hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
1044 xfree (checksum_v);
1045 }
1046 else
1047 {
1048 xfree (original_url);
1049 xfree (checksum_v);
1050 xfree (record_id);
1051 }
1052 }
1053 else
1054 {
1055 xfree(checksum);
1056 xfree(original_url);
1057 xfree(record_id);
1058 }
1059 }
1060
1061 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
1062 the warc_cdx_dedup_table. */
1063 static bool
warc_load_cdx_dedup_file(void)1064 warc_load_cdx_dedup_file (void)
1065 {
1066 FILE *f;
1067 char *lineptr = NULL;
1068 size_t n = 0;
1069 ssize_t line_length;
1070 int field_num_original_url = -1;
1071 int field_num_checksum = -1;
1072 int field_num_record_id = -1;
1073
1074 f = fopen (opt.warc_cdx_dedup_filename, "r");
1075 if (f == NULL)
1076 return false;
1077
1078 /* The first line should contain the CDX header.
1079 Format: " CDX x x x x x"
1080 where x are field type indicators. For our purposes, we only
1081 need 'a' (the original url), 'k' (the SHA1 checksum) and
1082 'u' (the WARC record id). */
1083 line_length = getline (&lineptr, &n, f);
1084 if (line_length != -1)
1085 warc_parse_cdx_header (lineptr, &field_num_original_url,
1086 &field_num_checksum, &field_num_record_id);
1087
1088 /* If the file contains all three fields, read the complete file. */
1089 if (field_num_original_url == -1
1090 || field_num_checksum == -1
1091 || field_num_record_id == -1)
1092 {
1093 if (field_num_original_url == -1)
1094 logprintf (LOG_NOTQUIET,
1095 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
1096 if (field_num_checksum == -1)
1097 logprintf (LOG_NOTQUIET,
1098 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
1099 if (field_num_record_id == -1)
1100 logprintf (LOG_NOTQUIET,
1101 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
1102 }
1103 else
1104 {
1105 int nrecords;
1106
1107 /* Initialize the table. */
1108 warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
1109 warc_cmp_sha1_digest);
1110
1111 do
1112 {
1113 line_length = getline (&lineptr, &n, f);
1114 if (line_length != -1)
1115 {
1116 warc_process_cdx_line (lineptr, field_num_original_url,
1117 field_num_checksum, field_num_record_id);
1118 }
1119
1120 }
1121 while (line_length != -1);
1122
1123 /* Print results. */
1124 nrecords = hash_table_count (warc_cdx_dedup_table);
1125 logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
1126 "Loaded %d records from CDX.\n\n",
1127 nrecords),
1128 nrecords);
1129 }
1130
1131 xfree (lineptr);
1132 fclose (f);
1133
1134 return true;
1135 }
1136 #undef CDX_FIELDSEP
1137
1138 /* Returns the existing duplicate CDX record for the given url and payload
1139 digest. Returns NULL if the url is not found or if the payload digest
1140 does not match, or if CDX deduplication is disabled. */
1141 static struct warc_cdx_record *
warc_find_duplicate_cdx_record(const char * url,char * sha1_digest_payload)1142 warc_find_duplicate_cdx_record (const char *url, char *sha1_digest_payload)
1143 {
1144 struct warc_cdx_record *rec_existing;
1145
1146 if (warc_cdx_dedup_table == NULL)
1147 return NULL;
1148
1149 rec_existing = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1150
1151 if (rec_existing && strcmp (rec_existing->url, url) == 0)
1152 return rec_existing;
1153 else
1154 return NULL;
1155 }
1156
1157 /* Initializes the WARC writer (if opt.warc_filename is set).
1158 This should be called before any WARC record is written. */
1159 void
warc_init(void)1160 warc_init (void)
1161 {
1162 warc_write_ok = true;
1163
1164 if (opt.warc_filename != NULL)
1165 {
1166 if (opt.warc_cdx_dedup_filename != NULL)
1167 {
1168 if (! warc_load_cdx_dedup_file ())
1169 {
1170 logprintf (LOG_NOTQUIET,
1171 _("Could not read CDX file %s for deduplication.\n"),
1172 quote (opt.warc_cdx_dedup_filename));
1173 exit (WGET_EXIT_GENERIC_ERROR);
1174 }
1175 }
1176
1177 warc_manifest_fp = warc_tempfile ();
1178 if (warc_manifest_fp == NULL)
1179 {
1180 logprintf (LOG_NOTQUIET,
1181 _("Could not open temporary WARC manifest file.\n"));
1182 exit (WGET_EXIT_GENERIC_ERROR);
1183 }
1184
1185 if (opt.warc_keep_log)
1186 {
1187 warc_log_fp = warc_tempfile ();
1188 if (warc_log_fp == NULL)
1189 {
1190 logprintf (LOG_NOTQUIET,
1191 _("Could not open temporary WARC log file.\n"));
1192 exit (WGET_EXIT_GENERIC_ERROR);
1193 }
1194 log_set_warc_log_fp (warc_log_fp);
1195 }
1196
1197 warc_current_file_number = -1;
1198 if (! warc_start_new_file (false))
1199 {
1200 logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1201 exit (WGET_EXIT_GENERIC_ERROR);
1202 }
1203
1204 if (opt.warc_cdx_enabled)
1205 {
1206 if (! warc_start_cdx_file ())
1207 {
1208 logprintf (LOG_NOTQUIET,
1209 _("Could not open CDX file for output.\n"));
1210 exit (WGET_EXIT_GENERIC_ERROR);
1211 }
1212 }
1213 }
1214 }
1215
1216 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1217 static void
warc_write_metadata(void)1218 warc_write_metadata (void)
1219 {
1220 char manifest_uuid[48];
1221 FILE *warc_tmp_fp;
1222
1223 /* If there are multiple WARC files, the metadata should be written to a separate file. */
1224 if (opt.warc_maxsize > 0)
1225 warc_start_new_file (true);
1226
1227 warc_uuid_str (manifest_uuid, sizeof (manifest_uuid));
1228
1229 fflush (warc_manifest_fp);
1230 warc_write_metadata_record (manifest_uuid,
1231 "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1232 NULL, NULL, NULL, "text/plain",
1233 warc_manifest_fp, -1);
1234 /* warc_write_resource_record has closed warc_manifest_fp. */
1235
1236 warc_tmp_fp = warc_tempfile ();
1237 if (warc_tmp_fp == NULL)
1238 {
1239 logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1240 exit (WGET_EXIT_GENERIC_ERROR);
1241 }
1242 fflush (warc_tmp_fp);
1243 fprintf (warc_tmp_fp, "%s\n", program_argstring);
1244
1245 warc_write_resource_record (NULL,
1246 "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1247 NULL, manifest_uuid, NULL, "text/plain",
1248 warc_tmp_fp, -1);
1249 /* warc_write_resource_record has closed warc_tmp_fp. */
1250
1251 if (warc_log_fp != NULL)
1252 {
1253 warc_write_resource_record (NULL,
1254 "metadata://gnu.org/software/wget/warc/wget.log",
1255 NULL, manifest_uuid, NULL, "text/plain",
1256 warc_log_fp, -1);
1257 /* warc_write_resource_record has closed warc_log_fp. */
1258
1259 warc_log_fp = NULL;
1260 log_set_warc_log_fp (NULL);
1261 }
1262 }
1263
1264 /* Finishes the WARC writing.
1265 This should be called at the end of the program. */
1266 void
warc_close(void)1267 warc_close (void)
1268 {
1269 if (warc_current_file != NULL)
1270 {
1271 warc_write_metadata ();
1272 *warc_current_warcinfo_uuid_str = 0;
1273 fclose (warc_current_file);
1274 warc_current_file = NULL;
1275 }
1276
1277 if (warc_current_cdx_file != NULL)
1278 {
1279 fclose (warc_current_cdx_file);
1280 warc_current_cdx_file = NULL;
1281 }
1282
1283 if (warc_log_fp != NULL)
1284 {
1285 fclose (warc_log_fp);
1286 log_set_warc_log_fp (NULL);
1287 }
1288 }
1289
1290 /* Creates a temporary file for writing WARC output.
1291 The temporary file will be created in opt.warc_tempdir.
1292 Returns the pointer to the temporary file, or NULL. */
1293 FILE *
warc_tempfile(void)1294 warc_tempfile (void)
1295 {
1296 char filename[100];
1297 int fd;
1298
1299 if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1300 return NULL;
1301
1302 #ifdef __VMS
1303 /* 2013-07-12 SMS.
1304 * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
1305 * mktemp() to uniquify the (VMS-style) name, and then use a normal
1306 * fopen() with a "create temp file marked for delete" option.
1307 */
1308 {
1309 char *tfn;
1310
1311 tfn = mktemp (filename); /* Get unique name from template. */
1312 if (tfn == NULL)
1313 return NULL;
1314 return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */
1315 }
1316 #else /* def __VMS */
1317 fd = mkostemp (filename, O_TEMPORARY);
1318 if (fd < 0)
1319 return NULL;
1320
1321 #if !O_TEMPORARY
1322 if (unlink (filename) < 0)
1323 {
1324 close(fd);
1325 return NULL;
1326 }
1327 #endif
1328
1329 return fdopen (fd, "wb+");
1330 #endif /* def __VMS [else] */
1331 }
1332
1333
1334 /* Writes a request record to the WARC file.
1335 url is the target uri of the request,
1336 timestamp_str is the timestamp of the request (generated with warc_timestamp),
1337 record_uuid is the uuid of the request (generated with warc_uuid_str),
1338 body is a pointer to a file containing the request headers and body.
1339 ip is the ip address of the server (or NULL),
1340 Calling this function will close body.
1341 Returns true on success, false on error. */
1342 bool
warc_write_request_record(const char * url,const char * timestamp_str,const char * record_uuid,const ip_address * ip,FILE * body,off_t payload_offset)1343 warc_write_request_record (const char *url, const char *timestamp_str,
1344 const char *record_uuid, const ip_address *ip,
1345 FILE *body, off_t payload_offset)
1346 {
1347 warc_write_start_record ();
1348 warc_write_header ("WARC-Type", "request");
1349 warc_write_header_uri ("WARC-Target-URI", url);
1350 warc_write_header ("Content-Type", "application/http;msgtype=request");
1351 warc_write_date_header (timestamp_str);
1352 warc_write_header ("WARC-Record-ID", record_uuid);
1353 warc_write_ip_header (ip);
1354 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1355 warc_write_digest_headers (body, payload_offset);
1356 warc_write_block_from_file (body);
1357 warc_write_end_record ();
1358
1359 fclose (body);
1360
1361 return warc_write_ok;
1362 }
1363
1364 /* Writes a response record to the CDX file.
1365 url is the target uri of the request/response,
1366 timestamp_str is the timestamp of the request that generated this response,
1367 (generated with warc_timestamp),
1368 mime_type is the mime type of the response body (will be printed to CDX),
1369 response_code is the HTTP response code (will be printed to CDX),
1370 payload_digest is the sha1 digest of the payload,
1371 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1372 offset is the position of the WARC record in the WARC file,
1373 warc_filename is the filename of the WARC,
1374 response_uuid is the uuid of the response.
1375 Returns true on success, false on error. */
1376 static bool
warc_write_cdx_record(const char * url,const char * timestamp_str,const char * mime_type,int response_code,const char * payload_digest,const char * redirect_location,off_t offset,const char * warc_filename _GL_UNUSED,const char * response_uuid)1377 warc_write_cdx_record (const char *url, const char *timestamp_str,
1378 const char *mime_type, int response_code,
1379 const char *payload_digest, const char *redirect_location,
1380 off_t offset, const char *warc_filename _GL_UNUSED,
1381 const char *response_uuid)
1382 {
1383 /* Transform the timestamp. */
1384 char timestamp_str_cdx[15];
1385 char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1386 const char *checksum;
1387 char *tmp_location = NULL;
1388
1389 memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
1390 memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
1391 memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
1392 memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */
1393 memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */
1394 memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */
1395 timestamp_str_cdx[14] = '\0';
1396
1397 /* Rewrite the checksum. */
1398 if (payload_digest != NULL)
1399 checksum = payload_digest + 5; /* Skip the "sha1:" */
1400 else
1401 checksum = "-";
1402
1403 if (mime_type == NULL || strlen(mime_type) == 0)
1404 mime_type = "-";
1405 if (redirect_location == NULL || strlen(redirect_location) == 0)
1406 tmp_location = strdup ("-");
1407 else
1408 tmp_location = url_escape(redirect_location);
1409
1410 number_to_string (offset_string, offset);
1411
1412 /* Print the CDX line. */
1413 fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1414 timestamp_str_cdx, url, mime_type, response_code, checksum,
1415 tmp_location, offset_string, warc_current_filename,
1416 response_uuid);
1417 fflush (warc_current_cdx_file);
1418 free (tmp_location);
1419
1420 return true;
1421 }
1422
1423 /* Writes a revisit record to the WARC file.
1424 url is the target uri of the request/response,
1425 timestamp_str is the timestamp of the request that generated this response
1426 (generated with warc_timestamp),
1427 concurrent_to_uuid is the uuid of the request for that generated this response
1428 (generated with warc_uuid_str),
1429 refers_to_uuid is the uuid of the original response
1430 (generated with warc_uuid_str),
1431 payload_digest is the sha1 digest of the payload,
1432 ip is the ip address of the server (or NULL),
1433 body is a pointer to a file containing the response headers (without payload).
1434 Calling this function will close body.
1435 Returns true on success, false on error. */
1436 static bool
warc_write_revisit_record(const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const char * payload_digest,const char * refers_to,const ip_address * ip,FILE * body)1437 warc_write_revisit_record (const char *url, const char *timestamp_str,
1438 const char *concurrent_to_uuid, const char *payload_digest,
1439 const char *refers_to, const ip_address *ip, FILE *body)
1440 {
1441 char revisit_uuid [48];
1442 char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
1443 char sha1_res_block[SHA1_DIGEST_SIZE];
1444
1445 warc_uuid_str (revisit_uuid, sizeof (revisit_uuid));
1446
1447 sha1_stream (body, sha1_res_block);
1448 warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest));
1449
1450 warc_write_start_record ();
1451 warc_write_header ("WARC-Type", "revisit");
1452 warc_write_header ("WARC-Record-ID", revisit_uuid);
1453 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1454 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1455 warc_write_header ("WARC-Refers-To", refers_to);
1456 warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1457 warc_write_header ("WARC-Truncated", "length");
1458 warc_write_header_uri ("WARC-Target-URI", url);
1459 warc_write_date_header (timestamp_str);
1460 warc_write_ip_header (ip);
1461 warc_write_header ("Content-Type", "application/http;msgtype=response");
1462 warc_write_header ("WARC-Block-Digest", block_digest);
1463 warc_write_header ("WARC-Payload-Digest", payload_digest);
1464 warc_write_block_from_file (body);
1465 warc_write_end_record ();
1466
1467 fclose (body);
1468
1469 return warc_write_ok;
1470 }
1471
1472 /* Writes a response record to the WARC file.
1473 url is the target uri of the request/response,
1474 timestamp_str is the timestamp of the request that generated this response
1475 (generated with warc_timestamp),
1476 concurrent_to_uuid is the uuid of the request for that generated this response
1477 (generated with warc_uuid_str),
1478 ip is the ip address of the server (or NULL),
1479 body is a pointer to a file containing the response headers and body.
1480 mime_type is the mime type of the response body (will be printed to CDX),
1481 response_code is the HTTP response code (will be printed to CDX),
1482 redirect_location is the contents of the Location: header, or NULL (will be printed to CDX),
1483 Calling this function will close body.
1484 Returns true on success, false on error. */
1485 bool
warc_write_response_record(const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const ip_address * ip,FILE * body,off_t payload_offset,const char * mime_type,int response_code,const char * redirect_location)1486 warc_write_response_record (const char *url, const char *timestamp_str,
1487 const char *concurrent_to_uuid, const ip_address *ip,
1488 FILE *body, off_t payload_offset, const char *mime_type,
1489 int response_code, const char *redirect_location)
1490 {
1491 char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
1492 char payload_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
1493 char sha1_res_block[SHA1_DIGEST_SIZE];
1494 char sha1_res_payload[SHA1_DIGEST_SIZE];
1495 char response_uuid [48];
1496 off_t offset;
1497
1498 if (opt.warc_digests_enabled)
1499 {
1500 /* Calculate the block and payload digests. */
1501 rewind (body);
1502 if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1503 payload_offset) == 0)
1504 {
1505 /* Decide (based on url + payload digest) if we have seen this
1506 data before. */
1507 struct warc_cdx_record *rec_existing;
1508 rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1509 if (rec_existing != NULL)
1510 {
1511 bool result;
1512
1513 /* Found an existing record. */
1514 logprintf (LOG_VERBOSE,
1515 _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1516
1517 /* Remove the payload from the file. */
1518 if (payload_offset > 0)
1519 {
1520 if (ftruncate (fileno (body), payload_offset) == -1)
1521 return false;
1522 }
1523
1524 /* Send the original payload digest. */
1525 warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest));
1526 result = warc_write_revisit_record (url, timestamp_str,
1527 concurrent_to_uuid, payload_digest, rec_existing->uuid,
1528 ip, body);
1529
1530 return result;
1531 }
1532
1533 warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest));
1534 warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest));
1535 }
1536 }
1537
1538 /* Not a revisit, just store the record. */
1539
1540 warc_uuid_str (response_uuid, sizeof (response_uuid));
1541
1542 fseeko (warc_current_file, 0L, SEEK_END);
1543 offset = ftello (warc_current_file);
1544
1545 warc_write_start_record ();
1546 warc_write_header ("WARC-Type", "response");
1547 warc_write_header ("WARC-Record-ID", response_uuid);
1548 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1549 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1550 warc_write_header_uri ("WARC-Target-URI", url);
1551 warc_write_date_header (timestamp_str);
1552 warc_write_ip_header (ip);
1553 warc_write_header ("WARC-Block-Digest", block_digest);
1554 warc_write_header ("WARC-Payload-Digest", payload_digest);
1555 warc_write_header ("Content-Type", "application/http;msgtype=response");
1556 warc_write_block_from_file (body);
1557 warc_write_end_record ();
1558
1559 fclose (body);
1560
1561 if (warc_write_ok && opt.warc_cdx_enabled)
1562 {
1563 /* Add this record to the CDX. */
1564 warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1565 payload_digest, redirect_location, offset, warc_current_filename,
1566 response_uuid);
1567 }
1568
1569 return warc_write_ok;
1570 }
1571
1572 /* Writes a resource or metadata record to the WARC file.
1573 warc_type is either "resource" or "metadata",
1574 resource_uuid is the uuid of the resource (or NULL),
1575 url is the target uri of the resource,
1576 timestamp_str is the timestamp (generated with warc_timestamp),
1577 concurrent_to_uuid is the uuid of the record that generated this,
1578 resource (generated with warc_uuid_str) or NULL,
1579 ip is the ip address of the server (or NULL),
1580 content_type is the mime type of the body (or NULL),
1581 body is a pointer to a file containing the resource data.
1582 Calling this function will close body.
1583 Returns true on success, false on error. */
1584 static bool
warc_write_record(const char * record_type,const char * resource_uuid,const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const ip_address * ip,const char * content_type,FILE * body,off_t payload_offset)1585 warc_write_record (const char *record_type, const char *resource_uuid,
1586 const char *url, const char *timestamp_str,
1587 const char *concurrent_to_uuid,
1588 const ip_address *ip, const char *content_type, FILE *body,
1589 off_t payload_offset)
1590 {
1591 char uuid_buf[48];
1592
1593 if (resource_uuid == NULL)
1594 {
1595 warc_uuid_str (uuid_buf, sizeof (uuid_buf));
1596 resource_uuid = uuid_buf;
1597 }
1598
1599 if (content_type == NULL)
1600 content_type = "application/octet-stream";
1601
1602 warc_write_start_record ();
1603 warc_write_header ("WARC-Type", record_type);
1604 warc_write_header ("WARC-Record-ID", resource_uuid);
1605 warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1606 warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1607 warc_write_header_uri ("WARC-Target-URI", url);
1608 warc_write_date_header (timestamp_str);
1609 warc_write_ip_header (ip);
1610 warc_write_digest_headers (body, payload_offset);
1611 warc_write_header ("Content-Type", content_type);
1612 warc_write_block_from_file (body);
1613 warc_write_end_record ();
1614
1615 fclose (body);
1616
1617 return warc_write_ok;
1618 }
1619
1620 /* Writes a resource record to the WARC file.
1621 resource_uuid is the uuid of the resource (or NULL),
1622 url is the target uri of the resource,
1623 timestamp_str is the timestamp (generated with warc_timestamp),
1624 concurrent_to_uuid is the uuid of the record that generated this,
1625 resource (generated with warc_uuid_str) or NULL,
1626 ip is the ip address of the server (or NULL),
1627 content_type is the mime type of the body (or NULL),
1628 body is a pointer to a file containing the resource data.
1629 Calling this function will close body.
1630 Returns true on success, false on error. */
1631 bool
warc_write_resource_record(const char * resource_uuid,const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const ip_address * ip,const char * content_type,FILE * body,off_t payload_offset)1632 warc_write_resource_record (const char *resource_uuid, const char *url,
1633 const char *timestamp_str, const char *concurrent_to_uuid,
1634 const ip_address *ip, const char *content_type, FILE *body,
1635 off_t payload_offset)
1636 {
1637 return warc_write_record ("resource",
1638 resource_uuid, url, timestamp_str, concurrent_to_uuid,
1639 ip, content_type, body, payload_offset);
1640 }
1641
1642 /* Writes a metadata record to the WARC file.
1643 record_uuid is the uuid of the record (or NULL),
1644 url is the target uri of the record,
1645 timestamp_str is the timestamp (generated with warc_timestamp),
1646 concurrent_to_uuid is the uuid of the record that generated this,
1647 record (generated with warc_uuid_str) or NULL,
1648 ip is the ip address of the server (or NULL),
1649 content_type is the mime type of the body (or NULL),
1650 body is a pointer to a file containing the record data.
1651 Calling this function will close body.
1652 Returns true on success, false on error. */
1653 bool
warc_write_metadata_record(const char * record_uuid,const char * url,const char * timestamp_str,const char * concurrent_to_uuid,ip_address * ip,const char * content_type,FILE * body,off_t payload_offset)1654 warc_write_metadata_record (const char *record_uuid, const char *url,
1655 const char *timestamp_str, const char *concurrent_to_uuid,
1656 ip_address *ip, const char *content_type, FILE *body,
1657 off_t payload_offset)
1658 {
1659 return warc_write_record ("metadata",
1660 record_uuid, url, timestamp_str, concurrent_to_uuid,
1661 ip, content_type, body, payload_offset);
1662 }
1663