1 /* Utility functions for writing WARC files.
2    Copyright (C) 2011-2012, 2015, 2018-2021 Free Software Foundation,
3    Inc.
4 
5 This file is part of GNU Wget.
6 
7 GNU Wget is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or (at
10 your option) any later version.
11 
12 GNU Wget is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19 
20 Additional permission under GNU GPL version 3 section 7
21 
22 If you modify this program, or any covered work, by linking or
23 combining it with the OpenSSL project's OpenSSL library (or a
24 modified version of that library), containing parts covered by the
25 terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26 grants you additional permission to convey the resulting work.
27 Corresponding Source for a non-source form of such a combination
28 shall include the source code for the parts of OpenSSL used as well
29 as that of the covered work.  */
30 
31 #include "wget.h"
32 #include "hash.h"
33 #include "utils.h"
34 #include "version.h"
35 #include "dirname.h"
36 #include "url.h"
37 
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <time.h>
42 #include <tmpdir.h>
43 #include <sha1.h>
44 #include <base32.h>
45 #include <unistd.h>
46 #ifdef HAVE_LIBZ
47 #include <zlib.h>
48 #endif
49 
50 #ifdef HAVE_LIBUUID
51 #include <uuid/uuid.h>
52 #elif HAVE_UUID_CREATE
53 #include <uuid.h>
54 #endif
55 
56 #include "warc.h"
57 #include "exits.h"
58 
59 #ifdef WINDOWS
60 /* we need this on Windows to have O_TEMPORARY defined */
61 # include <fcntl.h>
62 # include <rpc.h>
63 #endif
64 
65 #ifndef O_TEMPORARY
66 #define O_TEMPORARY 0
67 #endif
68 
69 #include "warc.h"
70 #include "exits.h"
71 
72 
/* The log file (a temporary file that contains a copy
   of the wget log). */
static FILE *warc_log_fp;

/* The manifest file (a temporary file that contains the
   warcinfo uuid of every file in this crawl).  When it is non-NULL,
   warc_start_new_file appends one line with the warcinfo record id
   for every WARC file it opens. */
static FILE *warc_manifest_fp;

/* The current WARC file (or NULL, if WARC is disabled). */
static FILE *warc_current_file;

#ifdef HAVE_LIBZ
/* The gzip stream for the current WARC file
   (or NULL, if WARC or gzip is disabled).  warc_write_start_record
   opens a new stream on a duplicate of the WARC file descriptor for
   every record. */
static gzFile warc_current_gzfile;

/* The offset of the current gzip record in the WARC file, i.e. the
   position at which the space for the extra gzip header was reserved. */
static off_t warc_current_gzfile_offset;

/* The uncompressed size (so far) of the current record.  Reset when a
   record is started and advanced by warc_write_buffer. */
static off_t warc_current_gzfile_uncompressed_size;
# endif

/* This is true until a warc_write_* method fails.  Once false, the
   write helpers in this file stop writing and report failure. */
static bool warc_write_ok;

/* The current CDX file (or NULL, if CDX is disabled). */
static FILE *warc_current_cdx_file;

/* The record id of the warcinfo record of the current WARC file.
   Sized for the 47 character "<urn:uuid:...>" string produced by
   warc_uuid_str plus the terminating NUL. */
static char warc_current_warcinfo_uuid_str[48];

/* The file name of the current WARC file (heap allocated; replaced by
   warc_start_new_file). */
static char *warc_current_filename;

/* The serial number of the current WARC file.  This number is
   incremented each time a new file is opened and is used in the
   WARC file's filename. */
static int warc_current_file_number;

/* The table of CDX records, if deduplication is enabled. */
static struct hash_table * warc_cdx_dedup_table;

/* Defined further down; needed early because warc_write_start_record
   rotates to a new file when the size limit is reached. */
static bool warc_start_new_file (bool meta);
117 
118 
/* One CDX record.
   NOTE(review): presumably these are the entries kept in
   warc_cdx_dedup_table and looked up by DIGEST -- confirm against the
   code that fills the table.  */
struct warc_cdx_record
{
  /* Presumably the original URL of the record -- TODO confirm.  */
  char *url;
  /* Presumably the WARC record id of the record -- TODO confirm.  */
  char *uuid;
  /* Buffer sized for one binary SHA1 digest.  */
  char digest[SHA1_DIGEST_SIZE];
};
125 
/* Hash function for digest keys: returns the first
   sizeof (unsigned long) bytes of KEY reinterpreted, in native byte
   order, as an unsigned long.  KEY must point to at least that many
   readable bytes.  */
static unsigned long
warc_hash_sha1_digest (const void *key)
{
  const unsigned char *src = key;
  unsigned long hash = 0;
  unsigned char *dst = (unsigned char *) &hash;
  size_t i;

  /* Byte-wise copy, so KEY does not need to be aligned. */
  for (i = 0; i < sizeof hash; i++)
    dst[i] = src[i];

  return hash;
}
134 
135 static int
warc_cmp_sha1_digest(const void * digest1,const void * digest2)136 warc_cmp_sha1_digest (const void *digest1, const void *digest2)
137 {
138   return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE);
139 }
140 
141 
142 
143 /* Writes SIZE bytes from BUFFER to the current WARC file,
144    through gzwrite if compression is enabled.
145    Returns the number of uncompressed bytes written.  */
146 static size_t
warc_write_buffer(const char * buffer,size_t size)147 warc_write_buffer (const char *buffer, size_t size)
148 {
149 #ifdef HAVE_LIBZ
150   if (warc_current_gzfile)
151     {
152       warc_current_gzfile_uncompressed_size += size;
153       return gzwrite (warc_current_gzfile, buffer, size);
154     }
155   else
156 #endif
157     return fwrite (buffer, 1, size, warc_current_file);
158 }
159 
160 /* Writes STR to the current WARC file.
161    Returns false and set warc_write_ok to false if there
162    is an error.  */
163 static bool
warc_write_string(const char * str)164 warc_write_string (const char *str)
165 {
166   size_t n;
167 
168   if (!warc_write_ok)
169     return false;
170 
171   n = strlen (str);
172   if (n != warc_write_buffer (str, n))
173     warc_write_ok = false;
174 
175   return warc_write_ok;
176 }
177 
178 
/* Size of the extra gzip header written by warc_write_end_record:
   2 bytes XLEN + 2 bytes field id + 2 bytes field length + 8 bytes
   of value.  */
#define EXTRA_GZIP_HEADER_SIZE 14
/* Number of bytes of the gzip header that warc_write_end_record reads
   back and moves in front of the extra header.  */
#define GZIP_STATIC_HEADER_SIZE  10
/* Bit that is set in the gzip flags byte to announce the extra field. */
#define FLG_FEXTRA          0x04
/* Index of the flags byte inside the static gzip header.  */
#define OFF_FLG             3
183 
184 /* Starts a new WARC record.  Writes the version header.
185    If opt.warc_maxsize is set and the current file is becoming
186    too large, this will open a new WARC file.
187 
188    If compression is enabled, this will start a new
189    gzip stream in the current WARC file.
190 
191    Returns false and set warc_write_ok to false if there
192    is an error.  */
193 static bool
warc_write_start_record(void)194 warc_write_start_record (void)
195 {
196   if (!warc_write_ok)
197     return false;
198 
199   fflush (warc_current_file);
200   if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
201     warc_start_new_file (false);
202 
203 #ifdef HAVE_LIBZ
204   /* Start a GZIP stream, if required. */
205   if (opt.warc_compression_enabled)
206     {
207       int dup_fd;
208       /* Record the starting offset of the new record. */
209       warc_current_gzfile_offset = ftello (warc_current_file);
210 
211       /* Reserve space for the extra GZIP header field.
212          In warc_write_end_record we will fill this space
213          with information about the uncompressed and
214          compressed size of the record. */
215       if (fseek (warc_current_file, EXTRA_GZIP_HEADER_SIZE, SEEK_CUR) < 0)
216         {
217           logprintf (LOG_NOTQUIET, _("Error setting WARC file position.\n"));
218           warc_write_ok = false;
219           return false;
220         }
221 
222       if (fflush (warc_current_file) != 0)
223         {
224           logprintf (LOG_NOTQUIET, _("Error flushing WARC file to disk.\n"));
225           warc_write_ok = false;
226           return false;
227         }
228 
229       /* Start a new GZIP stream. */
230       dup_fd = dup (fileno (warc_current_file));
231       if (dup_fd < 0)
232         {
233           logprintf (LOG_NOTQUIET,
234 _("Error duplicating WARC file file descriptor.\n"));
235           warc_write_ok = false;
236           return false;
237         }
238 
239       warc_current_gzfile = gzdopen (dup_fd, "wb9");
240       warc_current_gzfile_uncompressed_size = 0;
241 
242       if (warc_current_gzfile == NULL)
243         {
244           logprintf (LOG_NOTQUIET,
245 _("Error opening GZIP stream to WARC file.\n"));
246           close (dup_fd);
247           warc_write_ok = false;
248           return false;
249         }
250     }
251 #endif
252 
253   warc_write_string ("WARC/1.0\r\n");
254   return warc_write_ok;
255 }
256 
257 /* Writes a WARC header to the current WARC record.
258    This method may be run after warc_write_start_record and
259    before warc_write_block_from_file.  */
260 static bool
warc_write_header(const char * name,const char * value)261 warc_write_header (const char *name, const char *value)
262 {
263   if (value)
264     {
265       warc_write_string (name);
266       warc_write_string (": ");
267       warc_write_string (value);
268       warc_write_string ("\r\n");
269     }
270   return warc_write_ok;
271 }
272 
273 /* Writes a WARC header with a URI as value to the current WARC record.
274    This method may be run after warc_write_start_record and
275    before warc_write_block_from_file.  */
276 static bool
warc_write_header_uri(const char * name,const char * value)277 warc_write_header_uri (const char *name, const char *value)
278 {
279   if (value)
280     {
281       warc_write_string (name);
282       warc_write_string (": <");
283       warc_write_string (value);
284       warc_write_string (">\r\n");
285     }
286   return warc_write_ok;
287 }
288 
289 /* Copies the contents of DATA_IN to the WARC record.
290    Adds a Content-Length header to the WARC record.
291    Run this method after warc_write_header,
292    then run warc_write_end_record. */
293 static bool
warc_write_block_from_file(FILE * data_in)294 warc_write_block_from_file (FILE *data_in)
295 {
296   /* Add the Content-Length header. */
297   char content_length[MAX_INT_TO_STRING_LEN(off_t)];
298   char buffer[BUFSIZ];
299   size_t s;
300 
301   fseeko (data_in, 0L, SEEK_END);
302   number_to_string (content_length, ftello (data_in));
303   warc_write_header ("Content-Length", content_length);
304 
305   /* End of the WARC header section. */
306   warc_write_string ("\r\n");
307 
308   if (fseeko (data_in, 0L, SEEK_SET) != 0)
309     warc_write_ok = false;
310 
311   /* Copy the data in the file to the WARC record. */
312   while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
313     {
314       if (warc_write_buffer (buffer, s) < s)
315         warc_write_ok = false;
316     }
317 
318   return warc_write_ok;
319 }
320 
321 /* Run this method to close the current WARC record.
322 
323    If compression is enabled, this method closes the
324    current GZIP stream and fills the extra GZIP header
325    with the uncompressed and compressed length of the
326    record. */
327 static bool
warc_write_end_record(void)328 warc_write_end_record (void)
329 {
330   if (warc_write_buffer ("\r\n\r\n", 4) != 4)
331     {
332       warc_write_ok = false;
333       return false;
334     }
335 
336 #ifdef HAVE_LIBZ
337   /* We start a new gzip stream for each record.  */
338   if (warc_write_ok && warc_current_gzfile)
339     {
340       char extra_header[EXTRA_GZIP_HEADER_SIZE];
341       char static_header[GZIP_STATIC_HEADER_SIZE];
342       off_t current_offset, uncompressed_size, compressed_size;
343       size_t result;
344 
345       if (gzclose (warc_current_gzfile) != Z_OK)
346         {
347           warc_write_ok = false;
348           return false;
349         }
350 
351       fflush (warc_current_file);
352       fseeko (warc_current_file, 0, SEEK_END);
353 
354       /* The WARC standard suggests that we add 'skip length' data in the
355          extra header field of the GZIP stream.
356 
357          In warc_write_start_record we reserved space for this extra header.
358          This extra space starts at warc_current_gzfile_offset and fills
359          EXTRA_GZIP_HEADER_SIZE bytes.  The static GZIP header starts at
360          warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
361 
362          We need to do three things:
363          1. Move the static GZIP header to warc_current_gzfile_offset;
364          2. Set the FEXTRA flag in the GZIP header;
365          3. Write the extra GZIP header after the static header, that is,
366             starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
367       */
368 
369       /* Calculate the uncompressed and compressed sizes. */
370       current_offset = ftello (warc_current_file);
371       uncompressed_size = current_offset - warc_current_gzfile_offset;
372       compressed_size = warc_current_gzfile_uncompressed_size;
373 
374       /* Go back to the static GZIP header. */
375       result = fseeko (warc_current_file, warc_current_gzfile_offset
376               + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
377       if (result != 0)
378         {
379           warc_write_ok = false;
380           return false;
381         }
382 
383       /* Read the header. */
384       result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
385                              warc_current_file);
386       if (result != GZIP_STATIC_HEADER_SIZE)
387         {
388           warc_write_ok = false;
389           return false;
390         }
391 
392       /* Set the FEXTRA flag in the flags byte of the header. */
393       static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
394 
395       /* Write the header back to the file, but starting at
396          warc_current_gzfile_offset. */
397       fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
398       fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
399 
400       /* Prepare the extra GZIP header. */
401       /* XLEN, the length of the extra header fields.  */
402       extra_header[0]  = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
403       extra_header[1]  = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
404       /* The extra header field identifier for the WARC skip length. */
405       extra_header[2]  = 's';
406       extra_header[3]  = 'l';
407       /* The size of the field value (8 bytes).  */
408       extra_header[4]  = (8 & 255);
409       extra_header[5]  = ((8 >> 8) & 255);
410       /* The size of the uncompressed record.  */
411       extra_header[6]  = (uncompressed_size & 255);
412       extra_header[7]  = (uncompressed_size >> 8) & 255;
413       extra_header[8]  = (uncompressed_size >> 16) & 255;
414       extra_header[9]  = (uncompressed_size >> 24) & 255;
415       /* The size of the compressed record.  */
416       extra_header[10] = (compressed_size & 255);
417       extra_header[11] = (compressed_size >> 8) & 255;
418       extra_header[12] = (compressed_size >> 16) & 255;
419       extra_header[13] = (compressed_size >> 24) & 255;
420 
421       /* Write the extra header after the static header. */
422       fseeko (warc_current_file, warc_current_gzfile_offset
423               + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
424       fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
425 
426       /* Done, move back to the end of the file. */
427       fflush (warc_current_file);
428       fseeko (warc_current_file, 0, SEEK_END);
429     }
430 #endif /* HAVE_LIBZ */
431 
432   return warc_write_ok;
433 }
434 
435 
/* Writes the WARC-Date header to the current WARC record.
   TIMESTAMP is used as the header value; if it is NULL, a timestamp
   for the current time is generated with warc_timestamp instead.
   Returns the result of warc_write_header.  */
static bool
warc_write_date_header (const char *timestamp)
{
  char now[21];
  const char *date = timestamp;

  if (date == NULL)
    date = warc_timestamp (now, sizeof(now));

  return warc_write_header ("WARC-Date", date);
}
447 
448 /* Writes the WARC-IP-Address header for the given IP to
449    the current WARC record.  If IP is NULL, no header will
450    be written.  */
451 static bool
warc_write_ip_header(const ip_address * ip)452 warc_write_ip_header (const ip_address *ip)
453 {
454   if (ip != NULL)
455     return warc_write_header ("WARC-IP-Address", print_address (ip));
456   else
457     return warc_write_ok;
458 }
459 
460 
/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
   from gnulib/sha1.c.  This version calculates two digests in one go.

   Compute SHA1 message digests for bytes read from STREAM, starting
   at its current position and continuing until end of file.  The
   digest of everything that was read is written to RES_BLOCK (the
   caller in this file passes a buffer of SHA1_DIGEST_SIZE bytes).

   If payload_offset >= 0, a second digest will be calculated of the
   portion of the data starting at payload_offset (counted from the
   position where reading started) and continuing to the end of the
   file.  That digest is written to RES_PAYLOAD.  If payload_offset
   is negative, RES_PAYLOAD is not touched.

   Returns 0 on success, or 1 if a read error occurred; in the latter
   case neither result buffer is written.  */
static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
                               off_t payload_offset)
{
#define BLOCKSIZE 32768

  struct sha1_ctx ctx_block;
  struct sha1_ctx ctx_payload;
  /* Total number of bytes read from STREAM so far.  */
  off_t pos;
  /* Number of bytes currently held in BUFFER.  */
  off_t sum;

  char *buffer = xmalloc (BLOCKSIZE + 72);

  /* Initialize the computation context.  */
  sha1_init_ctx (&ctx_block);
  if (payload_offset >= 0)
    sha1_init_ctx (&ctx_payload);

  pos = 0;

  /* Iterate over full file contents.  */
  while (1)
    {
      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
         computation function processes the whole buffer so that with the
         next round of the loop another block can be read.  */
      off_t n;
      sum = 0;

      /* Read block.  Take care for partial reads.  */
      while (1)
        {
          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);

          sum += n;
          pos += n;

          if (sum == BLOCKSIZE)
            break;

          if (n == 0)
            {
              /* Check for the error flag IF N == 0, so that we don't
                 exit the loop after a partial read due to e.g., EAGAIN
                 or EWOULDBLOCK.  */
              if (ferror (stream))
                {
                  xfree (buffer);
                  return 1;
                }
              goto process_partial_block;
            }

          /* We've read at least one byte, so ignore errors.  But always
             check for EOF, since feof may be true even though N > 0.
             Otherwise, we could end up calling fread after EOF.  */
          if (feof (stream))
            goto process_partial_block;
        }

      /* Process buffer with BLOCKSIZE bytes.  Note that
                        BLOCKSIZE % 64 == 0
       */
      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
      /* POS is the offset just past this buffer, so the buffer overlaps
         the payload exactly when the payload starts before POS.  */
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload. */
          off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer.
             Note: we can't use  sha1_process_block  here even if we
             process the complete buffer.  Because the payload doesn't
             have to start with a full block, there may still be some
             bytes left from the previous buffer.  Therefore, we need
             to continue with  sha1_process_bytes.  */
          sha1_process_bytes (buffer + start_of_payload,
                              BLOCKSIZE - start_of_payload, &ctx_payload);
        }
    }

 process_partial_block:;

  /* Process any remaining bytes.  */
  if (sum > 0)
    {
      sha1_process_bytes (buffer, sum, &ctx_block);
      if (payload_offset >= 0 && payload_offset < pos)
        {
          /* At least part of the buffer contains data from payload. */
          off_t start_of_payload = payload_offset - (pos - sum);
          if (start_of_payload <= 0)
            /* All bytes in the buffer belong to the payload. */
            start_of_payload = 0;

          /* Process the payload part of the buffer. */
          sha1_process_bytes (buffer + start_of_payload,
                              sum - start_of_payload, &ctx_payload);
        }
    }

  /* Construct result in desired memory.  */
  sha1_finish_ctx (&ctx_block,   res_block);
  if (payload_offset >= 0)
    sha1_finish_ctx (&ctx_payload, res_payload);
  xfree (buffer);
  return 0;

#undef BLOCKSIZE
}
584 
585 /* Converts the SHA1 digest to a base32-encoded string.
586    "sha1:DIGEST\0"  (Allocates a new string for the response.)  */
587 static char *
warc_base32_sha1_digest(const char * sha1_digest,char * sha1_base32,size_t sha1_base32_size)588 warc_base32_sha1_digest (const char *sha1_digest, char *sha1_base32, size_t sha1_base32_size)
589 {
590   if (sha1_base32_size >= BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5 + 1)
591     {
592       memcpy (sha1_base32, "sha1:", 5);
593       base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
594                      sha1_base32_size - 5);
595     }
596   else
597     *sha1_base32 = 0;
598 
599   return sha1_base32;
600 }
601 
602 
603 /* Sets the digest headers of the record.
604    This method will calculate the block digest and, if payload_offset >= 0,
605    will also calculate the payload digest of the payload starting at the
606    provided offset.  */
607 static void
warc_write_digest_headers(FILE * file,long payload_offset)608 warc_write_digest_headers (FILE *file, long payload_offset)
609 {
610   if (opt.warc_digests_enabled)
611     {
612       /* Calculate the block and payload digests. */
613       char sha1_res_block[SHA1_DIGEST_SIZE];
614       char sha1_res_payload[SHA1_DIGEST_SIZE];
615 
616       rewind (file);
617       if (warc_sha1_stream_with_payload (file, sha1_res_block,
618           sha1_res_payload, payload_offset) == 0)
619         {
620           char digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
621 
622           warc_write_header ("WARC-Block-Digest",
623               warc_base32_sha1_digest (sha1_res_block, digest, sizeof(digest)));
624 
625           if (payload_offset >= 0)
626               warc_write_header ("WARC-Payload-Digest",
627                   warc_base32_sha1_digest (sha1_res_payload, digest, sizeof(digest)));
628         }
629     }
630 }
631 
632 
/* Fills TIMESTAMP (a buffer of TIMESTAMP_SIZE bytes) with the current
   time and date.
   The UTC time is formatted following ISO 8601
   ("YYYY-MM-DDTHH:MM:SSZ"), as required for use in the WARC-Date
   header.
   The timestamp is 20 characters long, so the buffer needs 21 bytes
   including the terminating NUL.

   If the time cannot be converted or the buffer is too small, an
   empty string is stored (provided TIMESTAMP_SIZE > 0).

   Returns TIMESTAMP.  */
char *
warc_timestamp (char *timestamp, size_t timestamp_size)
{
  time_t rawtime = time (NULL);
  struct tm *timeinfo = gmtime (&rawtime);

  /* gmtime returns NULL when the time cannot be represented; strftime
     must not be called with a NULL struct tm. */
  if (timeinfo == NULL
      || strftime (timestamp, timestamp_size, "%Y-%m-%dT%H:%M:%SZ",
                   timeinfo) == 0)
    {
      if (timestamp_size > 0)
        *timestamp = 0;
    }

  return timestamp;
}
648 
649 /* Fills urn_str with a UUID in the format required
650    for the WARC-Record-Id header.
651    The string will be 47 characters long. */
652 #if HAVE_LIBUUID
653 void
warc_uuid_str(char * urn_str,size_t urn_size)654 warc_uuid_str (char *urn_str, size_t urn_size)
655 {
656   char uuid_str[37];
657   uuid_t record_id;
658 
659   uuid_generate (record_id);
660   uuid_unparse (record_id, uuid_str);
661 
662   snprintf (urn_str, urn_size, "<urn:uuid:%s>", uuid_str);
663 }
664 #elif HAVE_UUID_CREATE
665 void
warc_uuid_str(char * urn_str,size_t urn_size)666 warc_uuid_str (char *urn_str, size_t urn_size)
667 {
668   char *uuid_str;
669   uuid_t record_id;
670 
671   uuid_create (&record_id, NULL);
672   uuid_to_string (&record_id, &uuid_str, NULL);
673 
674   snprintf (urn_str, urn_size, "<urn:uuid:%s>", uuid_str);
675   xfree (uuid_str);
676 }
677 #else
678 # ifdef WINDOWS
679 
680 typedef RPC_STATUS (RPC_ENTRY * UuidCreate_proc) (UUID *);
681 typedef RPC_STATUS (RPC_ENTRY * UuidToString_proc) (UUID *, unsigned char **);
682 typedef RPC_STATUS (RPC_ENTRY * RpcStringFree_proc) (unsigned char **);
683 
684 static int
windows_uuid_str(char * urn_str,size_t urn_size)685 windows_uuid_str (char *urn_str, size_t urn_size)
686 {
687   static UuidCreate_proc pfn_UuidCreate = NULL;
688   static UuidToString_proc pfn_UuidToString = NULL;
689   static RpcStringFree_proc pfn_RpcStringFree = NULL;
690   static int rpc_uuid_avail = -1;
691 
692   /* Rpcrt4.dll is not available on older versions of Windows, so we
693      need to test its availability at run time.  */
694   if (rpc_uuid_avail == -1)
695     {
696       HMODULE hm_rpcrt4 = LoadLibrary ("Rpcrt4.dll");
697 
698       if (hm_rpcrt4)
699 	{
700 	  pfn_UuidCreate =
701 	    (UuidCreate_proc) GetProcAddress (hm_rpcrt4, "UuidCreate");
702 	  pfn_UuidToString =
703 	    (UuidToString_proc) GetProcAddress (hm_rpcrt4, "UuidToStringA");
704 	  pfn_RpcStringFree =
705 	    (RpcStringFree_proc) GetProcAddress (hm_rpcrt4, "RpcStringFreeA");
706 	  if (pfn_UuidCreate && pfn_UuidToString && pfn_RpcStringFree)
707 	    rpc_uuid_avail = 1;
708 	  else
709 	    rpc_uuid_avail = 0;
710 	}
711       else
712 	rpc_uuid_avail = 0;
713     }
714 
715   if (rpc_uuid_avail)
716     {
717       BYTE *uuid_str;
718       UUID  uuid;
719 
720       if (pfn_UuidCreate (&uuid) == RPC_S_OK)
721 	{
722 	  if (pfn_UuidToString (&uuid, &uuid_str) == RPC_S_OK)
723 	    {
724 	      snprintf (urn_str, urn_size, "<urn:uuid:%s>", uuid_str);
725 	      pfn_RpcStringFree (&uuid_str);
726 	      return 1;
727 	    }
728 	}
729     }
730   return 0;
731 }
732 #endif
/* Fallback variant: stores a UUID made of random numbers as
   "<urn:uuid:...>" in URN_STR, a buffer of URN_SIZE bytes, in the
   format required for the WARC-Record-Id header.
   (See RFC 4122, UUID version 4.)

   On Windows the native method (windows_uuid_str) is tried first and
   the random numbers are only used if it fails.

   Note: this is a fallback method, it is much better to use the
   methods provided by libuuid.

   The string will be 47 characters long.
   NOTE(review): the bytes come from random_number (255); check in its
   definition whether that range includes the value 255.  */
void
warc_uuid_str (char *urn_str, size_t urn_size)
{
  unsigned char octets[16];
  int idx = 0;

#ifdef WINDOWS
  /* If the native method fails (expected on older Windows versions),
     use the fallback below.  */
  if (windows_uuid_str (urn_str, urn_size))
    return;
#endif

  while (idx < 16)
    {
      octets[idx] = random_number (255);
      idx++;
    }

  /* Version: the high nibble of byte 6 (bits 12 through 15 of the
     time_hi_and_version field) is set to 4.  */
  octets[6] &= 0x0F;
  octets[6] |= 0x40;

  /* Variant: bit 6 of byte 8 (clock_seq_hi_and_reserved) is cleared
     and bit 7 is set.  */
  octets[8] &= 0xBF;
  octets[8] |= 0x80;

  snprintf (urn_str, urn_size,
    "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>",
    octets[0], octets[1], octets[2], octets[3], octets[4],
    octets[5], octets[6], octets[7], octets[8], octets[9],
    octets[10], octets[11], octets[12], octets[13], octets[14],
    octets[15]);
}
774 #endif
775 
776 /* Write a warcinfo record to the current file.
777    Updates warc_current_warcinfo_uuid_str. */
778 static bool
warc_write_warcinfo_record(const char * filename)779 warc_write_warcinfo_record (const char *filename)
780 {
781   FILE *warc_tmp;
782   char timestamp[22];
783   char *filename_basename;
784 
785   /* Write warc-info record as the first record of the file. */
786   /* We add the record id of this info record to the other records in the
787      file. */
788   warc_uuid_str (warc_current_warcinfo_uuid_str, sizeof (warc_current_warcinfo_uuid_str));
789 
790   warc_timestamp (timestamp, sizeof(timestamp));
791 
792   filename_basename = base_name (filename);
793 
794   warc_write_start_record ();
795   warc_write_header ("WARC-Type", "warcinfo");
796   warc_write_header ("Content-Type", "application/warc-fields");
797   warc_write_header ("WARC-Date", timestamp);
798   warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
799   warc_write_header ("WARC-Filename", filename_basename);
800 
801   xfree (filename_basename);
802 
803   /* Create content.  */
804   warc_tmp = warc_tempfile ();
805   if (warc_tmp == NULL)
806     {
807       return false;
808     }
809 
810   fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
811   fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
812   fprintf (warc_tmp,
813 "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
814   fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
815   fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
816   /* Add the user headers, if any. */
817   if (opt.warc_user_headers)
818     {
819       int i;
820       for (i = 0; opt.warc_user_headers[i]; i++)
821         fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
822     }
823   fprintf(warc_tmp, "\r\n");
824 
825   warc_write_digest_headers (warc_tmp, -1);
826   warc_write_block_from_file (warc_tmp);
827   warc_write_end_record ();
828 
829   if (! warc_write_ok)
830     logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
831 
832   fclose (warc_tmp);
833   return warc_write_ok;
834 }
835 
836 /* Opens a new WARC file.
837    If META is true, generates a filename ending with 'meta.warc.gz'.
838 
839    This method will:
840    1. close the current WARC file (if there is one);
841    2. increment warc_current_file_number;
842    3. open a new WARC file;
843    4. write the initial warcinfo record.
844 
845    Returns true on success, false otherwise.
846    */
847 static bool
warc_start_new_file(bool meta)848 warc_start_new_file (bool meta)
849 {
850 #ifdef __VMS
851 # define WARC_GZ "warc-gz"
852 #else /* def __VMS */
853 # define WARC_GZ "warc.gz"
854 #endif /* def __VMS [else] */
855 
856 #ifdef HAVE_LIBZ
857   const char *extension = (opt.warc_compression_enabled ? WARC_GZ : "warc");
858 #else
859   const char *extension = "warc";
860 #endif
861 
862   int base_filename_length;
863   char *new_filename;
864 
865   if (opt.warc_filename == NULL)
866     return false;
867 
868   if (warc_current_file != NULL)
869     fclose (warc_current_file);
870 
871   *warc_current_warcinfo_uuid_str = 0;
872   xfree (warc_current_filename);
873 
874   warc_current_file_number++;
875 
876   base_filename_length = strlen (opt.warc_filename);
877   /* filename format:  base + "-" + 5 digit serial number + ".warc.gz" */
878   new_filename = xmalloc (base_filename_length + 1 + 5 + 8 + 1);
879 
880   warc_current_filename = new_filename;
881 
882   /* If max size is enabled, we add a serial number to the file names. */
883   if (meta)
884     sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
885   else if (opt.warc_maxsize > 0)
886     {
887       sprintf (new_filename, "%s-%05d.%s", opt.warc_filename,
888                warc_current_file_number, extension);
889     }
890   else
891     sprintf (new_filename, "%s.%s", opt.warc_filename, extension);
892 
893   logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
894 
895   /* Open the WARC file. */
896   warc_current_file = fopen (new_filename, "wb+");
897   if (warc_current_file == NULL)
898     {
899       logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"),
900                  quote (new_filename));
901       return false;
902     }
903 
904   if (! warc_write_warcinfo_record (new_filename))
905     return false;
906 
907   /* Add warcinfo uuid to manifest. */
908   if (warc_manifest_fp)
909     fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
910 
911   return true;
912 }
913 
914 /* Opens the CDX file for output. */
915 static bool
warc_start_cdx_file(void)916 warc_start_cdx_file (void)
917 {
918   char *cdx_filename = aprintf("%s.cdx", opt.warc_filename);
919   warc_current_cdx_file = fopen (cdx_filename, "a+");
920   free(cdx_filename);
921 
922   if (warc_current_cdx_file == NULL)
923     return false;
924 
925   /* Print the CDX header.
926    *
927    * a - original url
928    * b - date
929    * m - mime type
930    * s - response code
931    * k - new style checksum
932    * r - redirect
933    * M - meta tags
934    * V - compressed arc file offset
935    * g - file name
936    * u - record-id
937    */
938   fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
939   fflush (warc_current_cdx_file);
940 
941   return true;
942 }
943 
944 #define CDX_FIELDSEP " \t\r\n"
945 
946 /* Parse the CDX header and find the field numbers of the original url,
947    checksum and record ID fields. */
948 static bool
warc_parse_cdx_header(char * lineptr,int * field_num_original_url,int * field_num_checksum,int * field_num_record_id)949 warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
950                        int *field_num_checksum, int *field_num_record_id)
951 {
952   char *token;
953   char *save_ptr;
954 
955   *field_num_original_url = -1;
956   *field_num_checksum = -1;
957   *field_num_record_id = -1;
958 
959   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
960 
961   if (token != NULL && strcmp (token, "CDX") == 0)
962     {
963       int field_num = 0;
964       while (token != NULL)
965         {
966           token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
967           if (token != NULL)
968             {
969               switch (token[0])
970                 {
971                 case 'a':
972                   *field_num_original_url = field_num;
973                   break;
974                 case 'k':
975                   *field_num_checksum = field_num;
976                   break;
977                 case 'u':
978                   *field_num_record_id = field_num;
979                   break;
980                 }
981             }
982           field_num++;
983         }
984     }
985 
986   return *field_num_original_url != -1
987          && *field_num_checksum != -1
988          && *field_num_record_id != -1;
989 }
990 
991 /* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. */
992 static void
warc_process_cdx_line(char * lineptr,int field_num_original_url,int field_num_checksum,int field_num_record_id)993 warc_process_cdx_line (char *lineptr, int field_num_original_url,
994                        int field_num_checksum, int field_num_record_id)
995 {
996   char *original_url = NULL;
997   char *checksum = NULL;
998   char *record_id = NULL;
999   char *token;
1000   char *save_ptr;
1001   int field_num = 0;
1002 
1003   /* Read this line to get the fields we need. */
1004   token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
1005   while (token != NULL)
1006     {
1007       char **val;
1008       if (field_num == field_num_original_url)
1009         val = &original_url;
1010       else if (field_num == field_num_checksum)
1011         val = &checksum;
1012       else if (field_num == field_num_record_id)
1013         val = &record_id;
1014       else
1015         val = NULL;
1016 
1017       if (val != NULL)
1018         *val = strdup (token);
1019 
1020       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr);
1021       field_num++;
1022     }
1023 
1024   if (original_url != NULL && checksum != NULL && record_id != NULL)
1025     {
1026       /* For some extra efficiency, we decode the base32 encoded
1027          checksum value.  This should produce exactly SHA1_DIGEST_SIZE
1028          bytes.  */
1029       size_t checksum_l;
1030       char * checksum_v;
1031       base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
1032                            &checksum_l);
1033       xfree (checksum);
1034 
1035       if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
1036         {
1037           /* This is a valid line with a valid checksum. */
1038           struct warc_cdx_record *rec;
1039           rec = xmalloc (sizeof (struct warc_cdx_record));
1040           rec->url = original_url;
1041           rec->uuid = record_id;
1042           memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
1043           hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
1044           xfree (checksum_v);
1045         }
1046       else
1047         {
1048           xfree (original_url);
1049           xfree (checksum_v);
1050           xfree (record_id);
1051         }
1052     }
1053   else
1054     {
1055       xfree(checksum);
1056       xfree(original_url);
1057       xfree(record_id);
1058     }
1059 }
1060 
1061 /* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
1062    the warc_cdx_dedup_table. */
1063 static bool
warc_load_cdx_dedup_file(void)1064 warc_load_cdx_dedup_file (void)
1065 {
1066   FILE *f;
1067   char *lineptr = NULL;
1068   size_t n = 0;
1069   ssize_t line_length;
1070   int field_num_original_url = -1;
1071   int field_num_checksum = -1;
1072   int field_num_record_id = -1;
1073 
1074   f = fopen (opt.warc_cdx_dedup_filename, "r");
1075   if (f == NULL)
1076     return false;
1077 
1078   /* The first line should contain the CDX header.
1079      Format:  " CDX x x x x x"
1080      where x are field type indicators.  For our purposes, we only
1081      need 'a' (the original url), 'k' (the SHA1 checksum) and
1082      'u' (the WARC record id). */
1083   line_length = getline (&lineptr, &n, f);
1084   if (line_length != -1)
1085     warc_parse_cdx_header (lineptr, &field_num_original_url,
1086                            &field_num_checksum, &field_num_record_id);
1087 
1088   /* If the file contains all three fields, read the complete file. */
1089   if (field_num_original_url == -1
1090       || field_num_checksum == -1
1091       || field_num_record_id == -1)
1092     {
1093       if (field_num_original_url == -1)
1094         logprintf (LOG_NOTQUIET,
1095 _("CDX file does not list original urls. (Missing column 'a'.)\n"));
1096       if (field_num_checksum == -1)
1097         logprintf (LOG_NOTQUIET,
1098 _("CDX file does not list checksums. (Missing column 'k'.)\n"));
1099       if (field_num_record_id == -1)
1100         logprintf (LOG_NOTQUIET,
1101 _("CDX file does not list record ids. (Missing column 'u'.)\n"));
1102     }
1103   else
1104     {
1105       int nrecords;
1106 
1107       /* Initialize the table. */
1108       warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
1109                                              warc_cmp_sha1_digest);
1110 
1111       do
1112         {
1113           line_length = getline (&lineptr, &n, f);
1114           if (line_length != -1)
1115             {
1116               warc_process_cdx_line (lineptr, field_num_original_url,
1117                             field_num_checksum, field_num_record_id);
1118             }
1119 
1120         }
1121       while (line_length != -1);
1122 
1123       /* Print results. */
1124       nrecords = hash_table_count (warc_cdx_dedup_table);
1125       logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
1126                                         "Loaded %d records from CDX.\n\n",
1127                                          nrecords),
1128                               nrecords);
1129     }
1130 
1131   xfree (lineptr);
1132   fclose (f);
1133 
1134   return true;
1135 }
1136 #undef CDX_FIELDSEP
1137 
1138 /* Returns the existing duplicate CDX record for the given url and payload
1139    digest.  Returns NULL if the url is not found or if the payload digest
1140    does not match, or if CDX deduplication is disabled. */
1141 static struct warc_cdx_record *
warc_find_duplicate_cdx_record(const char * url,char * sha1_digest_payload)1142 warc_find_duplicate_cdx_record (const char *url, char *sha1_digest_payload)
1143 {
1144   struct warc_cdx_record *rec_existing;
1145 
1146   if (warc_cdx_dedup_table == NULL)
1147     return NULL;
1148 
1149   rec_existing = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
1150 
1151   if (rec_existing && strcmp (rec_existing->url, url) == 0)
1152     return rec_existing;
1153   else
1154     return NULL;
1155 }
1156 
1157 /* Initializes the WARC writer (if opt.warc_filename is set).
1158    This should be called before any WARC record is written. */
1159 void
warc_init(void)1160 warc_init (void)
1161 {
1162   warc_write_ok = true;
1163 
1164   if (opt.warc_filename != NULL)
1165     {
1166       if (opt.warc_cdx_dedup_filename != NULL)
1167         {
1168           if (! warc_load_cdx_dedup_file ())
1169             {
1170               logprintf (LOG_NOTQUIET,
1171                          _("Could not read CDX file %s for deduplication.\n"),
1172                          quote (opt.warc_cdx_dedup_filename));
1173               exit (WGET_EXIT_GENERIC_ERROR);
1174             }
1175         }
1176 
1177       warc_manifest_fp = warc_tempfile ();
1178       if (warc_manifest_fp == NULL)
1179         {
1180           logprintf (LOG_NOTQUIET,
1181                      _("Could not open temporary WARC manifest file.\n"));
1182           exit (WGET_EXIT_GENERIC_ERROR);
1183         }
1184 
1185       if (opt.warc_keep_log)
1186         {
1187           warc_log_fp = warc_tempfile ();
1188           if (warc_log_fp == NULL)
1189             {
1190               logprintf (LOG_NOTQUIET,
1191                          _("Could not open temporary WARC log file.\n"));
1192               exit (WGET_EXIT_GENERIC_ERROR);
1193             }
1194           log_set_warc_log_fp (warc_log_fp);
1195         }
1196 
1197       warc_current_file_number = -1;
1198       if (! warc_start_new_file (false))
1199         {
1200           logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
1201           exit (WGET_EXIT_GENERIC_ERROR);
1202         }
1203 
1204       if (opt.warc_cdx_enabled)
1205         {
1206           if (! warc_start_cdx_file ())
1207             {
1208               logprintf (LOG_NOTQUIET,
1209                          _("Could not open CDX file for output.\n"));
1210               exit (WGET_EXIT_GENERIC_ERROR);
1211             }
1212         }
1213     }
1214 }
1215 
1216 /* Writes metadata (manifest, configuration, log file) to the WARC file. */
1217 static void
warc_write_metadata(void)1218 warc_write_metadata (void)
1219 {
1220   char manifest_uuid[48];
1221   FILE *warc_tmp_fp;
1222 
1223   /* If there are multiple WARC files, the metadata should be written to a separate file. */
1224   if (opt.warc_maxsize > 0)
1225     warc_start_new_file (true);
1226 
1227   warc_uuid_str (manifest_uuid, sizeof (manifest_uuid));
1228 
1229   fflush (warc_manifest_fp);
1230   warc_write_metadata_record (manifest_uuid,
1231                               "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
1232                               NULL, NULL, NULL, "text/plain",
1233                               warc_manifest_fp, -1);
1234   /* warc_write_resource_record has closed warc_manifest_fp. */
1235 
1236   warc_tmp_fp = warc_tempfile ();
1237   if (warc_tmp_fp == NULL)
1238     {
1239       logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
1240       exit (WGET_EXIT_GENERIC_ERROR);
1241     }
1242   fflush (warc_tmp_fp);
1243   fprintf (warc_tmp_fp, "%s\n", program_argstring);
1244 
1245   warc_write_resource_record (NULL,
1246                    "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
1247                               NULL, manifest_uuid, NULL, "text/plain",
1248                               warc_tmp_fp, -1);
1249   /* warc_write_resource_record has closed warc_tmp_fp. */
1250 
1251   if (warc_log_fp != NULL)
1252     {
1253       warc_write_resource_record (NULL,
1254                               "metadata://gnu.org/software/wget/warc/wget.log",
1255                                   NULL, manifest_uuid, NULL, "text/plain",
1256                                   warc_log_fp, -1);
1257       /* warc_write_resource_record has closed warc_log_fp. */
1258 
1259       warc_log_fp = NULL;
1260       log_set_warc_log_fp (NULL);
1261     }
1262 }
1263 
1264 /* Finishes the WARC writing.
1265    This should be called at the end of the program. */
1266 void
warc_close(void)1267 warc_close (void)
1268 {
1269   if (warc_current_file != NULL)
1270     {
1271       warc_write_metadata ();
1272       *warc_current_warcinfo_uuid_str = 0;
1273       fclose (warc_current_file);
1274       warc_current_file = NULL;
1275     }
1276 
1277   if (warc_current_cdx_file != NULL)
1278     {
1279       fclose (warc_current_cdx_file);
1280       warc_current_cdx_file = NULL;
1281     }
1282 
1283   if (warc_log_fp != NULL)
1284     {
1285       fclose (warc_log_fp);
1286       log_set_warc_log_fp (NULL);
1287     }
1288 }
1289 
1290 /* Creates a temporary file for writing WARC output.
1291    The temporary file will be created in opt.warc_tempdir.
1292    Returns the pointer to the temporary file, or NULL. */
1293 FILE *
warc_tempfile(void)1294 warc_tempfile (void)
1295 {
1296   char filename[100];
1297   int fd;
1298 
1299   if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
1300     return NULL;
1301 
1302 #ifdef __VMS
1303   /* 2013-07-12 SMS.
1304    * mkostemp()+unlink()+fdopen() scheme causes trouble on VMS, so use
1305    * mktemp() to uniquify the (VMS-style) name, and then use a normal
1306    * fopen() with a "create temp file marked for delete" option.
1307    */
1308   {
1309     char *tfn;
1310 
1311     tfn = mktemp (filename);            /* Get unique name from template. */
1312     if (tfn == NULL)
1313       return NULL;
1314     return fopen (tfn, "w+", "fop=tmd");    /* Create auto-delete temp file. */
1315   }
1316 #else /* def __VMS */
1317   fd = mkostemp (filename, O_TEMPORARY);
1318   if (fd < 0)
1319     return NULL;
1320 
1321 #if !O_TEMPORARY
1322   if (unlink (filename) < 0)
1323     {
1324       close(fd);
1325       return NULL;
1326     }
1327 #endif
1328 
1329   return fdopen (fd, "wb+");
1330 #endif /* def __VMS [else] */
1331 }
1332 
1333 
1334 /* Writes a request record to the WARC file.
1335    url  is the target uri of the request,
1336    timestamp_str  is the timestamp of the request (generated with warc_timestamp),
1337    record_uuid  is the uuid of the request (generated with warc_uuid_str),
1338    body  is a pointer to a file containing the request headers and body.
1339    ip  is the ip address of the server (or NULL),
1340    Calling this function will close body.
1341    Returns true on success, false on error. */
1342 bool
warc_write_request_record(const char * url,const char * timestamp_str,const char * record_uuid,const ip_address * ip,FILE * body,off_t payload_offset)1343 warc_write_request_record (const char *url, const char *timestamp_str,
1344                            const char *record_uuid, const ip_address *ip,
1345                            FILE *body, off_t payload_offset)
1346 {
1347   warc_write_start_record ();
1348   warc_write_header ("WARC-Type", "request");
1349   warc_write_header_uri ("WARC-Target-URI", url);
1350   warc_write_header ("Content-Type", "application/http;msgtype=request");
1351   warc_write_date_header (timestamp_str);
1352   warc_write_header ("WARC-Record-ID", record_uuid);
1353   warc_write_ip_header (ip);
1354   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1355   warc_write_digest_headers (body, payload_offset);
1356   warc_write_block_from_file (body);
1357   warc_write_end_record ();
1358 
1359   fclose (body);
1360 
1361   return warc_write_ok;
1362 }
1363 
1364 /* Writes a response record to the CDX file.
1365    url  is the target uri of the request/response,
1366    timestamp_str  is the timestamp of the request that generated this response,
1367                   (generated with warc_timestamp),
1368    mime_type  is the mime type of the response body (will be printed to CDX),
1369    response_code  is the HTTP response code (will be printed to CDX),
1370    payload_digest  is the sha1 digest of the payload,
1371    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1372    offset  is the position of the WARC record in the WARC file,
1373    warc_filename  is the filename of the WARC,
1374    response_uuid  is the uuid of the response.
1375    Returns true on success, false on error. */
1376 static bool
warc_write_cdx_record(const char * url,const char * timestamp_str,const char * mime_type,int response_code,const char * payload_digest,const char * redirect_location,off_t offset,const char * warc_filename _GL_UNUSED,const char * response_uuid)1377 warc_write_cdx_record (const char *url, const char *timestamp_str,
1378                        const char *mime_type, int response_code,
1379                        const char *payload_digest, const char *redirect_location,
1380                        off_t offset, const char *warc_filename _GL_UNUSED,
1381                        const char *response_uuid)
1382 {
1383   /* Transform the timestamp. */
1384   char timestamp_str_cdx[15];
1385   char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
1386   const char *checksum;
1387   char *tmp_location = NULL;
1388 
1389   memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
1390   memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
1391   memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
1392   memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
1393   memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
1394   memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
1395   timestamp_str_cdx[14] = '\0';
1396 
1397   /* Rewrite the checksum. */
1398   if (payload_digest != NULL)
1399     checksum = payload_digest + 5; /* Skip the "sha1:" */
1400   else
1401     checksum = "-";
1402 
1403   if (mime_type == NULL || strlen(mime_type) == 0)
1404     mime_type = "-";
1405   if (redirect_location == NULL || strlen(redirect_location) == 0)
1406     tmp_location = strdup ("-");
1407   else
1408     tmp_location = url_escape(redirect_location);
1409 
1410   number_to_string (offset_string, offset);
1411 
1412   /* Print the CDX line. */
1413   fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %s %s %s\n", url,
1414            timestamp_str_cdx, url, mime_type, response_code, checksum,
1415            tmp_location, offset_string, warc_current_filename,
1416            response_uuid);
1417   fflush (warc_current_cdx_file);
1418   free (tmp_location);
1419 
1420   return true;
1421 }
1422 
1423 /* Writes a revisit record to the WARC file.
1424    url  is the target uri of the request/response,
1425    timestamp_str  is the timestamp of the request that generated this response
1426                   (generated with warc_timestamp),
1427    concurrent_to_uuid  is the uuid of the request for that generated this response
1428                  (generated with warc_uuid_str),
1429    refers_to_uuid  is the uuid of the original response
1430                  (generated with warc_uuid_str),
1431    payload_digest  is the sha1 digest of the payload,
1432    ip  is the ip address of the server (or NULL),
1433    body  is a pointer to a file containing the response headers (without payload).
1434    Calling this function will close body.
1435    Returns true on success, false on error. */
1436 static bool
warc_write_revisit_record(const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const char * payload_digest,const char * refers_to,const ip_address * ip,FILE * body)1437 warc_write_revisit_record (const char *url, const char *timestamp_str,
1438                            const char *concurrent_to_uuid, const char *payload_digest,
1439                            const char *refers_to, const ip_address *ip, FILE *body)
1440 {
1441   char revisit_uuid [48];
1442   char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
1443   char sha1_res_block[SHA1_DIGEST_SIZE];
1444 
1445   warc_uuid_str (revisit_uuid, sizeof (revisit_uuid));
1446 
1447   sha1_stream (body, sha1_res_block);
1448   warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest));
1449 
1450   warc_write_start_record ();
1451   warc_write_header ("WARC-Type", "revisit");
1452   warc_write_header ("WARC-Record-ID", revisit_uuid);
1453   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1454   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1455   warc_write_header ("WARC-Refers-To", refers_to);
1456   warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
1457   warc_write_header ("WARC-Truncated", "length");
1458   warc_write_header_uri ("WARC-Target-URI", url);
1459   warc_write_date_header (timestamp_str);
1460   warc_write_ip_header (ip);
1461   warc_write_header ("Content-Type", "application/http;msgtype=response");
1462   warc_write_header ("WARC-Block-Digest", block_digest);
1463   warc_write_header ("WARC-Payload-Digest", payload_digest);
1464   warc_write_block_from_file (body);
1465   warc_write_end_record ();
1466 
1467   fclose (body);
1468 
1469   return warc_write_ok;
1470 }
1471 
1472 /* Writes a response record to the WARC file.
1473    url  is the target uri of the request/response,
1474    timestamp_str  is the timestamp of the request that generated this response
1475                   (generated with warc_timestamp),
1476    concurrent_to_uuid  is the uuid of the request for that generated this response
1477                  (generated with warc_uuid_str),
1478    ip  is the ip address of the server (or NULL),
1479    body  is a pointer to a file containing the response headers and body.
1480    mime_type  is the mime type of the response body (will be printed to CDX),
1481    response_code  is the HTTP response code (will be printed to CDX),
1482    redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
1483    Calling this function will close body.
1484    Returns true on success, false on error. */
1485 bool
warc_write_response_record(const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const ip_address * ip,FILE * body,off_t payload_offset,const char * mime_type,int response_code,const char * redirect_location)1486 warc_write_response_record (const char *url, const char *timestamp_str,
1487                             const char *concurrent_to_uuid, const ip_address *ip,
1488                             FILE *body, off_t payload_offset, const char *mime_type,
1489                             int response_code, const char *redirect_location)
1490 {
1491   char block_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
1492   char payload_digest[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5];
1493   char sha1_res_block[SHA1_DIGEST_SIZE];
1494   char sha1_res_payload[SHA1_DIGEST_SIZE];
1495   char response_uuid [48];
1496   off_t offset;
1497 
1498   if (opt.warc_digests_enabled)
1499     {
1500       /* Calculate the block and payload digests. */
1501       rewind (body);
1502       if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload,
1503           payload_offset) == 0)
1504         {
1505           /* Decide (based on url + payload digest) if we have seen this
1506              data before. */
1507           struct warc_cdx_record *rec_existing;
1508           rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
1509           if (rec_existing != NULL)
1510             {
1511               bool result;
1512 
1513               /* Found an existing record. */
1514               logprintf (LOG_VERBOSE,
1515           _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
1516 
1517               /* Remove the payload from the file. */
1518               if (payload_offset > 0)
1519                 {
1520                   if (ftruncate (fileno (body), payload_offset) == -1)
1521                     return false;
1522                 }
1523 
1524               /* Send the original payload digest. */
1525               warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest));
1526               result = warc_write_revisit_record (url, timestamp_str,
1527                          concurrent_to_uuid, payload_digest, rec_existing->uuid,
1528                          ip, body);
1529 
1530               return result;
1531             }
1532 
1533           warc_base32_sha1_digest (sha1_res_block, block_digest, sizeof(block_digest));
1534           warc_base32_sha1_digest (sha1_res_payload, payload_digest, sizeof(payload_digest));
1535         }
1536     }
1537 
1538   /* Not a revisit, just store the record. */
1539 
1540   warc_uuid_str (response_uuid, sizeof (response_uuid));
1541 
1542   fseeko (warc_current_file, 0L, SEEK_END);
1543   offset = ftello (warc_current_file);
1544 
1545   warc_write_start_record ();
1546   warc_write_header ("WARC-Type", "response");
1547   warc_write_header ("WARC-Record-ID", response_uuid);
1548   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1549   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1550   warc_write_header_uri ("WARC-Target-URI", url);
1551   warc_write_date_header (timestamp_str);
1552   warc_write_ip_header (ip);
1553   warc_write_header ("WARC-Block-Digest", block_digest);
1554   warc_write_header ("WARC-Payload-Digest", payload_digest);
1555   warc_write_header ("Content-Type", "application/http;msgtype=response");
1556   warc_write_block_from_file (body);
1557   warc_write_end_record ();
1558 
1559   fclose (body);
1560 
1561   if (warc_write_ok && opt.warc_cdx_enabled)
1562     {
1563       /* Add this record to the CDX. */
1564       warc_write_cdx_record (url, timestamp_str, mime_type, response_code,
1565       payload_digest, redirect_location, offset, warc_current_filename,
1566       response_uuid);
1567     }
1568 
1569   return warc_write_ok;
1570 }
1571 
1572 /* Writes a resource or metadata record to the WARC file.
1573    warc_type  is either "resource" or "metadata",
1574    resource_uuid  is the uuid of the resource (or NULL),
1575    url  is the target uri of the resource,
1576    timestamp_str  is the timestamp (generated with warc_timestamp),
1577    concurrent_to_uuid  is the uuid of the record that generated this,
1578    resource (generated with warc_uuid_str) or NULL,
1579    ip  is the ip address of the server (or NULL),
1580    content_type  is the mime type of the body (or NULL),
1581    body  is a pointer to a file containing the resource data.
1582    Calling this function will close body.
1583    Returns true on success, false on error. */
1584 static bool
warc_write_record(const char * record_type,const char * resource_uuid,const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const ip_address * ip,const char * content_type,FILE * body,off_t payload_offset)1585 warc_write_record (const char *record_type, const char *resource_uuid,
1586                  const char *url, const char *timestamp_str,
1587                  const char *concurrent_to_uuid,
1588                  const ip_address *ip, const char *content_type, FILE *body,
1589                  off_t payload_offset)
1590 {
1591   char uuid_buf[48];
1592 
1593   if (resource_uuid == NULL)
1594     {
1595       warc_uuid_str (uuid_buf, sizeof (uuid_buf));
1596       resource_uuid = uuid_buf;
1597     }
1598 
1599   if (content_type == NULL)
1600     content_type = "application/octet-stream";
1601 
1602   warc_write_start_record ();
1603   warc_write_header ("WARC-Type", record_type);
1604   warc_write_header ("WARC-Record-ID", resource_uuid);
1605   warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
1606   warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
1607   warc_write_header_uri ("WARC-Target-URI", url);
1608   warc_write_date_header (timestamp_str);
1609   warc_write_ip_header (ip);
1610   warc_write_digest_headers (body, payload_offset);
1611   warc_write_header ("Content-Type", content_type);
1612   warc_write_block_from_file (body);
1613   warc_write_end_record ();
1614 
1615   fclose (body);
1616 
1617   return warc_write_ok;
1618 }
1619 
1620 /* Writes a resource record to the WARC file.
1621    resource_uuid  is the uuid of the resource (or NULL),
1622    url  is the target uri of the resource,
1623    timestamp_str  is the timestamp (generated with warc_timestamp),
1624    concurrent_to_uuid  is the uuid of the record that generated this,
1625    resource (generated with warc_uuid_str) or NULL,
1626    ip  is the ip address of the server (or NULL),
1627    content_type  is the mime type of the body (or NULL),
1628    body  is a pointer to a file containing the resource data.
1629    Calling this function will close body.
1630    Returns true on success, false on error. */
1631 bool
warc_write_resource_record(const char * resource_uuid,const char * url,const char * timestamp_str,const char * concurrent_to_uuid,const ip_address * ip,const char * content_type,FILE * body,off_t payload_offset)1632 warc_write_resource_record (const char *resource_uuid, const char *url,
1633                  const char *timestamp_str, const char *concurrent_to_uuid,
1634                  const ip_address *ip, const char *content_type, FILE *body,
1635                  off_t payload_offset)
1636 {
1637   return warc_write_record ("resource",
1638       resource_uuid, url, timestamp_str, concurrent_to_uuid,
1639       ip, content_type, body, payload_offset);
1640 }
1641 
1642 /* Writes a metadata record to the WARC file.
1643    record_uuid  is the uuid of the record (or NULL),
1644    url  is the target uri of the record,
1645    timestamp_str  is the timestamp (generated with warc_timestamp),
1646    concurrent_to_uuid  is the uuid of the record that generated this,
1647    record (generated with warc_uuid_str) or NULL,
1648    ip  is the ip address of the server (or NULL),
1649    content_type  is the mime type of the body (or NULL),
1650    body  is a pointer to a file containing the record data.
1651    Calling this function will close body.
1652    Returns true on success, false on error. */
1653 bool
warc_write_metadata_record(const char * record_uuid,const char * url,const char * timestamp_str,const char * concurrent_to_uuid,ip_address * ip,const char * content_type,FILE * body,off_t payload_offset)1654 warc_write_metadata_record (const char *record_uuid, const char *url,
1655                  const char *timestamp_str, const char *concurrent_to_uuid,
1656                  ip_address *ip, const char *content_type, FILE *body,
1657                  off_t payload_offset)
1658 {
1659   return warc_write_record ("metadata",
1660       record_uuid, url, timestamp_str, concurrent_to_uuid,
1661       ip, content_type, body, payload_offset);
1662 }
1663