1 /*- 2 * Copyright (c) 2014 Sebastian Freundt 3 * Author: Sebastian Freundt <devel@fresse.org> 4 * 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "archive_platform.h" 29 __FBSDID("$FreeBSD$"); 30 31 #ifdef HAVE_ERRNO_H 32 #include <errno.h> 33 #endif 34 #include <stdio.h> 35 #ifdef HAVE_STDLIB_H 36 #include <stdlib.h> 37 #endif 38 #ifdef HAVE_STRING_H 39 #include <string.h> 40 #endif 41 #ifdef HAVE_TIME_H 42 #include <time.h> 43 #endif 44 45 #include "archive.h" 46 #include "archive_entry.h" 47 #include "archive_entry_locale.h" 48 #include "archive_private.h" 49 #include "archive_random_private.h" 50 #include "archive_write_private.h" 51 #include "archive_write_set_format_private.h" 52 53 struct warc_s { 54 unsigned int omit_warcinfo:1; 55 56 time_t now; 57 mode_t typ; 58 unsigned int rng; 59 /* populated size */ 60 uint64_t populz; 61 }; 62 63 static const char warcinfo[] = 64 "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n" 65 "format: WARC file version 1.0\r\n"; 66 67 typedef enum { 68 WT_NONE, 69 /* warcinfo */ 70 WT_INFO, 71 /* metadata */ 72 WT_META, 73 /* resource */ 74 WT_RSRC, 75 /* request, unsupported */ 76 WT_REQ, 77 /* response, unsupported */ 78 WT_RSP, 79 /* revisit, unsupported */ 80 WT_RVIS, 81 /* conversion, unsupported */ 82 WT_CONV, 83 /* continuation, unsupported at the moment */ 84 WT_CONT, 85 /* invalid type */ 86 LAST_WT 87 } warc_type_t; 88 89 typedef struct { 90 warc_type_t type; 91 const char *tgturi; 92 const char *recid; 93 time_t rtime; 94 time_t mtime; 95 const char *cnttyp; 96 uint64_t cntlen; 97 } warc_essential_hdr_t; 98 99 typedef struct { 100 unsigned int u[4U]; 101 } warc_uuid_t; 102 103 static int _warc_options(struct archive_write*, const char *key, const char *v); 104 static int _warc_header(struct archive_write *a, struct archive_entry *entry); 105 static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz); 106 static int _warc_finish_entry(struct archive_write *a); 107 static int _warc_close(struct archive_write *a); 108 static int _warc_free(struct archive_write *a); 109 110 /* private routines */ 111 static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t); 112 static int _gen_uuid(warc_uuid_t *tgt); 113 114 115 /* 116 * Set output format to ISO 28500 (aka WARC) format. 117 */ 118 int 119 archive_write_set_format_warc(struct archive *_a) 120 { 121 struct archive_write *a = (struct archive_write *)_a; 122 struct warc_s *w; 123 124 archive_check_magic(_a, ARCHIVE_WRITE_MAGIC, 125 ARCHIVE_STATE_NEW, "archive_write_set_format_warc"); 126 127 /* If another format was already registered, unregister it. */ 128 if (a->format_free != NULL) { 129 (a->format_free)(a); 130 } 131 132 w = malloc(sizeof(*w)); 133 if (w == NULL) { 134 archive_set_error(&a->archive, ENOMEM, 135 "Can't allocate warc data"); 136 return (ARCHIVE_FATAL); 137 } 138 /* by default we're emitting a file wide header */ 139 w->omit_warcinfo = 0U; 140 /* obtain current time for date fields */ 141 w->now = time(NULL); 142 /* reset file type info */ 143 w->typ = 0; 144 /* also initialise our rng */ 145 w->rng = (unsigned int)w->now; 146 147 a->format_data = w; 148 a->format_name = "WARC/1.0"; 149 a->format_options = _warc_options; 150 a->format_write_header = _warc_header; 151 a->format_write_data = _warc_data; 152 a->format_close = _warc_close; 153 a->format_free = _warc_free; 154 a->format_finish_entry = _warc_finish_entry; 155 a->archive.archive_format = ARCHIVE_FORMAT_WARC; 156 a->archive.archive_format_name = "WARC/1.0"; 157 return (ARCHIVE_OK); 158 } 159 160 161 /* archive methods */ 162 static int 163 _warc_options(struct archive_write *a, const char *key, const char *val) 164 { 165 struct warc_s *w = a->format_data; 166 167 if (strcmp(key, "omit-warcinfo") == 0) { 168 if (val == NULL || strcmp(val, "true") == 0) { 169 /* great */ 170 w->omit_warcinfo = 1U; 171 return (ARCHIVE_OK); 172 } 173 } 174 175 /* Note: The "warn" return is just to inform the options 176 * supervisor that we didn't handle it. It will generate 177 * a suitable error if no one used this option. */ 178 return (ARCHIVE_WARN); 179 } 180 181 static int 182 _warc_header(struct archive_write *a, struct archive_entry *entry) 183 { 184 struct warc_s *w = a->format_data; 185 struct archive_string hdr; 186 #define MAX_HDR_SIZE 512 187 188 /* check whether warcinfo record needs outputting */ 189 if (!w->omit_warcinfo) { 190 ssize_t r; 191 warc_essential_hdr_t wi = { 192 WT_INFO, 193 /*uri*/NULL, 194 /*urn*/NULL, 195 /*rtm*/0, 196 /*mtm*/0, 197 /*cty*/"application/warc-fields", 198 /*len*/sizeof(warcinfo) - 1U, 199 }; 200 wi.rtime = w->now; 201 wi.mtime = w->now; 202 203 archive_string_init(&hdr); 204 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi); 205 if (r >= 0) { 206 /* jackpot! */ 207 /* now also use HDR buffer for the actual warcinfo */ 208 archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1); 209 210 /* append end-of-record indicator */ 211 archive_strncat(&hdr, "\r\n\r\n", 4); 212 213 /* write to output stream */ 214 __archive_write_output(a, hdr.s, archive_strlen(&hdr)); 215 } 216 /* indicate we're done with file header writing */ 217 w->omit_warcinfo = 1U; 218 archive_string_free(&hdr); 219 } 220 221 if (archive_entry_pathname(entry) == NULL) { 222 archive_set_error(&a->archive, EINVAL, 223 "Invalid filename"); 224 return (ARCHIVE_WARN); 225 } 226 227 w->typ = archive_entry_filetype(entry); 228 w->populz = 0U; 229 if (w->typ == AE_IFREG) { 230 warc_essential_hdr_t rh = { 231 WT_RSRC, 232 /*uri*/NULL, 233 /*urn*/NULL, 234 /*rtm*/0, 235 /*mtm*/0, 236 /*cty*/NULL, 237 /*len*/0, 238 }; 239 ssize_t r; 240 rh.tgturi = archive_entry_pathname(entry); 241 rh.rtime = w->now; 242 rh.mtime = archive_entry_mtime(entry); 243 rh.cntlen = (size_t)archive_entry_size(entry); 244 245 archive_string_init(&hdr); 246 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh); 247 if (r < 0) { 248 /* don't bother */ 249 archive_set_error( 250 &a->archive, 251 ARCHIVE_ERRNO_FILE_FORMAT, 252 "cannot archive file"); 253 return (ARCHIVE_WARN); 254 } 255 /* otherwise append to output stream */ 256 __archive_write_output(a, hdr.s, r); 257 /* and let subsequent calls to _data() know about the size */ 258 w->populz = rh.cntlen; 259 archive_string_free(&hdr); 260 return (ARCHIVE_OK); 261 } 262 /* just resort to erroring as per Tim's advice */ 263 __archive_write_entry_filetype_unsupported( 264 &a->archive, entry, "WARC"); 265 return (ARCHIVE_FAILED); 266 } 267 268 static ssize_t 269 _warc_data(struct archive_write *a, const void *buf, size_t len) 270 { 271 struct warc_s *w = a->format_data; 272 273 if (w->typ == AE_IFREG) { 274 int rc; 275 276 /* never write more bytes than announced */ 277 if (len > w->populz) { 278 len = (size_t)w->populz; 279 } 280 281 /* now then, out we put the whole shebang */ 282 rc = __archive_write_output(a, buf, len); 283 if (rc != ARCHIVE_OK) { 284 return rc; 285 } 286 } 287 return len; 288 } 289 290 static int 291 _warc_finish_entry(struct archive_write *a) 292 { 293 static const char _eor[] = "\r\n\r\n"; 294 struct warc_s *w = a->format_data; 295 296 if (w->typ == AE_IFREG) { 297 int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U); 298 299 if (rc != ARCHIVE_OK) { 300 return rc; 301 } 302 } 303 /* reset type info */ 304 w->typ = 0; 305 return (ARCHIVE_OK); 306 } 307 308 static int 309 _warc_close(struct archive_write *a) 310 { 311 (void)a; /* UNUSED */ 312 return (ARCHIVE_OK); 313 } 314 315 static int 316 _warc_free(struct archive_write *a) 317 { 318 struct warc_s *w = a->format_data; 319 320 free(w); 321 a->format_data = NULL; 322 return (ARCHIVE_OK); 323 } 324 325 326 /* private routines */ 327 static void 328 xstrftime(struct archive_string *as, const char *fmt, time_t t) 329 { 330 /** like strftime(3) but for time_t objects */ 331 struct tm *rt; 332 #if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S) 333 struct tm timeHere; 334 #endif 335 #if defined(HAVE__GMTIME64_S) 336 errno_t terr; 337 __time64_t tmptime; 338 #endif 339 char strtime[100]; 340 size_t len; 341 342 #ifdef HAVE_GMTIME_R 343 if ((rt = gmtime_r(&t, &timeHere)) == NULL) 344 return; 345 #elif defined(HAVE__GMTIME64_S) 346 tmptime = t; 347 terr = _gmtime64_s(&timeHere, &tmptime); 348 if (terr) 349 rt = NULL; 350 else 351 rt = &timeHere; 352 #else 353 if ((rt = gmtime(&t)) == NULL) 354 return; 355 #endif 356 /* leave the hard yacker to our role model strftime() */ 357 len = strftime(strtime, sizeof(strtime)-1, fmt, rt); 358 archive_strncat(as, strtime, len); 359 } 360 361 static ssize_t 362 _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr) 363 { 364 static const char _ver[] = "WARC/1.0\r\n"; 365 static const char * const _typ[LAST_WT] = { 366 NULL, "warcinfo", "metadata", "resource", NULL 367 }; 368 char std_uuid[48U]; 369 370 if (hdr.type == WT_NONE || hdr.type > WT_RSRC) { 371 /* brilliant, how exactly did we get here? */ 372 return -1; 373 } 374 375 archive_strcpy(tgt, _ver); 376 377 archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]); 378 379 if (hdr.tgturi != NULL) { 380 /* check if there's a xyz:// */ 381 static const char _uri[] = ""; 382 static const char _fil[] = "file://"; 383 const char *u; 384 char *chk = strchr(hdr.tgturi, ':'); 385 386 if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') { 387 /* yep, it's definitely a URI */ 388 u = _uri; 389 } else { 390 /* hm, best to prepend file:// then */ 391 u = _fil; 392 } 393 archive_string_sprintf(tgt, 394 "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi); 395 } 396 397 /* record time is usually when the http is sent off, 398 * just treat the archive writing as such for a moment */ 399 xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime); 400 401 /* while we're at it, record the mtime */ 402 xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime); 403 404 if (hdr.recid == NULL) { 405 /* generate one, grrrr */ 406 warc_uuid_t u; 407 408 _gen_uuid(&u); 409 /* Unfortunately, archive_string_sprintf does not 410 * handle the minimum number following '%'. 411 * So we have to use snprintf function here instead 412 * of archive_string_snprintf function. */ 413 #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900) 414 #define snprintf _snprintf 415 #endif 416 snprintf( 417 std_uuid, sizeof(std_uuid), 418 "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>", 419 u.u[0U], 420 u.u[1U] >> 16U, u.u[1U] & 0xffffU, 421 u.u[2U] >> 16U, u.u[2U] & 0xffffU, 422 u.u[3U]); 423 hdr.recid = std_uuid; 424 } 425 426 /* record-id is mandatory, fingers crossed we won't fail */ 427 archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid); 428 429 if (hdr.cnttyp != NULL) { 430 archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp); 431 } 432 433 /* next one is mandatory */ 434 archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen); 435 /**/ 436 archive_strncat(tgt, "\r\n", 2); 437 438 return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt); 439 } 440 441 static int 442 _gen_uuid(warc_uuid_t *tgt) 443 { 444 archive_random(tgt->u, sizeof(tgt->u)); 445 /* obey uuid version 4 rules */ 446 tgt->u[1U] &= 0xffff0fffU; 447 tgt->u[1U] |= 0x4000U; 448 tgt->u[2U] &= 0x3fffffffU; 449 tgt->u[2U] |= 0x80000000U; 450 return 0; 451 } 452 453 /* archive_write_set_format_warc.c ends here */ 454