1 /*- 2 * Copyright (c) 2014 Sebastian Freundt 3 * Author: Sebastian Freundt <devel@fresse.org> 4 * 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "archive_platform.h" 29 __FBSDID("$FreeBSD$"); 30 31 #ifdef HAVE_ERRNO_H 32 #include <errno.h> 33 #endif 34 #include <stdio.h> 35 #ifdef HAVE_STDLIB_H 36 #include <stdlib.h> 37 #endif 38 #ifdef HAVE_STRING_H 39 #include <string.h> 40 #endif 41 #ifdef HAVE_TIME_H 42 #include <time.h> 43 #endif 44 45 #include "archive.h" 46 #include "archive_entry.h" 47 #include "archive_entry_locale.h" 48 #include "archive_private.h" 49 #include "archive_random_private.h" 50 #include "archive_write_private.h" 51 52 struct warc_s { 53 unsigned int omit_warcinfo:1; 54 55 time_t now; 56 mode_t typ; 57 unsigned int rng; 58 /* populated size */ 59 uint64_t populz; 60 }; 61 62 static const char warcinfo[] = 63 "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n" 64 "format: WARC file version 1.0\r\n"; 65 66 typedef enum { 67 WT_NONE, 68 /* warcinfo */ 69 WT_INFO, 70 /* metadata */ 71 WT_META, 72 /* resource */ 73 WT_RSRC, 74 /* request, unsupported */ 75 WT_REQ, 76 /* response, unsupported */ 77 WT_RSP, 78 /* revisit, unsupported */ 79 WT_RVIS, 80 /* conversion, unsupported */ 81 WT_CONV, 82 /* continuation, unsupported at the moment */ 83 WT_CONT, 84 /* invalid type */ 85 LAST_WT 86 } warc_type_t; 87 88 typedef struct { 89 warc_type_t type; 90 const char *tgturi; 91 const char *recid; 92 time_t rtime; 93 time_t mtime; 94 const char *cnttyp; 95 uint64_t cntlen; 96 } warc_essential_hdr_t; 97 98 typedef struct { 99 unsigned int u[4U]; 100 } warc_uuid_t; 101 102 static int _warc_options(struct archive_write*, const char *key, const char *v); 103 static int _warc_header(struct archive_write *a, struct archive_entry *entry); 104 static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz); 105 static int _warc_finish_entry(struct archive_write *a); 106 static int _warc_close(struct archive_write *a); 107 static int _warc_free(struct archive_write *a); 108 109 /* private routines */ 110 static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t); 111 static int _gen_uuid(warc_uuid_t *tgt); 112 113 114 /* 115 * Set output format to ISO 28500 (aka WARC) format. 116 */ 117 int 118 archive_write_set_format_warc(struct archive *_a) 119 { 120 struct archive_write *a = (struct archive_write *)_a; 121 struct warc_s *w; 122 123 archive_check_magic(_a, ARCHIVE_WRITE_MAGIC, 124 ARCHIVE_STATE_NEW, "archive_write_set_format_warc"); 125 126 /* If another format was already registered, unregister it. */ 127 if (a->format_free != NULL) { 128 (a->format_free)(a); 129 } 130 131 w = malloc(sizeof(*w)); 132 if (w == NULL) { 133 archive_set_error(&a->archive, ENOMEM, 134 "Can't allocate warc data"); 135 return (ARCHIVE_FATAL); 136 } 137 /* by default we're emitting a file wide header */ 138 w->omit_warcinfo = 0U; 139 /* obtain current time for date fields */ 140 w->now = time(NULL); 141 /* reset file type info */ 142 w->typ = 0; 143 /* also initialise our rng */ 144 w->rng = (unsigned int)w->now; 145 146 a->format_data = w; 147 a->format_name = "WARC/1.0"; 148 a->format_options = _warc_options; 149 a->format_write_header = _warc_header; 150 a->format_write_data = _warc_data; 151 a->format_close = _warc_close; 152 a->format_free = _warc_free; 153 a->format_finish_entry = _warc_finish_entry; 154 a->archive.archive_format = ARCHIVE_FORMAT_WARC; 155 a->archive.archive_format_name = "WARC/1.0"; 156 return (ARCHIVE_OK); 157 } 158 159 160 /* archive methods */ 161 static int 162 _warc_options(struct archive_write *a, const char *key, const char *val) 163 { 164 struct warc_s *w = a->format_data; 165 166 if (strcmp(key, "omit-warcinfo") == 0) { 167 if (val == NULL || strcmp(val, "true") == 0) { 168 /* great */ 169 w->omit_warcinfo = 1U; 170 return (ARCHIVE_OK); 171 } 172 } 173 174 /* Note: The "warn" return is just to inform the options 175 * supervisor that we didn't handle it. It will generate 176 * a suitable error if no one used this option. */ 177 return (ARCHIVE_WARN); 178 } 179 180 static int 181 _warc_header(struct archive_write *a, struct archive_entry *entry) 182 { 183 struct warc_s *w = a->format_data; 184 struct archive_string hdr; 185 #define MAX_HDR_SIZE 512 186 187 /* check whether warcinfo record needs outputting */ 188 if (!w->omit_warcinfo) { 189 ssize_t r; 190 warc_essential_hdr_t wi = { 191 WT_INFO, 192 /*uri*/NULL, 193 /*urn*/NULL, 194 /*rtm*/0, 195 /*mtm*/0, 196 /*cty*/"application/warc-fields", 197 /*len*/sizeof(warcinfo) - 1U, 198 }; 199 wi.rtime = w->now; 200 wi.mtime = w->now; 201 202 archive_string_init(&hdr); 203 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi); 204 if (r >= 0) { 205 /* jackpot! */ 206 /* now also use HDR buffer for the actual warcinfo */ 207 archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1); 208 209 /* append end-of-record indicator */ 210 archive_strncat(&hdr, "\r\n\r\n", 4); 211 212 /* write to output stream */ 213 __archive_write_output(a, hdr.s, archive_strlen(&hdr)); 214 } 215 /* indicate we're done with file header writing */ 216 w->omit_warcinfo = 1U; 217 archive_string_free(&hdr); 218 } 219 220 if (archive_entry_pathname(entry) == NULL) { 221 archive_set_error(&a->archive, EINVAL, 222 "Invalid filename"); 223 return (ARCHIVE_WARN); 224 } 225 226 w->typ = archive_entry_filetype(entry); 227 w->populz = 0U; 228 if (w->typ == AE_IFREG) { 229 warc_essential_hdr_t rh = { 230 WT_RSRC, 231 /*uri*/NULL, 232 /*urn*/NULL, 233 /*rtm*/0, 234 /*mtm*/0, 235 /*cty*/NULL, 236 /*len*/0, 237 }; 238 ssize_t r; 239 rh.tgturi = archive_entry_pathname(entry); 240 rh.rtime = w->now; 241 rh.mtime = archive_entry_mtime(entry); 242 rh.cntlen = (size_t)archive_entry_size(entry); 243 244 archive_string_init(&hdr); 245 r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh); 246 if (r < 0) { 247 /* don't bother */ 248 archive_set_error( 249 &a->archive, 250 ARCHIVE_ERRNO_FILE_FORMAT, 251 "cannot archive file"); 252 return (ARCHIVE_WARN); 253 } 254 /* otherwise append to output stream */ 255 __archive_write_output(a, hdr.s, r); 256 /* and let subsequent calls to _data() know about the size */ 257 w->populz = rh.cntlen; 258 archive_string_free(&hdr); 259 return (ARCHIVE_OK); 260 } 261 /* just resort to erroring as per Tim's advice */ 262 archive_set_error( 263 &a->archive, 264 ARCHIVE_ERRNO_FILE_FORMAT, 265 "WARC can only process regular files"); 266 return (ARCHIVE_FAILED); 267 } 268 269 static ssize_t 270 _warc_data(struct archive_write *a, const void *buf, size_t len) 271 { 272 struct warc_s *w = a->format_data; 273 274 if (w->typ == AE_IFREG) { 275 int rc; 276 277 /* never write more bytes than announced */ 278 if (len > w->populz) { 279 len = (size_t)w->populz; 280 } 281 282 /* now then, out we put the whole shebang */ 283 rc = __archive_write_output(a, buf, len); 284 if (rc != ARCHIVE_OK) { 285 return rc; 286 } 287 } 288 return len; 289 } 290 291 static int 292 _warc_finish_entry(struct archive_write *a) 293 { 294 static const char _eor[] = "\r\n\r\n"; 295 struct warc_s *w = a->format_data; 296 297 if (w->typ == AE_IFREG) { 298 int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U); 299 300 if (rc != ARCHIVE_OK) { 301 return rc; 302 } 303 } 304 /* reset type info */ 305 w->typ = 0; 306 return (ARCHIVE_OK); 307 } 308 309 static int 310 _warc_close(struct archive_write *a) 311 { 312 (void)a; /* UNUSED */ 313 return (ARCHIVE_OK); 314 } 315 316 static int 317 _warc_free(struct archive_write *a) 318 { 319 struct warc_s *w = a->format_data; 320 321 free(w); 322 a->format_data = NULL; 323 return (ARCHIVE_OK); 324 } 325 326 327 /* private routines */ 328 static void 329 xstrftime(struct archive_string *as, const char *fmt, time_t t) 330 { 331 /** like strftime(3) but for time_t objects */ 332 struct tm *rt; 333 #if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S) 334 struct tm timeHere; 335 #endif 336 char strtime[100]; 337 size_t len; 338 339 #ifdef HAVE_GMTIME_R 340 if ((rt = gmtime_r(&t, &timeHere)) == NULL) 341 return; 342 #elif defined(HAVE__GMTIME64_S) 343 _gmtime64_s(&timeHere, &t); 344 #else 345 if ((rt = gmtime(&t)) == NULL) 346 return; 347 #endif 348 /* leave the hard yacker to our role model strftime() */ 349 len = strftime(strtime, sizeof(strtime)-1, fmt, rt); 350 archive_strncat(as, strtime, len); 351 } 352 353 static ssize_t 354 _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr) 355 { 356 static const char _ver[] = "WARC/1.0\r\n"; 357 static const char * const _typ[LAST_WT] = { 358 NULL, "warcinfo", "metadata", "resource", NULL 359 }; 360 char std_uuid[48U]; 361 362 if (hdr.type == WT_NONE || hdr.type > WT_RSRC) { 363 /* brilliant, how exactly did we get here? */ 364 return -1; 365 } 366 367 archive_strcpy(tgt, _ver); 368 369 archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]); 370 371 if (hdr.tgturi != NULL) { 372 /* check if there's a xyz:// */ 373 static const char _uri[] = ""; 374 static const char _fil[] = "file://"; 375 const char *u; 376 char *chk = strchr(hdr.tgturi, ':'); 377 378 if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') { 379 /* yep, it's definitely a URI */ 380 u = _uri; 381 } else { 382 /* hm, best to prepend file:// then */ 383 u = _fil; 384 } 385 archive_string_sprintf(tgt, 386 "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi); 387 } 388 389 /* record time is usually when the http is sent off, 390 * just treat the archive writing as such for a moment */ 391 xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime); 392 393 /* while we're at it, record the mtime */ 394 xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime); 395 396 if (hdr.recid == NULL) { 397 /* generate one, grrrr */ 398 warc_uuid_t u; 399 400 _gen_uuid(&u); 401 /* Unfortunately, archive_string_sprintf does not 402 * handle the minimum number following '%'. 403 * So we have to use snprintf function here instead 404 * of archive_string_snprintf function. */ 405 #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900) 406 #define snprintf _snprintf 407 #endif 408 snprintf( 409 std_uuid, sizeof(std_uuid), 410 "<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>", 411 u.u[0U], 412 u.u[1U] >> 16U, u.u[1U] & 0xffffU, 413 u.u[2U] >> 16U, u.u[2U] & 0xffffU, 414 u.u[3U]); 415 hdr.recid = std_uuid; 416 } 417 418 /* record-id is mandatory, fingers crossed we won't fail */ 419 archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid); 420 421 if (hdr.cnttyp != NULL) { 422 archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp); 423 } 424 425 /* next one is mandatory */ 426 archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen); 427 /**/ 428 archive_strncat(tgt, "\r\n", 2); 429 430 return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt); 431 } 432 433 static int 434 _gen_uuid(warc_uuid_t *tgt) 435 { 436 archive_random(tgt->u, sizeof(tgt->u)); 437 /* obey uuid version 4 rules */ 438 tgt->u[1U] &= 0xffff0fffU; 439 tgt->u[1U] |= 0x4000U; 440 tgt->u[2U] &= 0x3fffffffU; 441 tgt->u[2U] |= 0x80000000U; 442 return 0; 443 } 444 445 /* archive_write_set_format_warc.c ends here */ 446