1 /*- 2 * Copyright (c) 2014 Sebastian Freundt 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "archive_platform.h" 27 __FBSDID("$FreeBSD$"); 28 29 /** 30 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as 31 * ISO 28500:2009. 32 * For the purposes of this file we used the final draft from: 33 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf 34 * 35 * Todo: 36 * [ ] real-world warcs can contain resources at endpoints ending in / 37 * e.g. http://bibnum.bnf.fr/warc/ 38 * if you're lucky their response contains a Content-Location: header 39 * pointing to a unix-compliant filename, in the example above it's 40 * Content-Location: http://bibnum.bnf.fr/warc/index.html 41 * however, that's not mandated and github for example doesn't follow 42 * this convention. 43 * We need a set of archive options to control what to do with 44 * entries like these, at the moment care is taken to skip them. 45 * 46 **/ 47 48 #ifdef HAVE_SYS_STAT_H 49 #include <sys/stat.h> 50 #endif 51 #ifdef HAVE_ERRNO_H 52 #include <errno.h> 53 #endif 54 #ifdef HAVE_STDLIB_H 55 #include <stdlib.h> 56 #endif 57 #ifdef HAVE_STRING_H 58 #include <string.h> 59 #endif 60 #ifdef HAVE_LIMITS_H 61 #include <limits.h> 62 #endif 63 #ifdef HAVE_CTYPE_H 64 #include <ctype.h> 65 #endif 66 #ifdef HAVE_TIME_H 67 #include <time.h> 68 #endif 69 70 #include "archive.h" 71 #include "archive_entry.h" 72 #include "archive_private.h" 73 #include "archive_read_private.h" 74 75 typedef enum { 76 WT_NONE, 77 /* warcinfo */ 78 WT_INFO, 79 /* metadata */ 80 WT_META, 81 /* resource */ 82 WT_RSRC, 83 /* request, unsupported */ 84 WT_REQ, 85 /* response, unsupported */ 86 WT_RSP, 87 /* revisit, unsupported */ 88 WT_RVIS, 89 /* conversion, unsupported */ 90 WT_CONV, 91 /* continuation, unsupported at the moment */ 92 WT_CONT, 93 /* invalid type */ 94 LAST_WT 95 } warc_type_t; 96 97 typedef struct { 98 size_t len; 99 const char *str; 100 } warc_string_t; 101 102 typedef struct { 103 size_t len; 104 char *str; 105 } warc_strbuf_t; 106 107 struct warc_s { 108 /* content length ahead */ 109 size_t cntlen; 110 /* and how much we've processed so far */ 111 size_t cntoff; 112 /* and how much we need to consume between calls */ 113 size_t unconsumed; 114 115 /* string pool */ 116 warc_strbuf_t pool; 117 /* previous version */ 118 unsigned int pver; 119 /* stringified format name */ 120 struct archive_string sver; 121 }; 122 123 static int _warc_bid(struct archive_read *a, int); 124 static int _warc_cleanup(struct archive_read *a); 125 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); 126 static int _warc_skip(struct archive_read *a); 127 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); 128 129 /* private routines */ 130 static unsigned int _warc_rdver(const char *buf, size_t bsz); 131 static unsigned int _warc_rdtyp(const char *buf, size_t bsz); 132 static warc_string_t _warc_rduri(const char *buf, size_t bsz); 133 static ssize_t _warc_rdlen(const char *buf, size_t bsz); 134 static time_t _warc_rdrtm(const char *buf, size_t bsz); 135 static time_t _warc_rdmtm(const char *buf, size_t bsz); 136 static const char *_warc_find_eoh(const char *buf, size_t bsz); 137 static const char *_warc_find_eol(const char *buf, size_t bsz); 138 139 int 140 archive_read_support_format_warc(struct archive *_a) 141 { 142 struct archive_read *a = (struct archive_read *)_a; 143 struct warc_s *w; 144 int r; 145 146 archive_check_magic(_a, ARCHIVE_READ_MAGIC, 147 ARCHIVE_STATE_NEW, "archive_read_support_format_warc"); 148 149 if ((w = calloc(1, sizeof(*w))) == NULL) { 150 archive_set_error(&a->archive, ENOMEM, 151 "Can't allocate warc data"); 152 return (ARCHIVE_FATAL); 153 } 154 155 r = __archive_read_register_format( 156 a, w, "warc", 157 _warc_bid, NULL, _warc_rdhdr, _warc_read, 158 _warc_skip, NULL, _warc_cleanup, NULL, NULL); 159 160 if (r != ARCHIVE_OK) { 161 free(w); 162 return (r); 163 } 164 return (ARCHIVE_OK); 165 } 166 167 static int 168 _warc_cleanup(struct archive_read *a) 169 { 170 struct warc_s *w = a->format->data; 171 172 if (w->pool.len > 0U) { 173 free(w->pool.str); 174 } 175 archive_string_free(&w->sver); 176 free(w); 177 a->format->data = NULL; 178 return (ARCHIVE_OK); 179 } 180 181 static int 182 _warc_bid(struct archive_read *a, int best_bid) 183 { 184 const char *hdr; 185 ssize_t nrd; 186 unsigned int ver; 187 188 (void)best_bid; /* UNUSED */ 189 190 /* check first line of file, it should be a record already */ 191 if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) { 192 /* no idea what to do */ 193 return -1; 194 } else if (nrd < 12) { 195 /* nah, not for us, our magic cookie is at least 12 bytes */ 196 return -1; 197 } 198 199 /* otherwise snarf the record's version number */ 200 ver = _warc_rdver(hdr, nrd); 201 if (ver < 1200U || ver > 10000U) { 202 /* we only support WARC 0.12 to 1.0 */ 203 return -1; 204 } 205 206 /* otherwise be confident */ 207 return (64); 208 } 209 210 static int 211 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry) 212 { 213 #define HDR_PROBE_LEN (12U) 214 struct warc_s *w = a->format->data; 215 unsigned int ver; 216 const char *buf; 217 ssize_t nrd; 218 const char *eoh; 219 /* for the file name, saves some strndup()'ing */ 220 warc_string_t fnam; 221 /* warc record type, not that we really use it a lot */ 222 warc_type_t ftyp; 223 /* content-length+error monad */ 224 ssize_t cntlen; 225 /* record time is the WARC-Date time we reinterpret it as ctime */ 226 time_t rtime; 227 /* mtime is the Last-Modified time which will be the entry's mtime */ 228 time_t mtime; 229 230 start_over: 231 /* just use read_ahead() they keep track of unconsumed 232 * bits and bobs for us; no need to put an extra shift in 233 * and reproduce that functionality here */ 234 buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd); 235 236 if (nrd < 0) { 237 /* no good */ 238 archive_set_error( 239 &a->archive, ARCHIVE_ERRNO_MISC, 240 "Bad record header"); 241 return (ARCHIVE_FATAL); 242 } else if (buf == NULL) { 243 /* there should be room for at least WARC/bla\r\n 244 * must be EOF therefore */ 245 return (ARCHIVE_EOF); 246 } 247 /* looks good so far, try and find the end of the header now */ 248 eoh = _warc_find_eoh(buf, nrd); 249 if (eoh == NULL) { 250 /* still no good, the header end might be beyond the 251 * probe we've requested, but then again who'd cram 252 * so much stuff into the header *and* be 28500-compliant */ 253 archive_set_error( 254 &a->archive, ARCHIVE_ERRNO_MISC, 255 "Bad record header"); 256 return (ARCHIVE_FATAL); 257 } 258 ver = _warc_rdver(buf, eoh - buf); 259 /* we currently support WARC 0.12 to 1.0 */ 260 if (ver == 0U) { 261 archive_set_error( 262 &a->archive, ARCHIVE_ERRNO_MISC, 263 "Invalid record version"); 264 return (ARCHIVE_FATAL); 265 } else if (ver < 1200U || ver > 10000U) { 266 archive_set_error( 267 &a->archive, ARCHIVE_ERRNO_MISC, 268 "Unsupported record version: %u.%u", 269 ver / 10000, (ver % 10000) / 100); 270 return (ARCHIVE_FATAL); 271 } 272 cntlen = _warc_rdlen(buf, eoh - buf); 273 if (cntlen < 0) { 274 /* nightmare! the specs say content-length is mandatory 275 * so I don't feel overly bad stopping the reader here */ 276 archive_set_error( 277 &a->archive, EINVAL, 278 "Bad content length"); 279 return (ARCHIVE_FATAL); 280 } 281 rtime = _warc_rdrtm(buf, eoh - buf); 282 if (rtime == (time_t)-1) { 283 /* record time is mandatory as per WARC/1.0, 284 * so just barf here, fast and loud */ 285 archive_set_error( 286 &a->archive, EINVAL, 287 "Bad record time"); 288 return (ARCHIVE_FATAL); 289 } 290 291 /* let the world know we're a WARC archive */ 292 a->archive.archive_format = ARCHIVE_FORMAT_WARC; 293 if (ver != w->pver) { 294 /* stringify this entry's version */ 295 archive_string_sprintf(&w->sver, 296 "WARC/%u.%u", ver / 10000, (ver % 10000) / 100); 297 /* remember the version */ 298 w->pver = ver; 299 } 300 /* start off with the type */ 301 ftyp = _warc_rdtyp(buf, eoh - buf); 302 /* and let future calls know about the content */ 303 w->cntlen = cntlen; 304 w->cntoff = 0U; 305 mtime = 0;/* Avoid compiling error on some platform. */ 306 307 switch (ftyp) { 308 case WT_RSRC: 309 case WT_RSP: 310 /* only try and read the filename in the cases that are 311 * guaranteed to have one */ 312 fnam = _warc_rduri(buf, eoh - buf); 313 /* check the last character in the URI to avoid creating 314 * directory endpoints as files, see Todo above */ 315 if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') { 316 /* break here for now */ 317 fnam.len = 0U; 318 fnam.str = NULL; 319 break; 320 } 321 /* bang to our string pool, so we save a 322 * malloc()+free() roundtrip */ 323 if (fnam.len + 1U > w->pool.len) { 324 w->pool.len = ((fnam.len + 64U) / 64U) * 64U; 325 w->pool.str = realloc(w->pool.str, w->pool.len); 326 } 327 memcpy(w->pool.str, fnam.str, fnam.len); 328 w->pool.str[fnam.len] = '\0'; 329 /* let no one else know about the pool, it's a secret, shhh */ 330 fnam.str = w->pool.str; 331 332 /* snarf mtime or deduce from rtime 333 * this is a custom header added by our writer, it's quite 334 * hard to believe anyone else would go through with it 335 * (apart from being part of some http responses of course) */ 336 if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { 337 mtime = rtime; 338 } 339 break; 340 case WT_NONE: 341 case WT_INFO: 342 case WT_META: 343 case WT_REQ: 344 case WT_RVIS: 345 case WT_CONV: 346 case WT_CONT: 347 case LAST_WT: 348 default: 349 fnam.len = 0U; 350 fnam.str = NULL; 351 break; 352 } 353 354 /* now eat some of those delicious buffer bits */ 355 __archive_read_consume(a, eoh - buf); 356 357 switch (ftyp) { 358 case WT_RSRC: 359 case WT_RSP: 360 if (fnam.len > 0U) { 361 /* populate entry object */ 362 archive_entry_set_filetype(entry, AE_IFREG); 363 archive_entry_copy_pathname(entry, fnam.str); 364 archive_entry_set_size(entry, cntlen); 365 archive_entry_set_perm(entry, 0644); 366 /* rtime is the new ctime, mtime stays mtime */ 367 archive_entry_set_ctime(entry, rtime, 0L); 368 archive_entry_set_mtime(entry, mtime, 0L); 369 break; 370 } 371 /* FALLTHROUGH */ 372 case WT_NONE: 373 case WT_INFO: 374 case WT_META: 375 case WT_REQ: 376 case WT_RVIS: 377 case WT_CONV: 378 case WT_CONT: 379 case LAST_WT: 380 default: 381 /* consume the content and start over */ 382 _warc_skip(a); 383 goto start_over; 384 } 385 return (ARCHIVE_OK); 386 } 387 388 static int 389 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off) 390 { 391 struct warc_s *w = a->format->data; 392 const char *rab; 393 ssize_t nrd; 394 395 if (w->cntoff >= w->cntlen) { 396 eof: 397 /* it's our lucky day, no work, we can leave early */ 398 *buf = NULL; 399 *bsz = 0U; 400 *off = w->cntoff + 4U/*for \r\n\r\n separator*/; 401 w->unconsumed = 0U; 402 return (ARCHIVE_EOF); 403 } 404 405 if (w->unconsumed) { 406 __archive_read_consume(a, w->unconsumed); 407 w->unconsumed = 0U; 408 } 409 410 rab = __archive_read_ahead(a, 1U, &nrd); 411 if (nrd < 0) { 412 *bsz = 0U; 413 /* big catastrophe */ 414 return (int)nrd; 415 } else if (nrd == 0) { 416 goto eof; 417 } else if ((size_t)nrd > w->cntlen - w->cntoff) { 418 /* clamp to content-length */ 419 nrd = w->cntlen - w->cntoff; 420 } 421 *off = w->cntoff; 422 *bsz = nrd; 423 *buf = rab; 424 425 w->cntoff += nrd; 426 w->unconsumed = (size_t)nrd; 427 return (ARCHIVE_OK); 428 } 429 430 static int 431 _warc_skip(struct archive_read *a) 432 { 433 struct warc_s *w = a->format->data; 434 435 __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/); 436 w->cntlen = 0U; 437 w->cntoff = 0U; 438 return (ARCHIVE_OK); 439 } 440 441 442 /* private routines */ 443 static void* 444 deconst(const void *c) 445 { 446 return (void *)(uintptr_t)c; 447 } 448 449 static char* 450 xmemmem(const char *hay, const size_t haysize, 451 const char *needle, const size_t needlesize) 452 { 453 const char *const eoh = hay + haysize; 454 const char *const eon = needle + needlesize; 455 const char *hp; 456 const char *np; 457 const char *cand; 458 unsigned int hsum; 459 unsigned int nsum; 460 unsigned int eqp; 461 462 /* trivial checks first 463 * a 0-sized needle is defined to be found anywhere in haystack 464 * then run strchr() to find a candidate in HAYSTACK (i.e. a portion 465 * that happens to begin with *NEEDLE) */ 466 if (needlesize == 0UL) { 467 return deconst(hay); 468 } else if ((hay = memchr(hay, *needle, haysize)) == NULL) { 469 /* trivial */ 470 return NULL; 471 } 472 473 /* First characters of haystack and needle are the same now. Both are 474 * guaranteed to be at least one character long. Now computes the sum 475 * of characters values of needle together with the sum of the first 476 * needle_len characters of haystack. */ 477 for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U; 478 hp < eoh && np < eon; 479 hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++); 480 481 /* HP now references the (NEEDLESIZE + 1)-th character. */ 482 if (np < eon) { 483 /* haystack is smaller than needle, :O */ 484 return NULL; 485 } else if (eqp) { 486 /* found a match */ 487 return deconst(hay); 488 } 489 490 /* now loop through the rest of haystack, 491 * updating the sum iteratively */ 492 for (cand = hay; hp < eoh; hp++) { 493 hsum ^= *cand++; 494 hsum ^= *hp; 495 496 /* Since the sum of the characters is already known to be 497 * equal at that point, it is enough to check just NEEDLESIZE - 1 498 * characters for equality, 499 * also CAND is by design < HP, so no need for range checks */ 500 if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) { 501 return deconst(cand); 502 } 503 } 504 return NULL; 505 } 506 507 static int 508 strtoi_lim(const char *str, const char **ep, int llim, int ulim) 509 { 510 int res = 0; 511 const char *sp; 512 /* we keep track of the number of digits via rulim */ 513 int rulim; 514 515 for (sp = str, rulim = ulim > 10 ? ulim : 10; 516 res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; 517 sp++, rulim /= 10) { 518 res *= 10; 519 res += *sp - '0'; 520 } 521 if (sp == str) { 522 res = -1; 523 } else if (res < llim || res > ulim) { 524 res = -2; 525 } 526 *ep = (const char*)sp; 527 return res; 528 } 529 530 static time_t 531 time_from_tm(struct tm *t) 532 { 533 #if HAVE_TIMEGM 534 /* Use platform timegm() if available. */ 535 return (timegm(t)); 536 #elif HAVE__MKGMTIME64 537 return (_mkgmtime64(t)); 538 #else 539 /* Else use direct calculation using POSIX assumptions. */ 540 /* First, fix up tm_yday based on the year/month/day. */ 541 if (mktime(t) == (time_t)-1) 542 return ((time_t)-1); 543 /* Then we can compute timegm() from first principles. */ 544 return (t->tm_sec 545 + t->tm_min * 60 546 + t->tm_hour * 3600 547 + t->tm_yday * 86400 548 + (t->tm_year - 70) * 31536000 549 + ((t->tm_year - 69) / 4) * 86400 550 - ((t->tm_year - 1) / 100) * 86400 551 + ((t->tm_year + 299) / 400) * 86400); 552 #endif 553 } 554 555 static time_t 556 xstrpisotime(const char *s, char **endptr) 557 { 558 /** like strptime() but strictly for ISO 8601 Zulu strings */ 559 struct tm tm; 560 time_t res = (time_t)-1; 561 562 /* make sure tm is clean */ 563 memset(&tm, 0, sizeof(tm)); 564 565 /* as a courtesy to our callers, and since this is a non-standard 566 * routine, we skip leading whitespace */ 567 while (*s == ' ' || *s == '\t') 568 ++s; 569 570 /* read year */ 571 if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { 572 goto out; 573 } 574 /* read month */ 575 if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { 576 goto out; 577 } 578 /* read day-of-month */ 579 if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { 580 goto out; 581 } 582 /* read hour */ 583 if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { 584 goto out; 585 } 586 /* read minute */ 587 if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { 588 goto out; 589 } 590 /* read second */ 591 if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { 592 goto out; 593 } 594 595 /* massage TM to fulfill some of POSIX' constraints */ 596 tm.tm_year -= 1900; 597 tm.tm_mon--; 598 599 /* now convert our custom tm struct to a unix stamp using UTC */ 600 res = time_from_tm(&tm); 601 602 out: 603 if (endptr != NULL) { 604 *endptr = deconst(s); 605 } 606 return res; 607 } 608 609 static unsigned int 610 _warc_rdver(const char *buf, size_t bsz) 611 { 612 static const char magic[] = "WARC/"; 613 const char *c; 614 unsigned int ver = 0U; 615 unsigned int end = 0U; 616 617 if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) { 618 /* buffer too small or invalid magic */ 619 return ver; 620 } 621 /* looks good so far, read the version number for a laugh */ 622 buf += sizeof(magic) - 1U; 623 624 if (isdigit((unsigned char)buf[0U]) && (buf[1U] == '.') && 625 isdigit((unsigned char)buf[2U])) { 626 /* we support a maximum of 2 digits in the minor version */ 627 if (isdigit((unsigned char)buf[3U])) 628 end = 1U; 629 /* set up major version */ 630 ver = (buf[0U] - '0') * 10000U; 631 /* set up minor version */ 632 if (end == 1U) { 633 ver += (buf[2U] - '0') * 1000U; 634 ver += (buf[3U] - '0') * 100U; 635 } else 636 ver += (buf[2U] - '0') * 100U; 637 /* 638 * WARC below version 0.12 has a space-separated header 639 * WARC 0.12 and above terminates the version with a CRLF 640 */ 641 c = buf + 3U + end; 642 if (ver >= 1200U) { 643 if (memcmp(c, "\r\n", 2U) != 0) 644 ver = 0U; 645 } else { 646 /* ver < 1200U */ 647 if (*c != ' ' && *c != '\t') 648 ver = 0U; 649 } 650 } 651 return ver; 652 } 653 654 static unsigned int 655 _warc_rdtyp(const char *buf, size_t bsz) 656 { 657 static const char _key[] = "\r\nWARC-Type:"; 658 const char *val, *eol; 659 660 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 661 /* no bother */ 662 return WT_NONE; 663 } 664 val += sizeof(_key) - 1U; 665 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 666 /* no end of line */ 667 return WT_NONE; 668 } 669 670 /* overread whitespace */ 671 while (val < eol && (*val == ' ' || *val == '\t')) 672 ++val; 673 674 if (val + 8U == eol) { 675 if (memcmp(val, "resource", 8U) == 0) 676 return WT_RSRC; 677 else if (memcmp(val, "response", 8U) == 0) 678 return WT_RSP; 679 } 680 return WT_NONE; 681 } 682 683 static warc_string_t 684 _warc_rduri(const char *buf, size_t bsz) 685 { 686 static const char _key[] = "\r\nWARC-Target-URI:"; 687 const char *val, *uri, *eol, *p; 688 warc_string_t res = {0U, NULL}; 689 690 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 691 /* no bother */ 692 return res; 693 } 694 /* overread whitespace */ 695 val += sizeof(_key) - 1U; 696 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 697 /* no end of line */ 698 return res; 699 } 700 701 while (val < eol && (*val == ' ' || *val == '\t')) 702 ++val; 703 704 /* overread URL designators */ 705 if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) { 706 /* not touching that! */ 707 return res; 708 } 709 710 /* spaces inside uri are not allowed, CRLF should follow */ 711 for (p = val; p < eol; p++) { 712 if (isspace((unsigned char)*p)) 713 return res; 714 } 715 716 /* there must be at least space for ftp */ 717 if (uri < (val + 3U)) 718 return res; 719 720 /* move uri to point to after :// */ 721 uri += 3U; 722 723 /* now then, inspect the URI */ 724 if (memcmp(val, "file", 4U) == 0) { 725 /* perfect, nothing left to do here */ 726 727 } else if (memcmp(val, "http", 4U) == 0 || 728 memcmp(val, "ftp", 3U) == 0) { 729 /* overread domain, and the first / */ 730 while (uri < eol && *uri++ != '/'); 731 } else { 732 /* not sure what to do? best to bugger off */ 733 return res; 734 } 735 res.str = uri; 736 res.len = eol - uri; 737 return res; 738 } 739 740 static ssize_t 741 _warc_rdlen(const char *buf, size_t bsz) 742 { 743 static const char _key[] = "\r\nContent-Length:"; 744 const char *val, *eol; 745 char *on = NULL; 746 long int len; 747 748 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 749 /* no bother */ 750 return -1; 751 } 752 val += sizeof(_key) - 1U; 753 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 754 /* no end of line */ 755 return -1; 756 } 757 758 /* skip leading whitespace */ 759 while (val < eol && (*val == ' ' || *val == '\t')) 760 val++; 761 /* there must be at least one digit */ 762 if (!isdigit((unsigned char)*val)) 763 return -1; 764 errno = 0; 765 len = strtol(val, &on, 10); 766 if (errno != 0 || on != eol) { 767 /* line must end here */ 768 return -1; 769 } 770 771 return (size_t)len; 772 } 773 774 static time_t 775 _warc_rdrtm(const char *buf, size_t bsz) 776 { 777 static const char _key[] = "\r\nWARC-Date:"; 778 const char *val, *eol; 779 char *on = NULL; 780 time_t res; 781 782 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 783 /* no bother */ 784 return (time_t)-1; 785 } 786 val += sizeof(_key) - 1U; 787 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 788 /* no end of line */ 789 return -1; 790 } 791 792 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 793 res = xstrpisotime(val, &on); 794 if (on != eol) { 795 /* line must end here */ 796 return -1; 797 } 798 return res; 799 } 800 801 static time_t 802 _warc_rdmtm(const char *buf, size_t bsz) 803 { 804 static const char _key[] = "\r\nLast-Modified:"; 805 const char *val, *eol; 806 char *on = NULL; 807 time_t res; 808 809 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 810 /* no bother */ 811 return (time_t)-1; 812 } 813 val += sizeof(_key) - 1U; 814 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 815 /* no end of line */ 816 return -1; 817 } 818 819 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 820 res = xstrpisotime(val, &on); 821 if (on != eol) { 822 /* line must end here */ 823 return -1; 824 } 825 return res; 826 } 827 828 static const char* 829 _warc_find_eoh(const char *buf, size_t bsz) 830 { 831 static const char _marker[] = "\r\n\r\n"; 832 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 833 834 if (hit != NULL) { 835 hit += sizeof(_marker) - 1U; 836 } 837 return hit; 838 } 839 840 static const char* 841 _warc_find_eol(const char *buf, size_t bsz) 842 { 843 static const char _marker[] = "\r\n"; 844 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 845 846 return hit; 847 } 848 /* archive_read_support_format_warc.c ends here */ 849