1 /*- 2 * Copyright (c) 2014 Sebastian Freundt 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "archive_platform.h" 27 __FBSDID("$FreeBSD$"); 28 29 /** 30 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as 31 * ISO 28500:2009. 32 * For the purposes of this file we used the final draft from: 33 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf 34 * 35 * Todo: 36 * [ ] real-world warcs can contain resources at endpoints ending in / 37 * e.g. http://bibnum.bnf.fr/warc/ 38 * if you're lucky their response contains a Content-Location: header 39 * pointing to a unix-compliant filename, in the example above it's 40 * Content-Location: http://bibnum.bnf.fr/warc/index.html 41 * however, that's not mandated and github for example doesn't follow 42 * this convention. 43 * We need a set of archive options to control what to do with 44 * entries like these, at the moment care is taken to skip them. 45 * 46 **/ 47 48 #ifdef HAVE_SYS_STAT_H 49 #include <sys/stat.h> 50 #endif 51 #ifdef HAVE_ERRNO_H 52 #include <errno.h> 53 #endif 54 #ifdef HAVE_STDLIB_H 55 #include <stdlib.h> 56 #endif 57 #ifdef HAVE_STRING_H 58 #include <string.h> 59 #endif 60 #ifdef HAVE_LIMITS_H 61 #include <limits.h> 62 #endif 63 #ifdef HAVE_CTYPE_H 64 #include <ctype.h> 65 #endif 66 #ifdef HAVE_TIME_H 67 #include <time.h> 68 #endif 69 70 #include "archive.h" 71 #include "archive_entry.h" 72 #include "archive_private.h" 73 #include "archive_read_private.h" 74 75 typedef enum { 76 WT_NONE, 77 /* warcinfo */ 78 WT_INFO, 79 /* metadata */ 80 WT_META, 81 /* resource */ 82 WT_RSRC, 83 /* request, unsupported */ 84 WT_REQ, 85 /* response, unsupported */ 86 WT_RSP, 87 /* revisit, unsupported */ 88 WT_RVIS, 89 /* conversion, unsupported */ 90 WT_CONV, 91 /* continutation, unsupported at the moment */ 92 WT_CONT, 93 /* invalid type */ 94 LAST_WT 95 } warc_type_t; 96 97 typedef struct { 98 size_t len; 99 const char *str; 100 } warc_string_t; 101 102 typedef struct { 103 size_t len; 104 char *str; 105 } warc_strbuf_t; 106 107 struct warc_s { 108 /* content length ahead */ 109 size_t cntlen; 110 /* and how much we've processed so far */ 111 size_t cntoff; 112 /* and how much we need to consume between calls */ 113 size_t unconsumed; 114 115 /* string pool */ 116 warc_strbuf_t pool; 117 /* previous version */ 118 unsigned int pver; 119 /* stringified format name */ 120 struct archive_string sver; 121 }; 122 123 static int _warc_bid(struct archive_read *a, int); 124 static int _warc_cleanup(struct archive_read *a); 125 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); 126 static int _warc_skip(struct archive_read *a); 127 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); 128 129 /* private routines */ 130 static unsigned int _warc_rdver(const char buf[10], size_t bsz); 131 static unsigned int _warc_rdtyp(const char *buf, size_t bsz); 132 static warc_string_t _warc_rduri(const char *buf, size_t bsz); 133 static ssize_t _warc_rdlen(const char *buf, size_t bsz); 134 static time_t _warc_rdrtm(const char *buf, size_t bsz); 135 static time_t _warc_rdmtm(const char *buf, size_t bsz); 136 static const char *_warc_find_eoh(const char *buf, size_t bsz); 137 138 139 int 140 archive_read_support_format_warc(struct archive *_a) 141 { 142 struct archive_read *a = (struct archive_read *)_a; 143 struct warc_s *w; 144 int r; 145 146 archive_check_magic(_a, ARCHIVE_READ_MAGIC, 147 ARCHIVE_STATE_NEW, "archive_read_support_format_warc"); 148 149 if ((w = malloc(sizeof(*w))) == NULL) { 150 archive_set_error(&a->archive, ENOMEM, 151 "Can't allocate warc data"); 152 return (ARCHIVE_FATAL); 153 } 154 memset(w, 0, sizeof(*w)); 155 156 r = __archive_read_register_format( 157 a, w, "warc", 158 _warc_bid, NULL, _warc_rdhdr, _warc_read, 159 _warc_skip, NULL, _warc_cleanup, NULL, NULL); 160 161 if (r != ARCHIVE_OK) { 162 free(w); 163 return (r); 164 } 165 return (ARCHIVE_OK); 166 } 167 168 static int 169 _warc_cleanup(struct archive_read *a) 170 { 171 struct warc_s *w = a->format->data; 172 173 if (w->pool.len > 0U) { 174 free(w->pool.str); 175 } 176 archive_string_free(&w->sver); 177 free(w); 178 a->format->data = NULL; 179 return (ARCHIVE_OK); 180 } 181 182 static int 183 _warc_bid(struct archive_read *a, int best_bid) 184 { 185 const char *hdr; 186 ssize_t nrd; 187 unsigned int ver; 188 189 (void)best_bid; /* UNUSED */ 190 191 /* check first line of file, it should be a record already */ 192 if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) { 193 /* no idea what to do */ 194 return -1; 195 } else if (nrd < 12) { 196 /* nah, not for us, our magic cookie is at least 12 bytes */ 197 return -1; 198 } 199 200 /* otherwise snarf the record's version number */ 201 ver = _warc_rdver(hdr, nrd); 202 if (ver == 0U || ver > 10000U) { 203 /* oh oh oh, best not to wager ... */ 204 return -1; 205 } 206 207 /* otherwise be confident */ 208 return (64); 209 } 210 211 static int 212 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry) 213 { 214 #define HDR_PROBE_LEN (12U) 215 struct warc_s *w = a->format->data; 216 unsigned int ver; 217 const char *buf; 218 ssize_t nrd; 219 const char *eoh; 220 /* for the file name, saves some strndup()'ing */ 221 warc_string_t fnam; 222 /* warc record type, not that we really use it a lot */ 223 warc_type_t ftyp; 224 /* content-length+error monad */ 225 ssize_t cntlen; 226 /* record time is the WARC-Date time we reinterpret it as ctime */ 227 time_t rtime; 228 /* mtime is the Last-Modified time which will be the entry's mtime */ 229 time_t mtime; 230 231 start_over: 232 /* just use read_ahead() they keep track of unconsumed 233 * bits and bobs for us; no need to put an extra shift in 234 * and reproduce that functionality here */ 235 buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd); 236 237 if (nrd < 0) { 238 /* no good */ 239 archive_set_error( 240 &a->archive, ARCHIVE_ERRNO_MISC, 241 "Bad record header"); 242 return (ARCHIVE_FATAL); 243 } else if (buf == NULL) { 244 /* there should be room for at least WARC/bla\r\n 245 * must be EOF therefore */ 246 return (ARCHIVE_EOF); 247 } 248 /* looks good so far, try and find the end of the header now */ 249 eoh = _warc_find_eoh(buf, nrd); 250 if (eoh == NULL) { 251 /* still no good, the header end might be beyond the 252 * probe we've requested, but then again who'd cram 253 * so much stuff into the header *and* be 28500-compliant */ 254 archive_set_error( 255 &a->archive, ARCHIVE_ERRNO_MISC, 256 "Bad record header"); 257 return (ARCHIVE_FATAL); 258 } else if ((ver = _warc_rdver(buf, eoh - buf)) > 10000U) { 259 /* nawww, I wish they promised backward compatibility 260 * anyhoo, in their infinite wisdom the 28500 guys might 261 * come up with something we can't possibly handle so 262 * best end things here */ 263 archive_set_error( 264 &a->archive, ARCHIVE_ERRNO_MISC, 265 "Unsupported record version"); 266 return (ARCHIVE_FATAL); 267 } else if ((cntlen = _warc_rdlen(buf, eoh - buf)) < 0) { 268 /* nightmare! the specs say content-length is mandatory 269 * so I don't feel overly bad stopping the reader here */ 270 archive_set_error( 271 &a->archive, EINVAL, 272 "Bad content length"); 273 return (ARCHIVE_FATAL); 274 } else if ((rtime = _warc_rdrtm(buf, eoh - buf)) == (time_t)-1) { 275 /* record time is mandatory as per WARC/1.0, 276 * so just barf here, fast and loud */ 277 archive_set_error( 278 &a->archive, EINVAL, 279 "Bad record time"); 280 return (ARCHIVE_FATAL); 281 } 282 283 /* let the world know we're a WARC archive */ 284 a->archive.archive_format = ARCHIVE_FORMAT_WARC; 285 if (ver != w->pver) { 286 /* stringify this entry's version */ 287 archive_string_sprintf(&w->sver, 288 "WARC/%u.%u", ver / 10000, ver % 10000); 289 /* remember the version */ 290 w->pver = ver; 291 } 292 /* start off with the type */ 293 ftyp = _warc_rdtyp(buf, eoh - buf); 294 /* and let future calls know about the content */ 295 w->cntlen = cntlen; 296 w->cntoff = 0U; 297 mtime = 0;/* Avoid compiling error on some platform. */ 298 299 switch (ftyp) { 300 case WT_RSRC: 301 case WT_RSP: 302 /* only try and read the filename in the cases that are 303 * guaranteed to have one */ 304 fnam = _warc_rduri(buf, eoh - buf); 305 /* check the last character in the URI to avoid creating 306 * directory endpoints as files, see Todo above */ 307 if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') { 308 /* break here for now */ 309 fnam.len = 0U; 310 fnam.str = NULL; 311 break; 312 } 313 /* bang to our string pool, so we save a 314 * malloc()+free() roundtrip */ 315 if (fnam.len + 1U > w->pool.len) { 316 w->pool.len = ((fnam.len + 64U) / 64U) * 64U; 317 w->pool.str = realloc(w->pool.str, w->pool.len); 318 } 319 memcpy(w->pool.str, fnam.str, fnam.len); 320 w->pool.str[fnam.len] = '\0'; 321 /* let noone else know about the pool, it's a secret, shhh */ 322 fnam.str = w->pool.str; 323 324 /* snarf mtime or deduce from rtime 325 * this is a custom header added by our writer, it's quite 326 * hard to believe anyone else would go through with it 327 * (apart from being part of some http responses of course) */ 328 if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { 329 mtime = rtime; 330 } 331 break; 332 default: 333 fnam.len = 0U; 334 fnam.str = NULL; 335 break; 336 } 337 338 /* now eat some of those delicious buffer bits */ 339 __archive_read_consume(a, eoh - buf); 340 341 switch (ftyp) { 342 case WT_RSRC: 343 case WT_RSP: 344 if (fnam.len > 0U) { 345 /* populate entry object */ 346 archive_entry_set_filetype(entry, AE_IFREG); 347 archive_entry_copy_pathname(entry, fnam.str); 348 archive_entry_set_size(entry, cntlen); 349 archive_entry_set_perm(entry, 0644); 350 /* rtime is the new ctime, mtime stays mtime */ 351 archive_entry_set_ctime(entry, rtime, 0L); 352 archive_entry_set_mtime(entry, mtime, 0L); 353 break; 354 } 355 /* FALLTHROUGH */ 356 default: 357 /* consume the content and start over */ 358 _warc_skip(a); 359 goto start_over; 360 } 361 return (ARCHIVE_OK); 362 } 363 364 static int 365 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off) 366 { 367 struct warc_s *w = a->format->data; 368 const char *rab; 369 ssize_t nrd; 370 371 if (w->cntoff >= w->cntlen) { 372 eof: 373 /* it's our lucky day, no work, we can leave early */ 374 *buf = NULL; 375 *bsz = 0U; 376 *off = w->cntoff + 4U/*for \r\n\r\n separator*/; 377 w->unconsumed = 0U; 378 return (ARCHIVE_EOF); 379 } 380 381 rab = __archive_read_ahead(a, 1U, &nrd); 382 if (nrd < 0) { 383 *bsz = 0U; 384 /* big catastrophe */ 385 return (int)nrd; 386 } else if (nrd == 0) { 387 goto eof; 388 } else if ((size_t)nrd > w->cntlen - w->cntoff) { 389 /* clamp to content-length */ 390 nrd = w->cntlen - w->cntoff; 391 } 392 *off = w->cntoff; 393 *bsz = nrd; 394 *buf = rab; 395 396 w->cntoff += nrd; 397 w->unconsumed = (size_t)nrd; 398 return (ARCHIVE_OK); 399 } 400 401 static int 402 _warc_skip(struct archive_read *a) 403 { 404 struct warc_s *w = a->format->data; 405 406 __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/); 407 w->cntlen = 0U; 408 w->cntoff = 0U; 409 return (ARCHIVE_OK); 410 } 411 412 413 /* private routines */ 414 static void* 415 deconst(const void *c) 416 { 417 return (char *)0x1 + (((const char *)c) - (const char *)0x1); 418 } 419 420 static char* 421 xmemmem(const char *hay, const size_t haysize, 422 const char *needle, const size_t needlesize) 423 { 424 const char *const eoh = hay + haysize; 425 const char *const eon = needle + needlesize; 426 const char *hp; 427 const char *np; 428 const char *cand; 429 unsigned int hsum; 430 unsigned int nsum; 431 unsigned int eqp; 432 433 /* trivial checks first 434 * a 0-sized needle is defined to be found anywhere in haystack 435 * then run strchr() to find a candidate in HAYSTACK (i.e. a portion 436 * that happens to begin with *NEEDLE) */ 437 if (needlesize == 0UL) { 438 return deconst(hay); 439 } else if ((hay = memchr(hay, *needle, haysize)) == NULL) { 440 /* trivial */ 441 return NULL; 442 } 443 444 /* First characters of haystack and needle are the same now. Both are 445 * guaranteed to be at least one character long. Now computes the sum 446 * of characters values of needle together with the sum of the first 447 * needle_len characters of haystack. */ 448 for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U; 449 hp < eoh && np < eon; 450 hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++); 451 452 /* HP now references the (NEEDLESIZE + 1)-th character. */ 453 if (np < eon) { 454 /* haystack is smaller than needle, :O */ 455 return NULL; 456 } else if (eqp) { 457 /* found a match */ 458 return deconst(hay); 459 } 460 461 /* now loop through the rest of haystack, 462 * updating the sum iteratively */ 463 for (cand = hay; hp < eoh; hp++) { 464 hsum ^= *cand++; 465 hsum ^= *hp; 466 467 /* Since the sum of the characters is already known to be 468 * equal at that point, it is enough to check just NEEDLESIZE - 1 469 * characters for equality, 470 * also CAND is by design < HP, so no need for range checks */ 471 if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) { 472 return deconst(cand); 473 } 474 } 475 return NULL; 476 } 477 478 static int 479 strtoi_lim(const char *str, const char **ep, int llim, int ulim) 480 { 481 int res = 0; 482 const char *sp; 483 /* we keep track of the number of digits via rulim */ 484 int rulim; 485 486 for (sp = str, rulim = ulim > 10 ? ulim : 10; 487 res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; 488 sp++, rulim /= 10) { 489 res *= 10; 490 res += *sp - '0'; 491 } 492 if (sp == str) { 493 res = -1; 494 } else if (res < llim || res > ulim) { 495 res = -2; 496 } 497 *ep = (const char*)sp; 498 return res; 499 } 500 501 static time_t 502 time_from_tm(struct tm *t) 503 { 504 #if HAVE_TIMEGM 505 /* Use platform timegm() if available. */ 506 return (timegm(t)); 507 #elif HAVE__MKGMTIME64 508 return (_mkgmtime64(t)); 509 #else 510 /* Else use direct calculation using POSIX assumptions. */ 511 /* First, fix up tm_yday based on the year/month/day. */ 512 if (mktime(t) == (time_t)-1) 513 return ((time_t)-1); 514 /* Then we can compute timegm() from first principles. */ 515 return (t->tm_sec 516 + t->tm_min * 60 517 + t->tm_hour * 3600 518 + t->tm_yday * 86400 519 + (t->tm_year - 70) * 31536000 520 + ((t->tm_year - 69) / 4) * 86400 521 - ((t->tm_year - 1) / 100) * 86400 522 + ((t->tm_year + 299) / 400) * 86400); 523 #endif 524 } 525 526 static time_t 527 xstrpisotime(const char *s, char **endptr) 528 { 529 /** like strptime() but strictly for ISO 8601 Zulu strings */ 530 struct tm tm; 531 time_t res = (time_t)-1; 532 533 /* make sure tm is clean */ 534 memset(&tm, 0, sizeof(tm)); 535 536 /* as a courtesy to our callers, and since this is a non-standard 537 * routine, we skip leading whitespace */ 538 for (; isspace(*s); s++); 539 540 /* read year */ 541 if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { 542 goto out; 543 } 544 /* read month */ 545 if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { 546 goto out; 547 } 548 /* read day-of-month */ 549 if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { 550 goto out; 551 } 552 /* read hour */ 553 if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { 554 goto out; 555 } 556 /* read minute */ 557 if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { 558 goto out; 559 } 560 /* read second */ 561 if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { 562 goto out; 563 } 564 565 /* massage TM to fulfill some of POSIX' contraints */ 566 tm.tm_year -= 1900; 567 tm.tm_mon--; 568 569 /* now convert our custom tm struct to a unix stamp using UTC */ 570 res = time_from_tm(&tm); 571 572 out: 573 if (endptr != NULL) { 574 *endptr = deconst(s); 575 } 576 return res; 577 } 578 579 static unsigned int 580 _warc_rdver(const char buf[10], size_t bsz) 581 { 582 static const char magic[] = "WARC/"; 583 unsigned int ver; 584 585 (void)bsz; /* UNUSED */ 586 587 if (memcmp(buf, magic, sizeof(magic) - 1U) != 0) { 588 /* nope */ 589 return 99999U; 590 } 591 /* looks good so far, read the version number for a laugh */ 592 buf += sizeof(magic) - 1U; 593 /* most common case gets a quick-check here */ 594 if (memcmp(buf, "1.0\r\n", 5U) == 0) { 595 ver = 10000U; 596 } else { 597 switch (*buf) { 598 case '0': 599 case '1': 600 case '2': 601 case '3': 602 case '4': 603 case '5': 604 case '6': 605 case '7': 606 case '8': 607 if (buf[1U] == '.') { 608 char *on; 609 610 /* set up major version */ 611 ver = (buf[0U] - '0') * 10000U; 612 /* minor version, anyone? */ 613 ver += (strtol(buf + 2U, &on, 10)) * 100U; 614 /* don't parse anything else */ 615 if (on > buf + 2U) { 616 break; 617 } 618 } 619 /* FALLTHROUGH */ 620 case '9': 621 default: 622 /* just make the version ridiculously high */ 623 ver = 999999U; 624 break; 625 } 626 } 627 return ver; 628 } 629 630 static unsigned int 631 _warc_rdtyp(const char *buf, size_t bsz) 632 { 633 static const char _key[] = "\r\nWARC-Type:"; 634 const char *const eob = buf + bsz; 635 const char *val; 636 637 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 638 /* no bother */ 639 return WT_NONE; 640 } 641 /* overread whitespace */ 642 for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++); 643 644 if (val + 8U > eob) { 645 ; 646 } else if (memcmp(val, "resource", 8U) == 0) { 647 return WT_RSRC; 648 } else if (memcmp(val, "warcinfo", 8U) == 0) { 649 return WT_INFO; 650 } else if (memcmp(val, "metadata", 8U) == 0) { 651 return WT_META; 652 } else if (memcmp(val, "request", 7U) == 0) { 653 return WT_REQ; 654 } else if (memcmp(val, "response", 8U) == 0) { 655 return WT_RSP; 656 } else if (memcmp(val, "conversi", 8U) == 0) { 657 return WT_CONV; 658 } else if (memcmp(val, "continua", 8U) == 0) { 659 return WT_CONT; 660 } 661 return WT_NONE; 662 } 663 664 static warc_string_t 665 _warc_rduri(const char *buf, size_t bsz) 666 { 667 static const char _key[] = "\r\nWARC-Target-URI:"; 668 const char *const eob = buf + bsz; 669 const char *val; 670 const char *uri; 671 const char *eol; 672 warc_string_t res = {0U, NULL}; 673 674 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 675 /* no bother */ 676 return res; 677 } 678 /* overread whitespace */ 679 for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++); 680 681 /* overread URL designators */ 682 if ((uri = xmemmem(val, eob - val, "://", 3U)) == NULL) { 683 /* not touching that! */ 684 return res; 685 } else if ((eol = memchr(uri, '\n', eob - uri)) == NULL) { 686 /* no end of line? :O */ 687 return res; 688 } 689 690 /* massage uri to point to after :// */ 691 uri += 3U; 692 /* also massage eol to point to the first whitespace 693 * after the last non-whitespace character before 694 * the end of the line */ 695 for (; eol > uri && isspace(eol[-1]); eol--); 696 697 /* now then, inspect the URI */ 698 if (memcmp(val, "file", 4U) == 0) { 699 /* perfect, nothing left to do here */ 700 701 } else if (memcmp(val, "http", 4U) == 0 || 702 memcmp(val, "ftp", 3U) == 0) { 703 /* overread domain, and the first / */ 704 while (uri < eol && *uri++ != '/'); 705 } else { 706 /* not sure what to do? best to bugger off */ 707 return res; 708 } 709 res.str = uri; 710 res.len = eol - uri; 711 return res; 712 } 713 714 static ssize_t 715 _warc_rdlen(const char *buf, size_t bsz) 716 { 717 static const char _key[] = "\r\nContent-Length:"; 718 const char *val; 719 char *on = NULL; 720 long int len; 721 722 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 723 /* no bother */ 724 return -1; 725 } 726 727 /* strtol kindly overreads whitespace for us, so use that */ 728 val += sizeof(_key) - 1U; 729 len = strtol(val, &on, 10); 730 if (on == NULL || !isspace(*on)) { 731 /* hm, can we trust that number? Best not. */ 732 return -1; 733 } 734 return (size_t)len; 735 } 736 737 static time_t 738 _warc_rdrtm(const char *buf, size_t bsz) 739 { 740 static const char _key[] = "\r\nWARC-Date:"; 741 const char *val; 742 char *on = NULL; 743 time_t res; 744 745 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 746 /* no bother */ 747 return (time_t)-1; 748 } 749 750 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 751 val += sizeof(_key) - 1U; 752 res = xstrpisotime(val, &on); 753 if (on == NULL || !isspace(*on)) { 754 /* hm, can we trust that number? Best not. */ 755 return (time_t)-1; 756 } 757 return res; 758 } 759 760 static time_t 761 _warc_rdmtm(const char *buf, size_t bsz) 762 { 763 static const char _key[] = "\r\nLast-Modified:"; 764 const char *val; 765 char *on = NULL; 766 time_t res; 767 768 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 769 /* no bother */ 770 return (time_t)-1; 771 } 772 773 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 774 val += sizeof(_key) - 1U; 775 res = xstrpisotime(val, &on); 776 if (on == NULL || !isspace(*on)) { 777 /* hm, can we trust that number? Best not. */ 778 return (time_t)-1; 779 } 780 return res; 781 } 782 783 static const char* 784 _warc_find_eoh(const char *buf, size_t bsz) 785 { 786 static const char _marker[] = "\r\n\r\n"; 787 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 788 789 if (hit != NULL) { 790 hit += sizeof(_marker) - 1U; 791 } 792 return hit; 793 } 794 795 /* archive_read_support_format_warc.c ends here */ 796