1 /*-
2  * Copyright (c) 2014 Sebastian Freundt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "archive_platform.h"
27 __FBSDID("$FreeBSD$");
28 
29 /**
30  * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
31  * ISO 28500:2009.
32  * For the purposes of this file we used the final draft from:
33  * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34  *
35  * Todo:
36  * [ ] real-world warcs can contain resources at endpoints ending in /
37  *     e.g. http://bibnum.bnf.fr/warc/
38  *     if you're lucky their response contains a Content-Location: header
39  *     pointing to a unix-compliant filename, in the example above it's
40  *     Content-Location: http://bibnum.bnf.fr/warc/index.html
41  *     however, that's not mandated and github for example doesn't follow
42  *     this convention.
43  *     We need a set of archive options to control what to do with
44  *     entries like these, at the moment care is taken to skip them.
45  *
46  **/
47 
48 #ifdef HAVE_SYS_STAT_H
49 #include <sys/stat.h>
50 #endif
51 #ifdef HAVE_ERRNO_H
52 #include <errno.h>
53 #endif
54 #ifdef HAVE_STDLIB_H
55 #include <stdlib.h>
56 #endif
57 #ifdef HAVE_STRING_H
58 #include <string.h>
59 #endif
60 #ifdef HAVE_LIMITS_H
61 #include <limits.h>
62 #endif
63 #ifdef HAVE_CTYPE_H
64 #include <ctype.h>
65 #endif
66 #ifdef HAVE_TIME_H
67 #include <time.h>
68 #endif
69 
70 #include "archive.h"
71 #include "archive_entry.h"
72 #include "archive_private.h"
73 #include "archive_read_private.h"
74 
/*
 * Record types as announced by a record's WARC-Type header field.
 * _warc_rdtyp() maps the field value onto one of these and
 * _warc_rdhdr() decides per type whether the record becomes an
 * archive entry or is skipped.
 */
typedef enum {
	/* no WARC-Type field found, or its value was not recognised */
	WT_NONE,
	/* warcinfo */
	WT_INFO,
	/* metadata */
	WT_META,
	/* resource */
	WT_RSRC,
	/* request, unsupported */
	WT_REQ,
	/* response, read by _warc_rdhdr() the same way as a resource */
	WT_RSP,
	/* revisit, unsupported */
	WT_RVIS,
	/* conversion, unsupported */
	WT_CONV,
	/* continuation, unsupported at the moment */
	WT_CONT,
	/* invalid type */
	LAST_WT
} warc_type_t;
96 
/*
 * Read-only view of LEN bytes starting at STR.  The bytes are not
 * necessarily NUL-terminated (e.g. when STR points into the header
 * buffer), so always go by LEN.
 */
typedef struct {
	size_t len;
	const char *str;
} warc_string_t;
101 
/*
 * Writable, heap-backed buffer; used for the string pool in struct
 * warc_s where LEN is the number of bytes allocated at STR.
 */
typedef struct {
	size_t len;
	char *str;
} warc_strbuf_t;
106 
/*
 * Per-archive reader state, allocated zero-filled in
 * archive_read_support_format_warc() and released in _warc_cleanup().
 */
struct warc_s {
	/* content length ahead, taken from the record's Content-Length */
	size_t cntlen;
	/* and how much we've processed so far, advanced by _warc_read() */
	size_t cntoff;
	/* and how much we need to consume between calls, i.e. the size
	 * of the chunk _warc_read() handed out last */
	size_t unconsumed;

	/* string pool, holds the NUL-terminated path name of the current
	 * entry and is grown on demand by _warc_rdhdr() */
	warc_strbuf_t pool;
	/* previous version, i.e. the version number of the record read
	 * before, in the encoding produced by _warc_rdver() */
	unsigned int pver;
	/* stringified format name, "WARC/<major>.<minor>" */
	struct archive_string sver;
};
122 
/* format callbacks, registered in archive_read_support_format_warc() */
static int _warc_bid(struct archive_read *a, int);
static int _warc_cleanup(struct archive_read *a);
static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
static int _warc_skip(struct archive_read *a);
static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);

/* private routines
 * each _warc_rd*() helper extracts one piece of information from the
 * BSZ bytes of record header at BUF */
static unsigned int _warc_rdver(const char buf[10], size_t bsz);
static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
static warc_string_t _warc_rduri(const char *buf, size_t bsz);
static ssize_t _warc_rdlen(const char *buf, size_t bsz);
static time_t _warc_rdrtm(const char *buf, size_t bsz);
static time_t _warc_rdmtm(const char *buf, size_t bsz);
/* locates the first byte after the blank line that ends the header */
static const char *_warc_find_eoh(const char *buf, size_t bsz);
137 
138 
139 int
140 archive_read_support_format_warc(struct archive *_a)
141 {
142 	struct archive_read *a = (struct archive_read *)_a;
143 	struct warc_s *w;
144 	int r;
145 
146 	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147 	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
148 
149 	if ((w = malloc(sizeof(*w))) == NULL) {
150 		archive_set_error(&a->archive, ENOMEM,
151 		    "Can't allocate warc data");
152 		return (ARCHIVE_FATAL);
153 	}
154 	memset(w, 0, sizeof(*w));
155 
156 	r = __archive_read_register_format(
157 		a, w, "warc",
158 		_warc_bid, NULL, _warc_rdhdr, _warc_read,
159 		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
160 
161 	if (r != ARCHIVE_OK) {
162 		free(w);
163 		return (r);
164 	}
165 	return (ARCHIVE_OK);
166 }
167 
168 static int
169 _warc_cleanup(struct archive_read *a)
170 {
171 	struct warc_s *w = a->format->data;
172 
173 	if (w->pool.len > 0U) {
174 		free(w->pool.str);
175 	}
176 	archive_string_free(&w->sver);
177 	free(w);
178 	a->format->data = NULL;
179 	return (ARCHIVE_OK);
180 }
181 
/*
 * Bid for the stream: peek at the first 12 bytes and see whether they
 * read like the version line of a WARC record.
 * Returns 64 for a recognised version between 0.x and 1.0 inclusive
 * (but not 0.0), and -1 in every other case, including short or
 * unreadable input.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* the very first line of the file should open a record already,
	 * and our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* a sane version number is all the evidence we ask for */
	version = _warc_rdver(probe, avail);
	if (version >= 1U && version <= 10000U) {
		return (64);
	}
	/* anything else is best not wagered on */
	return -1;
}
210 
211 static int
212 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
213 {
214 #define HDR_PROBE_LEN		(12U)
215 	struct warc_s *w = a->format->data;
216 	unsigned int ver;
217 	const char *buf;
218 	ssize_t nrd;
219 	const char *eoh;
220 	/* for the file name, saves some strndup()'ing */
221 	warc_string_t fnam;
222 	/* warc record type, not that we really use it a lot */
223 	warc_type_t ftyp;
224 	/* content-length+error monad */
225 	ssize_t cntlen;
226 	/* record time is the WARC-Date time we reinterpret it as ctime */
227 	time_t rtime;
228 	/* mtime is the Last-Modified time which will be the entry's mtime */
229 	time_t mtime;
230 
231 start_over:
232 	/* just use read_ahead() they keep track of unconsumed
233 	 * bits and bobs for us; no need to put an extra shift in
234 	 * and reproduce that functionality here */
235 	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
236 
237 	if (nrd < 0) {
238 		/* no good */
239 		archive_set_error(
240 			&a->archive, ARCHIVE_ERRNO_MISC,
241 			"Bad record header");
242 		return (ARCHIVE_FATAL);
243 	} else if (buf == NULL) {
244 		/* there should be room for at least WARC/bla\r\n
245 		 * must be EOF therefore */
246 		return (ARCHIVE_EOF);
247 	}
248  	/* looks good so far, try and find the end of the header now */
249 	eoh = _warc_find_eoh(buf, nrd);
250 	if (eoh == NULL) {
251 		/* still no good, the header end might be beyond the
252 		 * probe we've requested, but then again who'd cram
253 		 * so much stuff into the header *and* be 28500-compliant */
254 		archive_set_error(
255 			&a->archive, ARCHIVE_ERRNO_MISC,
256 			"Bad record header");
257 		return (ARCHIVE_FATAL);
258 	} else if ((ver = _warc_rdver(buf, eoh - buf)) > 10000U) {
259 		/* nawww, I wish they promised backward compatibility
260 		 * anyhoo, in their infinite wisdom the 28500 guys might
261 		 * come up with something we can't possibly handle so
262 		 * best end things here */
263 		archive_set_error(
264 			&a->archive, ARCHIVE_ERRNO_MISC,
265 			"Unsupported record version");
266 		return (ARCHIVE_FATAL);
267 	} else if ((cntlen = _warc_rdlen(buf, eoh - buf)) < 0) {
268 		/* nightmare!  the specs say content-length is mandatory
269 		 * so I don't feel overly bad stopping the reader here */
270 		archive_set_error(
271 			&a->archive, EINVAL,
272 			"Bad content length");
273 		return (ARCHIVE_FATAL);
274 	} else if ((rtime = _warc_rdrtm(buf, eoh - buf)) == (time_t)-1) {
275 		/* record time is mandatory as per WARC/1.0,
276 		 * so just barf here, fast and loud */
277 		archive_set_error(
278 			&a->archive, EINVAL,
279 			"Bad record time");
280 		return (ARCHIVE_FATAL);
281 	}
282 
283 	/* let the world know we're a WARC archive */
284 	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
285 	if (ver != w->pver) {
286 		/* stringify this entry's version */
287 		archive_string_sprintf(&w->sver,
288 			"WARC/%u.%u", ver / 10000, ver % 10000);
289 		/* remember the version */
290 		w->pver = ver;
291 	}
292 	/* start off with the type */
293 	ftyp = _warc_rdtyp(buf, eoh - buf);
294 	/* and let future calls know about the content */
295 	w->cntlen = cntlen;
296 	w->cntoff = 0U;
297 	mtime = 0;/* Avoid compiling error on some platform. */
298 
299 	switch (ftyp) {
300 	case WT_RSRC:
301 	case WT_RSP:
302 		/* only try and read the filename in the cases that are
303 		 * guaranteed to have one */
304 		fnam = _warc_rduri(buf, eoh - buf);
305 		/* check the last character in the URI to avoid creating
306 		 * directory endpoints as files, see Todo above */
307 		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
308 			/* break here for now */
309 			fnam.len = 0U;
310 			fnam.str = NULL;
311 			break;
312 		}
313 		/* bang to our string pool, so we save a
314 		 * malloc()+free() roundtrip */
315 		if (fnam.len + 1U > w->pool.len) {
316 			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
317 			w->pool.str = realloc(w->pool.str, w->pool.len);
318 		}
319 		memcpy(w->pool.str, fnam.str, fnam.len);
320 		w->pool.str[fnam.len] = '\0';
321 		/* let noone else know about the pool, it's a secret, shhh */
322 		fnam.str = w->pool.str;
323 
324 		/* snarf mtime or deduce from rtime
325 		 * this is a custom header added by our writer, it's quite
326 		 * hard to believe anyone else would go through with it
327 		 * (apart from being part of some http responses of course) */
328 		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
329 			mtime = rtime;
330 		}
331 		break;
332 	default:
333 		fnam.len = 0U;
334 		fnam.str = NULL;
335 		break;
336 	}
337 
338 	/* now eat some of those delicious buffer bits */
339 	__archive_read_consume(a, eoh - buf);
340 
341 	switch (ftyp) {
342 	case WT_RSRC:
343 	case WT_RSP:
344 		if (fnam.len > 0U) {
345 			/* populate entry object */
346 			archive_entry_set_filetype(entry, AE_IFREG);
347 			archive_entry_copy_pathname(entry, fnam.str);
348 			archive_entry_set_size(entry, cntlen);
349 			archive_entry_set_perm(entry, 0644);
350 			/* rtime is the new ctime, mtime stays mtime */
351 			archive_entry_set_ctime(entry, rtime, 0L);
352 			archive_entry_set_mtime(entry, mtime, 0L);
353 			break;
354 		}
355 		/* FALLTHROUGH */
356 	default:
357 		/* consume the content and start over */
358 		_warc_skip(a);
359 		goto start_over;
360 	}
361 	return (ARCHIVE_OK);
362 }
363 
364 static int
365 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
366 {
367 	struct warc_s *w = a->format->data;
368 	const char *rab;
369 	ssize_t nrd;
370 
371 	if (w->cntoff >= w->cntlen) {
372 	eof:
373 		/* it's our lucky day, no work, we can leave early */
374 		*buf = NULL;
375 		*bsz = 0U;
376 		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
377 		w->unconsumed = 0U;
378 		return (ARCHIVE_EOF);
379 	}
380 
381 	rab = __archive_read_ahead(a, 1U, &nrd);
382 	if (nrd < 0) {
383 		*bsz = 0U;
384 		/* big catastrophe */
385 		return (int)nrd;
386 	} else if (nrd == 0) {
387 		goto eof;
388 	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
389 		/* clamp to content-length */
390 		nrd = w->cntlen - w->cntoff;
391 	}
392 	*off = w->cntoff;
393 	*bsz = nrd;
394 	*buf = rab;
395 
396 	w->cntoff += nrd;
397 	w->unconsumed = (size_t)nrd;
398 	return (ARCHIVE_OK);
399 }
400 
401 static int
402 _warc_skip(struct archive_read *a)
403 {
404 	struct warc_s *w = a->format->data;
405 
406 	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
407 	w->cntlen = 0U;
408 	w->cntoff = 0U;
409 	return (ARCHIVE_OK);
410 }
411 
412 
413 /* private routines */
/*
 * Strip the const qualifier off C without a cast, so compilers that warn
 * about qualifier-dropping casts stay quiet.  The address is passed
 * through unchanged, NULL included.
 *
 * Goes through a union because pointers to qualified and unqualified
 * versions of a type share one representation; no arithmetic on
 * made-up addresses is needed for that.
 */
static void*
deconst(const void *c)
{
	union {
		const void *from;
		void *to;
	} pun;

	pun.from = c;
	return pun.to;
}
419 
/*
 * Locate the first occurrence of the NEEDLESIZE bytes at NEEDLE within
 * the HAYSIZE bytes at HAY, much like GNU memmem().
 * Returns a pointer to the match, HAY itself for an empty needle, and
 * NULL if there is no match.
 *
 * Works with a rolling XOR checksum over a window of NEEDLESIZE bytes:
 * only windows whose checksum equals the needle's are compared bytewise.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	/* first byte of the window under inspection */
	const char *head;
	/* byte about to enter the window */
	const char *tail;
	const char *np;
	unsigned int hsum;
	unsigned int nsum;
	unsigned int same;

	/* an empty needle is found anywhere, so right at the start */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* fast-forward to the first byte that could begin a match */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* HAY and NEEDLE agree in their first byte now, so both checksums
	 * start out from that byte; fold in the remaining needle bytes and
	 * equally many haystack bytes, noting on the way whether the two
	 * happen to be equal throughout */
	hsum = *hay;
	nsum = *hay;
	same = 1U;
	tail = hay + 1U;
	np = needle + 1U;
	while (tail < hay_end && np < needle_end) {
		hsum ^= *tail;
		nsum ^= *np;
		same &= *tail == *np;
		tail++;
		np++;
	}

	if (np < needle_end) {
		/* ran out of haystack before running out of needle */
		return NULL;
	}
	if (same) {
		/* the very first candidate was a match already */
		return deconst(hay);
	}

	/* slide the window byte by byte, dropping the byte that leaves
	 * and adding the one that enters */
	head = hay;
	while (tail < hay_end) {
		hsum ^= *head++;
		hsum ^= *tail;

		/* with equal checksums and NEEDLESIZE - 1 equal bytes the
		 * last byte must be equal too; HEAD stays below TAIL by
		 * construction so the comparison is within bounds */
		if (hsum == nsum &&
		    memcmp(head, needle, needlesize - 1U) == 0) {
			return deconst(head);
		}
		tail++;
	}
	return NULL;
}
477 
/*
 * Parse a non-negative decimal number at STR that is expected to lie
 * within [LLIM, ULIM].
 *
 * At most as many digits are read as max(ULIM, 10) has, and reading
 * stops early once another digit would take the value beyond ULIM.
 * *EP is set to the first character not read.
 * Returns the number, -1 if STR does not start with a digit, or -2 if
 * the number is outside [LLIM, ULIM].
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	/* loses one decimal digit per digit read, 0 means stop */
	int budget = (ulim > 10) ? ulim : 10;
	int acc = 0;

	while (acc * 10 <= ulim && budget != 0 &&
	       *cur >= '0' && *cur <= '9') {
		acc = acc * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	*ep = (const char*)cur;
	if (cur == str) {
		/* not a single digit */
		return -1;
	}
	if (acc < llim || acc > ulim) {
		/* out of range */
		return -2;
	}
	return acc;
}
500 
/*
 * Convert the broken-down UTC time in T into seconds since the epoch
 * (1970-01-01T00:00:00Z), i.e. do what timegm() does.
 *
 * The platform's timegm() or _mkgmtime64() is used where configure found
 * one.  The fallback computes the result from tm_year, tm_mon, tm_mday,
 * tm_hour, tm_min and tm_sec alone using the proleptic Gregorian
 * calendar; it neither consults the local timezone nor modifies T, and
 * it is valid for dates before 1970 as well.  Out-of-range months are
 * carried into the year, out-of-range days, hours, minutes and seconds
 * simply add up.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* days elapsed before the first of each month, non-leap years */
	static const int days_before_month[12] = {
		0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
	};
	/* a multiple of 400 years, big enough to lift any year an int
	 * tm_year can express above zero; the Gregorian calendar repeats
	 * every 400 years so leap year counts merely gain a constant */
	const long long shift = 4000000000LL;
	const long long epoch_prev = 1969LL + shift;
	long long year = 1900LL + t->tm_year;
	long long mon = t->tm_mon;
	long long prev;
	long long days;
	int leap;

	/* carry excess months into the year */
	year += mon / 12;
	mon %= 12;
	if (mon < 0) {
		mon += 12;
		year--;
	}
	leap = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;

	/* days between 1970-01-01 and January 1st of YEAR: 365 per year
	 * plus one per leap year in between; thanks to SHIFT all operands
	 * are positive and C's truncating division acts as floor */
	prev = year - 1 + shift;
	days = 365 * (year - 1970)
	    + (prev / 4 - prev / 100 + prev / 400)
	    - (epoch_prev / 4 - epoch_prev / 100 + epoch_prev / 400);
	/* then on to the first of the month, and to the day itself */
	days += days_before_month[mon];
	if (leap && mon > 1) {
		days++;
	}
	days += t->tm_mday - 1;

	/* all arithmetic is done in long long so dates beyond 2038 don't
	 * overflow before the final conversion */
	return ((time_t)(((days * 24 + t->tm_hour) * 60
	    + t->tm_min) * 60 + t->tm_sec));
#endif
}
525 
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings, that is
 * YYYY-MM-DDThh:mm:ssZ, optionally preceded by whitespace.
 *
 * Returns the time stamp in seconds since the epoch, or (time_t)-1 if S
 * does not follow the format or a field is out of range (years 1583 to
 * 4095 are accepted, seconds may be 60).
 * If ENDPTR is not NULL, *ENDPTR is set to the character after the 'Z'
 * on success and to where parsing stopped otherwise.
 *
 * S must be followed by something that stops the parser, such as the
 * line end of the header field it was taken from; there is no length
 * argument to go by.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm tm;
	time_t res = (time_t)-1;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace
	 * the cast keeps bytes above 0x7f from reaching isspace() as
	 * negative values where char is signed */
	for (; isspace((unsigned char)*s); s++);

	/* read year */
	if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') {
		goto out;
	}
	/* read month */
	if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') {
		goto out;
	}
	/* read day-of-month */
	if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') {
		goto out;
	}
	/* read hour */
	if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') {
		goto out;
	}
	/* read minute */
	if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') {
		goto out;
	}
	/* read second */
	if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') {
		goto out;
	}

	/* massage TM to fulfill some of POSIX' constraints:
	 * years count from 1900 and months from 0 */
	tm.tm_year -= 1900;
	tm.tm_mon--;

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
578 
/*
 * Read the version off the "WARC/<major>.<minor>" line that opens the
 * BSZ bytes of record header at BUF.
 *
 * Returns major * 10000 + minor * 100, e.g. 10000 for WARC/1.0 and 1800
 * for WARC/0.18.  Callers treat everything above 10000 as unsupported
 * and that is what is returned for input that is not understood:
 *   99999  if BUF holds less than 10 bytes or does not start with
 *          "WARC/",
 *   999999 if the major version is not a single digit 0..8 followed by
 *          a dot, or the minor version is missing or exceeds 99.
 *
 * Nothing beyond BUF + BSZ is looked at, and the minor version must
 * consist of digits only (no leading whitespace or sign).
 */
static unsigned int
_warc_rdver(const char buf[10], size_t bsz)
{
	static const char magic[] = "WARC/";
	const char *const eob = buf + bsz;
	const char *dig;
	unsigned int minor = 0U;

	/* the two memcmp()s below look at 10 bytes in total */
	if (bsz < 10U || memcmp(buf, magic, sizeof(magic) - 1U) != 0) {
		/* nope */
		return 99999U;
	}
	/* looks good so far, read the version number for a laugh */
	buf += sizeof(magic) - 1U;
	/* most common case gets a quick-check here */
	if (memcmp(buf, "1.0\r\n", 5U) == 0) {
		return 10000U;
	}
	if (buf[0U] < '0' || buf[0U] > '8' || buf[1U] != '.') {
		/* just make the version ridiculously high */
		return 999999U;
	}
	/* minor version, anyone? */
	for (dig = buf + 2U; dig < eob && *dig >= '0' && *dig <= '9'; dig++) {
		minor = minor * 10U + (unsigned int)(*dig - '0');
		if (minor > 99U) {
			/* would spill over into the major version */
			return 999999U;
		}
	}
	if (dig == buf + 2U) {
		/* no minor version at all */
		return 999999U;
	}
	return (unsigned int)(buf[0U] - '0') * 10000U + minor * 100U;
}
629 
630 static unsigned int
631 _warc_rdtyp(const char *buf, size_t bsz)
632 {
633 	static const char _key[] = "\r\nWARC-Type:";
634 	const char *const eob = buf + bsz;
635 	const char *val;
636 
637 	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
638 		/* no bother */
639 		return WT_NONE;
640 	}
641 	/* overread whitespace */
642 	for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++);
643 
644 	if (val + 8U > eob) {
645 		;
646 	} else if (memcmp(val, "resource", 8U) == 0) {
647 		return WT_RSRC;
648 	} else if (memcmp(val, "warcinfo", 8U) == 0) {
649 		return WT_INFO;
650 	} else if (memcmp(val, "metadata", 8U) == 0) {
651 		return WT_META;
652 	} else if (memcmp(val, "request", 7U) == 0) {
653 		return WT_REQ;
654 	} else if (memcmp(val, "response", 8U) == 0) {
655 		return WT_RSP;
656 	} else if (memcmp(val, "conversi", 8U) == 0) {
657 		return WT_CONV;
658 	} else if (memcmp(val, "continua", 8U) == 0) {
659 		return WT_CONT;
660 	}
661 	return WT_NONE;
662 }
663 
664 static warc_string_t
665 _warc_rduri(const char *buf, size_t bsz)
666 {
667 	static const char _key[] = "\r\nWARC-Target-URI:";
668 	const char *const eob = buf + bsz;
669 	const char *val;
670 	const char *uri;
671 	const char *eol;
672 	warc_string_t res = {0U, NULL};
673 
674 	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
675 		/* no bother */
676 		return res;
677 	}
678 	/* overread whitespace */
679 	for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++);
680 
681 	/* overread URL designators */
682 	if ((uri = xmemmem(val, eob - val, "://", 3U)) == NULL) {
683 		/* not touching that! */
684 		return res;
685 	} else if ((eol = memchr(uri, '\n', eob - uri)) == NULL) {
686 		/* no end of line? :O */
687 		return res;
688 	}
689 
690 	/* massage uri to point to after :// */
691 	uri += 3U;
692 	/* also massage eol to point to the first whitespace
693 	 * after the last non-whitespace character before
694 	 * the end of the line */
695 	for (; eol > uri && isspace(eol[-1]); eol--);
696 
697 	/* now then, inspect the URI */
698 	if (memcmp(val, "file", 4U) == 0) {
699 		/* perfect, nothing left to do here */
700 
701 	} else if (memcmp(val, "http", 4U) == 0 ||
702 		   memcmp(val, "ftp", 3U) == 0) {
703 		/* overread domain, and the first / */
704 		while (uri < eol && *uri++ != '/');
705 	} else {
706 		/* not sure what to do? best to bugger off */
707 		return res;
708 	}
709 	res.str = uri;
710 	res.len = eol - uri;
711 	return res;
712 }
713 
/*
 * Read the Content-Length field from the BSZ bytes of record header at
 * BUF.
 *
 * The value must be a run of decimal digits, optionally preceded by
 * whitespace and followed by a whitespace character (the line end
 * normally).  Nothing beyond BUF + BSZ is looked at.
 *
 * Returns the length, or -1 if the field is missing, holds no digits,
 * is followed by anything but whitespace, or is too big to be returned
 * as a non-negative ssize_t.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	/* biggest value that still comes out non-negative on return,
	 * assuming ssize_t is the signed counterpart of size_t */
	const size_t maxlen = ((size_t)-1) >> 1;
	const char *const eob = buf + bsz;
	const char *val;
	const char *dig;
	size_t len = 0U;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return -1;
	}

	/* overread whitespace
	 * the cast keeps bytes above 0x7f from reaching isspace() as
	 * negative values where char is signed */
	for (val += sizeof(_key) - 1U;
	     val < eob && isspace((unsigned char)*val); val++);

	/* collect the digits ourselves, the header is not NUL-terminated
	 * and strtol() would neither know where to stop nor tell us
	 * reliably about overflows */
	for (dig = val; dig < eob && *dig >= '0' && *dig <= '9'; dig++) {
		const size_t d = (size_t)(*dig - '0');

		if (len > (maxlen - d) / 10U) {
			/* won't fit */
			return -1;
		}
		len = len * 10U + d;
	}
	if (dig == val || dig >= eob || !isspace((unsigned char)*dig)) {
		/* hm, can we trust that number?  Best not. */
		return -1;
	}
	return (ssize_t)len;
}
736 
/*
 * Read the record time from the WARC-Date field within the BSZ bytes of
 * record header at BUF.
 *
 * Returns the time in seconds since the epoch, or (time_t)-1 if the
 * field is missing, is not an ISO 8601 Zulu time stamp as understood by
 * xstrpisotime(), or is followed by anything but whitespace.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const char *val;
	char *on = NULL;
	time_t res;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	val += sizeof(_key) - 1U;
	res = xstrpisotime(val, &on);
	/* the cast keeps bytes above 0x7f from reaching isspace() as
	 * negative values where char is signed */
	if (on == NULL || !isspace((unsigned char)*on)) {
		/* hm, can we trust that number?  Best not. */
		return (time_t)-1;
	}
	return res;
}
759 
/*
 * Read the modification time from the Last-Modified field within the
 * BSZ bytes of record header at BUF.
 *
 * Only ISO 8601 Zulu time stamps as understood by xstrpisotime() are
 * accepted.  Returns the time in seconds since the epoch, or (time_t)-1
 * if the field is missing, is in any other format, or is followed by
 * anything but whitespace.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const char *val;
	char *on = NULL;
	time_t res;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	val += sizeof(_key) - 1U;
	res = xstrpisotime(val, &on);
	/* the cast keeps bytes above 0x7f from reaching isspace() as
	 * negative values where char is signed */
	if (on == NULL || !isspace((unsigned char)*on)) {
		/* hm, can we trust that number?  Best not. */
		return (time_t)-1;
	}
	return res;
}
782 
/*
 * Find the end of the record header within the BSZ bytes at BUF.
 * The header is terminated by an empty line, i.e. \r\n\r\n.
 * Returns a pointer to the first byte after that marker, or NULL if the
 * marker does not occur.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *hit = xmemmem(buf, bsz, _marker, mlen);

	/* step over the marker, the content starts right behind it */
	return (hit == NULL) ? NULL : hit + mlen;
}
794 
795 /* archive_read_support_format_warc.c ends here */
796