1 /*-
2  * Copyright (c) 2014 Sebastian Freundt
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "archive_platform.h"
27 __FBSDID("$FreeBSD$");
28 
29 /**
30  * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
31  * ISO 28500:2009.
32  * For the purposes of this file we used the final draft from:
33  * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34  *
35  * Todo:
36  * [ ] real-world warcs can contain resources at endpoints ending in /
37  *     e.g. http://bibnum.bnf.fr/warc/
38  *     if you're lucky their response contains a Content-Location: header
39  *     pointing to a unix-compliant filename, in the example above it's
40  *     Content-Location: http://bibnum.bnf.fr/warc/index.html
41  *     however, that's not mandated and github for example doesn't follow
42  *     this convention.
43  *     We need a set of archive options to control what to do with
44  *     entries like these, at the moment care is taken to skip them.
45  *
46  **/
47 
48 #ifdef HAVE_SYS_STAT_H
49 #include <sys/stat.h>
50 #endif
51 #ifdef HAVE_ERRNO_H
52 #include <errno.h>
53 #endif
54 #ifdef HAVE_STDLIB_H
55 #include <stdlib.h>
56 #endif
57 #ifdef HAVE_STRING_H
58 #include <string.h>
59 #endif
60 #ifdef HAVE_LIMITS_H
61 #include <limits.h>
62 #endif
63 #ifdef HAVE_CTYPE_H
64 #include <ctype.h>
65 #endif
66 #ifdef HAVE_TIME_H
67 #include <time.h>
68 #endif
69 
70 #include "archive.h"
71 #include "archive_entry.h"
72 #include "archive_private.h"
73 #include "archive_read_private.h"
74 
/*
 * Record types, named after the values of the WARC-Type header.
 * Within this file only WT_RSRC and WT_RSP are ever produced by
 * _warc_rdtyp() and turned into archive entries by _warc_rdhdr();
 * every other record is reported as WT_NONE and skipped.
 */
typedef enum {
	WT_NONE,	/* no type header, or a type we do not recognise */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request */
	WT_RSP,		/* response */
	WT_RVIS,	/* revisit */
	WT_CONV,	/* conversion */
	WT_CONT,	/* continuation */
	LAST_WT		/* sentinel, not a valid type */
} warc_type_t;
96 
/*
 * Read-only view of a character range: LEN bytes starting at STR.
 * The range is not required to be NUL-terminated.
 */
typedef struct {
	size_t len;		/* number of bytes in the range */
	const char *str;	/* first byte, or NULL for "no string" */
} warc_string_t;
101 
/*
 * Writable counterpart of warc_string_t: a buffer of LEN bytes at STR.
 */
typedef struct {
	size_t len;	/* size of the buffer in bytes */
	char *str;	/* start of the buffer */
} warc_strbuf_t;
106 
107 struct warc_s {
108 	/* content length ahead */
109 	size_t cntlen;
110 	/* and how much we've processed so far */
111 	size_t cntoff;
112 	/* and how much we need to consume between calls */
113 	size_t unconsumed;
114 
115 	/* string pool */
116 	warc_strbuf_t pool;
117 	/* previous version */
118 	unsigned int pver;
119 	/* stringified format name */
120 	struct archive_string sver;
121 };
122 
123 static int _warc_bid(struct archive_read *a, int);
124 static int _warc_cleanup(struct archive_read *a);
125 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
126 static int _warc_skip(struct archive_read *a);
127 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
128 
129 /* private routines */
130 static unsigned int _warc_rdver(const char buf[10], size_t bsz);
131 static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
132 static warc_string_t _warc_rduri(const char *buf, size_t bsz);
133 static ssize_t _warc_rdlen(const char *buf, size_t bsz);
134 static time_t _warc_rdrtm(const char *buf, size_t bsz);
135 static time_t _warc_rdmtm(const char *buf, size_t bsz);
136 static const char *_warc_find_eoh(const char *buf, size_t bsz);
137 static const char *_warc_find_eol(const char *buf, size_t bsz);
138 
139 int
140 archive_read_support_format_warc(struct archive *_a)
141 {
142 	struct archive_read *a = (struct archive_read *)_a;
143 	struct warc_s *w;
144 	int r;
145 
146 	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147 	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
148 
149 	if ((w = calloc(1, sizeof(*w))) == NULL) {
150 		archive_set_error(&a->archive, ENOMEM,
151 		    "Can't allocate warc data");
152 		return (ARCHIVE_FATAL);
153 	}
154 
155 	r = __archive_read_register_format(
156 		a, w, "warc",
157 		_warc_bid, NULL, _warc_rdhdr, _warc_read,
158 		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
159 
160 	if (r != ARCHIVE_OK) {
161 		free(w);
162 		return (r);
163 	}
164 	return (ARCHIVE_OK);
165 }
166 
167 static int
168 _warc_cleanup(struct archive_read *a)
169 {
170 	struct warc_s *w = a->format->data;
171 
172 	if (w->pool.len > 0U) {
173 		free(w->pool.str);
174 	}
175 	archive_string_free(&w->sver);
176 	free(w);
177 	a->format->data = NULL;
178 	return (ARCHIVE_OK);
179 }
180 
/*
 * Bid on the input: look at the first 12 bytes and check that they
 * form a WARC version line for a version between 0.12 and 1.0.
 * Returns 64 if so, -1 otherwise (including when 12 bytes are not
 * available).
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int ver;

	(void)best_bid; /* UNUSED */

	/* the very first line of the file must already be a record */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		/* nothing to look at, or fewer bytes than our magic
		 * cookie needs */
		return -1;
	}

	/* version is encoded as major * 10000 + minor * 100 */
	ver = _warc_rdver(probe, avail);
	if (ver >= 1200U && ver <= 10000U) {
		/* WARC 0.12 to 1.0, be confident */
		return (64);
	}
	return -1;
}
209 
210 static int
211 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
212 {
213 #define HDR_PROBE_LEN		(12U)
214 	struct warc_s *w = a->format->data;
215 	unsigned int ver;
216 	const char *buf;
217 	ssize_t nrd;
218 	const char *eoh;
219 	/* for the file name, saves some strndup()'ing */
220 	warc_string_t fnam;
221 	/* warc record type, not that we really use it a lot */
222 	warc_type_t ftyp;
223 	/* content-length+error monad */
224 	ssize_t cntlen;
225 	/* record time is the WARC-Date time we reinterpret it as ctime */
226 	time_t rtime;
227 	/* mtime is the Last-Modified time which will be the entry's mtime */
228 	time_t mtime;
229 
230 start_over:
231 	/* just use read_ahead() they keep track of unconsumed
232 	 * bits and bobs for us; no need to put an extra shift in
233 	 * and reproduce that functionality here */
234 	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235 
236 	if (nrd < 0) {
237 		/* no good */
238 		archive_set_error(
239 			&a->archive, ARCHIVE_ERRNO_MISC,
240 			"Bad record header");
241 		return (ARCHIVE_FATAL);
242 	} else if (buf == NULL) {
243 		/* there should be room for at least WARC/bla\r\n
244 		 * must be EOF therefore */
245 		return (ARCHIVE_EOF);
246 	}
247  	/* looks good so far, try and find the end of the header now */
248 	eoh = _warc_find_eoh(buf, nrd);
249 	if (eoh == NULL) {
250 		/* still no good, the header end might be beyond the
251 		 * probe we've requested, but then again who'd cram
252 		 * so much stuff into the header *and* be 28500-compliant */
253 		archive_set_error(
254 			&a->archive, ARCHIVE_ERRNO_MISC,
255 			"Bad record header");
256 		return (ARCHIVE_FATAL);
257 	}
258 	ver = _warc_rdver(buf, eoh - buf);
259 	/* we currently support WARC 0.12 to 1.0 */
260 	if (ver == 0U) {
261 		archive_set_error(
262 			&a->archive, ARCHIVE_ERRNO_MISC,
263 			"Invalid record version");
264 		return (ARCHIVE_FATAL);
265 	} else if (ver < 1200U || ver > 10000U) {
266 		archive_set_error(
267 			&a->archive, ARCHIVE_ERRNO_MISC,
268 			"Unsupported record version: %u.%u",
269 			ver / 10000, (ver % 10000) / 100);
270 		return (ARCHIVE_FATAL);
271 	}
272 	cntlen = _warc_rdlen(buf, eoh - buf);
273 	if (cntlen < 0) {
274 		/* nightmare!  the specs say content-length is mandatory
275 		 * so I don't feel overly bad stopping the reader here */
276 		archive_set_error(
277 			&a->archive, EINVAL,
278 			"Bad content length");
279 		return (ARCHIVE_FATAL);
280 	}
281 	rtime = _warc_rdrtm(buf, eoh - buf);
282 	if (rtime == (time_t)-1) {
283 		/* record time is mandatory as per WARC/1.0,
284 		 * so just barf here, fast and loud */
285 		archive_set_error(
286 			&a->archive, EINVAL,
287 			"Bad record time");
288 		return (ARCHIVE_FATAL);
289 	}
290 
291 	/* let the world know we're a WARC archive */
292 	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293 	if (ver != w->pver) {
294 		/* stringify this entry's version */
295 		archive_string_sprintf(&w->sver,
296 			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297 		/* remember the version */
298 		w->pver = ver;
299 	}
300 	/* start off with the type */
301 	ftyp = _warc_rdtyp(buf, eoh - buf);
302 	/* and let future calls know about the content */
303 	w->cntlen = cntlen;
304 	w->cntoff = 0U;
305 	mtime = 0;/* Avoid compiling error on some platform. */
306 
307 	switch (ftyp) {
308 	case WT_RSRC:
309 	case WT_RSP:
310 		/* only try and read the filename in the cases that are
311 		 * guaranteed to have one */
312 		fnam = _warc_rduri(buf, eoh - buf);
313 		/* check the last character in the URI to avoid creating
314 		 * directory endpoints as files, see Todo above */
315 		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316 			/* break here for now */
317 			fnam.len = 0U;
318 			fnam.str = NULL;
319 			break;
320 		}
321 		/* bang to our string pool, so we save a
322 		 * malloc()+free() roundtrip */
323 		if (fnam.len + 1U > w->pool.len) {
324 			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325 			w->pool.str = realloc(w->pool.str, w->pool.len);
326 		}
327 		memcpy(w->pool.str, fnam.str, fnam.len);
328 		w->pool.str[fnam.len] = '\0';
329 		/* let no one else know about the pool, it's a secret, shhh */
330 		fnam.str = w->pool.str;
331 
332 		/* snarf mtime or deduce from rtime
333 		 * this is a custom header added by our writer, it's quite
334 		 * hard to believe anyone else would go through with it
335 		 * (apart from being part of some http responses of course) */
336 		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
337 			mtime = rtime;
338 		}
339 		break;
340 	default:
341 		fnam.len = 0U;
342 		fnam.str = NULL;
343 		break;
344 	}
345 
346 	/* now eat some of those delicious buffer bits */
347 	__archive_read_consume(a, eoh - buf);
348 
349 	switch (ftyp) {
350 	case WT_RSRC:
351 	case WT_RSP:
352 		if (fnam.len > 0U) {
353 			/* populate entry object */
354 			archive_entry_set_filetype(entry, AE_IFREG);
355 			archive_entry_copy_pathname(entry, fnam.str);
356 			archive_entry_set_size(entry, cntlen);
357 			archive_entry_set_perm(entry, 0644);
358 			/* rtime is the new ctime, mtime stays mtime */
359 			archive_entry_set_ctime(entry, rtime, 0L);
360 			archive_entry_set_mtime(entry, mtime, 0L);
361 			break;
362 		}
363 		/* FALLTHROUGH */
364 	default:
365 		/* consume the content and start over */
366 		_warc_skip(a);
367 		goto start_over;
368 	}
369 	return (ARCHIVE_OK);
370 }
371 
372 static int
373 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
374 {
375 	struct warc_s *w = a->format->data;
376 	const char *rab;
377 	ssize_t nrd;
378 
379 	if (w->cntoff >= w->cntlen) {
380 	eof:
381 		/* it's our lucky day, no work, we can leave early */
382 		*buf = NULL;
383 		*bsz = 0U;
384 		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
385 		w->unconsumed = 0U;
386 		return (ARCHIVE_EOF);
387 	}
388 
389 	if (w->unconsumed) {
390 		__archive_read_consume(a, w->unconsumed);
391 		w->unconsumed = 0U;
392 	}
393 
394 	rab = __archive_read_ahead(a, 1U, &nrd);
395 	if (nrd < 0) {
396 		*bsz = 0U;
397 		/* big catastrophe */
398 		return (int)nrd;
399 	} else if (nrd == 0) {
400 		goto eof;
401 	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
402 		/* clamp to content-length */
403 		nrd = w->cntlen - w->cntoff;
404 	}
405 	*off = w->cntoff;
406 	*bsz = nrd;
407 	*buf = rab;
408 
409 	w->cntoff += nrd;
410 	w->unconsumed = (size_t)nrd;
411 	return (ARCHIVE_OK);
412 }
413 
414 static int
415 _warc_skip(struct archive_read *a)
416 {
417 	struct warc_s *w = a->format->data;
418 
419 	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
420 	w->cntlen = 0U;
421 	w->cntoff = 0U;
422 	return (ARCHIVE_OK);
423 }
424 
425 
426 /* private routines */
/*
 * Strip the const qualifier from C without an explicit cast (which
 * would trigger cast-qual warnings).
 *
 * The conversion goes through a union of the qualified and the
 * unqualified pointer type; both share one representation, so this is
 * well defined, unlike arithmetic relative to a made-up address.
 * The caller must still not write through the result if the object
 * itself is const.
 */
static void*
deconst(const void *c)
{
	union {
		const void *cp;
		void *p;
	} conv;

	conv.cp = c;
	return conv.p;
}
432 
/*
 * Find the first occurrence of NEEDLE (NEEDLESIZE bytes) within HAY
 * (HAYSIZE bytes); neither has to be NUL-terminated.
 * Returns a pointer into HAY, or NULL if there is no match.  An empty
 * needle matches at the very start of HAY.
 *
 * A XOR checksum over a sliding window is used to weed out
 * candidates before they are compared byte by byte.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *first;
	const char *h;
	const char *n;
	const char *win;
	unsigned int hsum;
	unsigned int nsum;
	unsigned int same;

	if (needlesize == 0UL) {
		/* by definition found anywhere, so report the start */
		return deconst(hay);
	}

	/* jump to the first byte that could start a match at all */
	first = memchr(hay, *needle, haysize);
	if (first == NULL) {
		return NULL;
	}

	/* FIRST and NEEDLE agree in their first byte.  Walk both in
	 * lockstep, summing up the needle and the first window of the
	 * haystack and noting whether they are equal throughout. */
	hsum = *first;
	nsum = *first;
	same = 1U;
	h = first + 1U;
	n = needle + 1U;
	while (h < hay_end && n < needle_end) {
		hsum ^= *h;
		nsum ^= *n;
		same &= *h == *n;
		h++;
		n++;
	}

	if (n < needle_end) {
		/* ran out of haystack before the needle was through */
		return NULL;
	}
	if (same) {
		/* the very first window is a match */
		return deconst(first);
	}

	/* H is the byte right behind the first window; slide the window
	 * one byte at a time, dropping its first byte from the sum and
	 * adding the new last one */
	win = first;
	while (h < hay_end) {
		hsum ^= *win++;
		hsum ^= *h;

		/* with equal sums it suffices to compare all but the
		 * last byte; WIN stays below H so no range checks */
		if (hsum == nsum &&
		    memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		h++;
	}
	return NULL;
}
490 
/*
 * Parse an unsigned decimal number at STR that has to lie within
 * [LLIM, ULIM].  *EP is set to the first character not consumed.
 *
 * No more digits are read than ULIM has (two at least), and reading
 * stops early once another digit would certainly exceed ULIM.
 * Returns the number, -1 if STR does not start with a digit, or -2 if
 * the number is out of range.
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	int acc = 0;
	/* shrinks by one decimal place per digit read and thereby
	 * limits the number of digits */
	int budget = (ulim > 10) ? ulim : 10;

	while (acc * 10 <= ulim && budget != 0 &&
	       *cur >= '0' && *cur <= '9') {
		acc = acc * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}
	*ep = cur;

	if (cur == str) {
		/* not a single digit */
		return -1;
	}
	if (acc < llim || acc > ulim) {
		return -2;
	}
	return acc;
}
513 
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses timegm() or _mkgmtime64() where the build configuration says
 * they exist.  Otherwise mktime() is called merely to have tm_yday
 * filled in (note that it normalises *T as a side effect) and the
 * result is computed by hand.  Returns (time_t)-1 if mktime() fails.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t before it is scaled: in
	 * plain int the year term overflows for dates past 2038. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
538 
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ, with leading blanks and tabs skipped.
 *
 * Returns the time stamp, or (time_t)-1 if S is not of that form.
 * If ENDPTR is not NULL, *ENDPTR is set to where parsing stopped; on
 * success that is the character right behind the 'Z'.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	/* the fields in order of appearance: permitted range and the
	 * character that must follow */
	static const struct {
		int lo;
		int hi;
		char delim;
	} field[] = {
		{ 1583, 4095, '-' },	/* year */
		{ 1, 12, '-' },		/* month */
		{ 1, 31, 'T' },		/* day of month */
		{ 0, 23, ':' },		/* hour */
		{ 0, 59, ':' },		/* minute */
		{ 0, 60, 'Z' }		/* second */
	};
	int val[sizeof(field) / sizeof(field[0])];
	struct tm tm;
	time_t res = (time_t)-1;
	size_t i;

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	for (; *s == ' ' || *s == '\t'; s++)
		;

	for (i = 0U; i < sizeof(field) / sizeof(field[0]); i++) {
		val[i] = strtoi_lim(s, &s, field[i].lo, field[i].hi);
		if (val[i] < 0) {
			/* not a number or out of range */
			goto out;
		}
		if (*s++ != field[i].delim) {
			goto out;
		}
	}

	/* start from a clean tm, then fill in what we've read;
	 * struct tm counts years from 1900 and months from 0 */
	memset(&tm, 0, sizeof(tm));
	tm.tm_year = val[0] - 1900;
	tm.tm_mon = val[1] - 1;
	tm.tm_mday = val[2];
	tm.tm_hour = val[3];
	tm.tm_min = val[4];
	tm.tm_sec = val[5];

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
592 
/*
 * Parse the "WARC/<major>.<minor>" version line at the start of BUF.
 *
 * The major version is a single digit, the minor version one or two
 * digits.  Returns major * 10000 + minor * 100 (so 1.0 yields 10000
 * and 0.12 yields 1200), or 0 if BSZ is less than 12, the magic does
 * not match, or the version is not properly terminated.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t mlen = sizeof(magic) - 1U;
	const char *v;
	const char *term;
	unsigned int minor;
	unsigned int ver;

	if (bsz < 12) {
		/* buffer too small */
		return 0U;
	}
	if (memcmp(buf, magic, mlen) != 0) {
		/* invalid magic */
		return 0U;
	}

	/* V points at the major version digit */
	v = buf + mlen;
	if (!isdigit((unsigned char)v[0U]) || v[1U] != '.' ||
	    !isdigit((unsigned char)v[2U])) {
		return 0U;
	}

	/* we support a maximum of 2 digits in the minor version */
	minor = (unsigned int)(v[2U] - '0');
	term = v + 3U;
	if (isdigit((unsigned char)v[3U])) {
		minor = minor * 10U + (unsigned int)(v[3U] - '0');
		term = v + 4U;
	}
	ver = (unsigned int)(v[0U] - '0') * 10000U + minor * 100U;

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (ver >= 1200U) {
		if (memcmp(term, "\r\n", 2U) != 0) {
			return 0U;
		}
	} else if (*term != ' ' && *term != '\t') {
		return 0U;
	}
	return ver;
}
637 
638 static unsigned int
639 _warc_rdtyp(const char *buf, size_t bsz)
640 {
641 	static const char _key[] = "\r\nWARC-Type:";
642 	const char *val, *eol;
643 
644 	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
645 		/* no bother */
646 		return WT_NONE;
647 	}
648 	val += sizeof(_key) - 1U;
649 	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
650 		/* no end of line */
651 		return WT_NONE;
652 	}
653 
654 	/* overread whitespace */
655 	while (val < eol && (*val == ' ' || *val == '\t'))
656 		++val;
657 
658 	if (val + 8U == eol) {
659 		if (memcmp(val, "resource", 8U) == 0)
660 			return WT_RSRC;
661 		else if (memcmp(val, "response", 8U) == 0)
662 			return WT_RSP;
663 	}
664 	return WT_NONE;
665 }
666 
667 static warc_string_t
668 _warc_rduri(const char *buf, size_t bsz)
669 {
670 	static const char _key[] = "\r\nWARC-Target-URI:";
671 	const char *val, *uri, *eol, *p;
672 	warc_string_t res = {0U, NULL};
673 
674 	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
675 		/* no bother */
676 		return res;
677 	}
678 	/* overread whitespace */
679 	val += sizeof(_key) - 1U;
680 	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
681 		/* no end of line */
682 		return res;
683 	}
684 
685 	while (val < eol && (*val == ' ' || *val == '\t'))
686 		++val;
687 
688 	/* overread URL designators */
689 	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
690 		/* not touching that! */
691 		return res;
692 	}
693 
694 	/* spaces inside uri are not allowed, CRLF should follow */
695 	for (p = val; p < eol; p++) {
696 		if (isspace((unsigned char)*p))
697 			return res;
698 	}
699 
700 	/* there must be at least space for ftp */
701 	if (uri < (val + 3U))
702 		return res;
703 
704 	/* move uri to point to after :// */
705 	uri += 3U;
706 
707 	/* now then, inspect the URI */
708 	if (memcmp(val, "file", 4U) == 0) {
709 		/* perfect, nothing left to do here */
710 
711 	} else if (memcmp(val, "http", 4U) == 0 ||
712 		   memcmp(val, "ftp", 3U) == 0) {
713 		/* overread domain, and the first / */
714 		while (uri < eol && *uri++ != '/');
715 	} else {
716 		/* not sure what to do? best to bugger off */
717 		return res;
718 	}
719 	res.str = uri;
720 	res.len = eol - uri;
721 	return res;
722 }
723 
/*
 * Read the value of the Content-Length header within the BSZ bytes
 * at BUF.
 * Returns the length, or -1 if the header is missing, unterminated,
 * does not start with a digit, is followed by anything but the end
 * of the line, or does not fit a long.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const size_t klen = sizeof(_key) - 1U;
	const char *hit;
	const char *val;
	const char *eol;
	char *stop = NULL;
	long int len;

	hit = xmemmem(buf, bsz, _key, klen);
	if (hit == NULL) {
		/* no such header */
		return -1;
	}
	val = hit + klen;
	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; val < eol && (*val == ' ' || *val == '\t'); val++)
		;

	/* insist on a digit, this rules out signs and, because EOL
	 * points at a '\r', an empty value too */
	if (!isdigit((unsigned char)*val)) {
		return -1;
	}

	errno = 0;
	len = strtol(val, &stop, 10);
	if (errno != 0) {
		return -1;
	}
	if (stop != eol) {
		/* line must end here */
		return -1;
	}
	return (ssize_t)len;
}
757 
/*
 * Read the record time from the WARC-Date header within the BSZ
 * bytes at BUF.
 * Returns the time stamp, or (time_t)-1 if the header is missing,
 * unterminated, or not an ISO 8601 Zulu time that extends right up
 * to the end of the line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const size_t klen = sizeof(_key) - 1U;
	const char *hit;
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t when;

	hit = xmemmem(buf, bsz, _key, klen);
	if (hit == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	val = hit + klen;
	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that;
	 * the line must end where it stopped parsing */
	when = xstrpisotime(val, &stop);
	return (stop == eol) ? when : (time_t)-1;
}
784 
/*
 * Read the modification time from the Last-Modified header within
 * the BSZ bytes at BUF.
 * Returns the time stamp, or (time_t)-1 if the header is missing,
 * unterminated, or not an ISO 8601 Zulu time that extends right up
 * to the end of the line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const size_t klen = sizeof(_key) - 1U;
	const char *hit;
	const char *val;
	const char *eol;
	char *stop = NULL;
	time_t when;

	hit = xmemmem(buf, bsz, _key, klen);
	if (hit == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	val = hit + klen;
	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that;
	 * the line must end where it stopped parsing */
	when = xstrpisotime(val, &stop);
	return (stop == eol) ? when : (time_t)-1;
}
811 
/*
 * Find the end of the record header within the BSZ bytes at BUF.
 * Returns a pointer right behind the first empty line (\r\n\r\n),
 * that is to the first byte of the content, or NULL if there is no
 * empty line.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *hit = xmemmem(buf, bsz, _marker, mlen);

	return (hit == NULL) ? NULL : hit + mlen;
}
823 
/*
 * Find the end of the current line within the BSZ bytes at BUF.
 * Returns a pointer to the first \r\n sequence (to the '\r', that
 * is), or NULL if there is none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
832 /* archive_read_support_format_warc.c ends here */
833