1 /*-
2  * Copyright (c) 2014 Sebastian Freundt
3  * Author: Sebastian Freundt  <devel@fresse.org>
4  *
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "archive_platform.h"
29 __FBSDID("$FreeBSD$");
30 
31 #ifdef HAVE_ERRNO_H
32 #include <errno.h>
33 #endif
34 #include <stdio.h>
35 #ifdef HAVE_STDLIB_H
36 #include <stdlib.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_TIME_H
42 #include <time.h>
43 #endif
44 
45 #include "archive.h"
46 #include "archive_entry.h"
47 #include "archive_entry_locale.h"
48 #include "archive_private.h"
49 #include "archive_random_private.h"
50 #include "archive_write_private.h"
51 #include "archive_write_set_format_private.h"
52 
/* Per-archive state kept by the WARC writer in a->format_data. */
struct warc_s {
	/* set once the file-wide warcinfo record must not (or no longer)
	 * be emitted */
	unsigned int omit_warcinfo : 1;

	/* wall-clock time sampled when the format was selected */
	time_t now;
	/* file type of the entry being written, 0 between entries */
	mode_t typ;
	/* seeded from `now'; NOTE(review): not read by the code in this
	 * file as far as visible -- confirm before removing */
	unsigned int rng;
	/* payload size announced for the current entry */
	uint64_t populz;
};
62 
63 static const char warcinfo[] =
64     "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
65     "format: WARC file version 1.0\r\n";
66 
/* WARC record types.  The enumerator values double as indices into the
 * table of type names, so their order must not change. */
typedef enum {
	WT_NONE = 0,
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type; also the number of entries above */
} warc_type_t;
88 
89 typedef struct {
90 	warc_type_t type;
91 	const char *tgturi;
92 	const char *recid;
93 	time_t rtime;
94 	time_t mtime;
95 	const char *cnttyp;
96 	uint64_t cntlen;
97 } warc_essential_hdr_t;
98 
/* Raw UUID material as four words; the formatting code prints each word
 * as 8 hex digits, i.e. it treats them as 32-bit quantities. */
typedef struct {
	unsigned int u[4];
} warc_uuid_t;
102 
103 static int _warc_options(struct archive_write*, const char *key, const char *v);
104 static int _warc_header(struct archive_write *a, struct archive_entry *entry);
105 static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
106 static int _warc_finish_entry(struct archive_write *a);
107 static int _warc_close(struct archive_write *a);
108 static int _warc_free(struct archive_write *a);
109 
110 /* private routines */
111 static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
112 static int _gen_uuid(warc_uuid_t *tgt);
113 
114 
115 /*
116  * Set output format to ISO 28500 (aka WARC) format.
117  */
118 int
119 archive_write_set_format_warc(struct archive *_a)
120 {
121 	struct archive_write *a = (struct archive_write *)_a;
122 	struct warc_s *w;
123 
124 	archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
125 	    ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
126 
127 	/* If another format was already registered, unregister it. */
128 	if (a->format_free != NULL) {
129 		(a->format_free)(a);
130 	}
131 
132 	w = malloc(sizeof(*w));
133 	if (w == NULL) {
134 		archive_set_error(&a->archive, ENOMEM,
135 		    "Can't allocate warc data");
136 		return (ARCHIVE_FATAL);
137 	}
138 	/* by default we're emitting a file wide header */
139 	w->omit_warcinfo = 0U;
140 	/* obtain current time for date fields */
141 	w->now = time(NULL);
142 	/* reset file type info */
143 	w->typ = 0;
144 	/* also initialise our rng */
145 	w->rng = (unsigned int)w->now;
146 
147 	a->format_data = w;
148 	a->format_name = "WARC/1.0";
149 	a->format_options = _warc_options;
150 	a->format_write_header = _warc_header;
151 	a->format_write_data = _warc_data;
152 	a->format_close = _warc_close;
153 	a->format_free = _warc_free;
154 	a->format_finish_entry = _warc_finish_entry;
155 	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
156 	a->archive.archive_format_name = "WARC/1.0";
157 	return (ARCHIVE_OK);
158 }
159 
160 
161 /* archive methods */
162 static int
163 _warc_options(struct archive_write *a, const char *key, const char *val)
164 {
165 	struct warc_s *w = a->format_data;
166 
167 	if (strcmp(key, "omit-warcinfo") == 0) {
168 		if (val == NULL || strcmp(val, "true") == 0) {
169 			/* great */
170 			w->omit_warcinfo = 1U;
171 			return (ARCHIVE_OK);
172 		}
173 	}
174 
175 	/* Note: The "warn" return is just to inform the options
176 	 * supervisor that we didn't handle it.  It will generate
177 	 * a suitable error if no one used this option. */
178 	return (ARCHIVE_WARN);
179 }
180 
181 static int
182 _warc_header(struct archive_write *a, struct archive_entry *entry)
183 {
184 	struct warc_s *w = a->format_data;
185 	struct archive_string hdr;
186 #define MAX_HDR_SIZE 512
187 
188 	/* check whether warcinfo record needs outputting */
189 	if (!w->omit_warcinfo) {
190 		ssize_t r;
191 		warc_essential_hdr_t wi = {
192 			WT_INFO,
193 			/*uri*/NULL,
194 			/*urn*/NULL,
195 			/*rtm*/0,
196 			/*mtm*/0,
197 			/*cty*/"application/warc-fields",
198 			/*len*/sizeof(warcinfo) - 1U,
199 		};
200 		wi.rtime = w->now;
201 		wi.mtime = w->now;
202 
203 		archive_string_init(&hdr);
204 		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
205 		if (r >= 0) {
206 			/* jackpot! */
207 			/* now also use HDR buffer for the actual warcinfo */
208 			archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
209 
210 			/* append end-of-record indicator */
211 			archive_strncat(&hdr, "\r\n\r\n", 4);
212 
213 			/* write to output stream */
214 			__archive_write_output(a, hdr.s, archive_strlen(&hdr));
215 		}
216 		/* indicate we're done with file header writing */
217 		w->omit_warcinfo = 1U;
218 		archive_string_free(&hdr);
219 	}
220 
221 	if (archive_entry_pathname(entry) == NULL) {
222 		archive_set_error(&a->archive, EINVAL,
223 		    "Invalid filename");
224 		return (ARCHIVE_WARN);
225 	}
226 
227 	w->typ = archive_entry_filetype(entry);
228 	w->populz = 0U;
229 	if (w->typ == AE_IFREG) {
230 		warc_essential_hdr_t rh = {
231 			WT_RSRC,
232 			/*uri*/NULL,
233 			/*urn*/NULL,
234 			/*rtm*/0,
235 			/*mtm*/0,
236 			/*cty*/NULL,
237 			/*len*/0,
238 		};
239 		ssize_t r;
240 		rh.tgturi = archive_entry_pathname(entry);
241 		rh.rtime = w->now;
242 		rh.mtime = archive_entry_mtime(entry);
243 		rh.cntlen = (size_t)archive_entry_size(entry);
244 
245 		archive_string_init(&hdr);
246 		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
247 		if (r < 0) {
248 			/* don't bother */
249 			archive_set_error(
250 				&a->archive,
251 				ARCHIVE_ERRNO_FILE_FORMAT,
252 				"cannot archive file");
253 			return (ARCHIVE_WARN);
254 		}
255 		/* otherwise append to output stream */
256 		__archive_write_output(a, hdr.s, r);
257 		/* and let subsequent calls to _data() know about the size */
258 		w->populz = rh.cntlen;
259 		archive_string_free(&hdr);
260 		return (ARCHIVE_OK);
261 	}
262 	/* just resort to erroring as per Tim's advice */
263 	__archive_write_entry_filetype_unsupported(
264 	    &a->archive, entry, "WARC");
265 	return (ARCHIVE_FAILED);
266 }
267 
268 static ssize_t
269 _warc_data(struct archive_write *a, const void *buf, size_t len)
270 {
271 	struct warc_s *w = a->format_data;
272 
273 	if (w->typ == AE_IFREG) {
274 		int rc;
275 
276 		/* never write more bytes than announced */
277 		if (len > w->populz) {
278 			len = (size_t)w->populz;
279 		}
280 
281 		/* now then, out we put the whole shebang */
282 		rc = __archive_write_output(a, buf, len);
283 		if (rc != ARCHIVE_OK) {
284 			return rc;
285 		}
286 	}
287 	return len;
288 }
289 
290 static int
291 _warc_finish_entry(struct archive_write *a)
292 {
293 	static const char _eor[] = "\r\n\r\n";
294 	struct warc_s *w = a->format_data;
295 
296 	if (w->typ == AE_IFREG) {
297 		int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
298 
299 		if (rc != ARCHIVE_OK) {
300 			return rc;
301 		}
302 	}
303 	/* reset type info */
304 	w->typ = 0;
305 	return (ARCHIVE_OK);
306 }
307 
308 static int
309 _warc_close(struct archive_write *a)
310 {
311 	(void)a; /* UNUSED */
312 	return (ARCHIVE_OK);
313 }
314 
315 static int
316 _warc_free(struct archive_write *a)
317 {
318 	struct warc_s *w = a->format_data;
319 
320 	free(w);
321 	a->format_data = NULL;
322 	return (ARCHIVE_OK);
323 }
324 
325 
326 /* private routines */
/*
 * Like strftime(3) but for time_t objects: break T down as UTC, format
 * it according to FMT and append the result to AS.
 *
 * AS is left untouched when T cannot be converted, and nothing is
 * appended when strftime() reports 0, e.g. because the result does not
 * fit the internal buffer.
 */
static void
xstrftime(struct archive_string *as, const char *fmt, time_t t)
{
	struct tm *rt;
#if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
	struct tm timeHere;
#endif
#if !defined(HAVE_GMTIME_R) && defined(HAVE__GMTIME64_S)
	__time64_t tmptime;
#endif
	char strtime[100];
	size_t len;

#ifdef HAVE_GMTIME_R
	rt = gmtime_r(&t, &timeHere);
#elif defined(HAVE__GMTIME64_S)
	tmptime = t;
	/* _gmtime64_s() reports failure through a non-zero return value */
	rt = (_gmtime64_s(&timeHere, &tmptime) != 0) ? NULL : &timeHere;
#else
	rt = gmtime(&t);
#endif
	/* one check for all variants, strftime() must never see NULL */
	if (rt == NULL) {
		return;
	}
	/* leave the hard yacker to our role model strftime() */
	len = strftime(strtime, sizeof(strtime)-1, fmt, rt);
	archive_strncat(as, strtime, len);
}
360 
361 static ssize_t
362 _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
363 {
364 	static const char _ver[] = "WARC/1.0\r\n";
365 	static const char * const _typ[LAST_WT] = {
366 		NULL, "warcinfo", "metadata", "resource", NULL
367 	};
368 	char std_uuid[48U];
369 
370 	if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
371 		/* brilliant, how exactly did we get here? */
372 		return -1;
373 	}
374 
375 	archive_strcpy(tgt, _ver);
376 
377 	archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
378 
379 	if (hdr.tgturi != NULL) {
380 		/* check if there's a xyz:// */
381 		static const char _uri[] = "";
382 		static const char _fil[] = "file://";
383 		const char *u;
384 		char *chk = strchr(hdr.tgturi, ':');
385 
386 		if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
387 			/* yep, it's definitely a URI */
388 			u = _uri;
389 		} else {
390 			/* hm, best to prepend file:// then */
391 			u = _fil;
392 		}
393 		archive_string_sprintf(tgt,
394 			"WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
395 	}
396 
397 	/* record time is usually when the http is sent off,
398 	 * just treat the archive writing as such for a moment */
399 	xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
400 
401 	/* while we're at it, record the mtime */
402 	xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
403 
404 	if (hdr.recid == NULL) {
405 		/* generate one, grrrr */
406 		warc_uuid_t u;
407 
408 		_gen_uuid(&u);
409 		/* Unfortunately, archive_string_sprintf does not
410 		 * handle the minimum number following '%'.
411 		 * So we have to use snprintf function here instead
412 		 * of archive_string_snprintf function. */
413 #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
414 #define snprintf _snprintf
415 #endif
416 		snprintf(
417 			std_uuid, sizeof(std_uuid),
418 			"<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
419 			u.u[0U],
420 			u.u[1U] >> 16U, u.u[1U] & 0xffffU,
421 			u.u[2U] >> 16U, u.u[2U] & 0xffffU,
422 			u.u[3U]);
423 		hdr.recid = std_uuid;
424 	}
425 
426 	/* record-id is mandatory, fingers crossed we won't fail */
427 	archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
428 
429 	if (hdr.cnttyp != NULL) {
430 		archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
431 	}
432 
433 	/* next one is mandatory */
434 	archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
435 	/**/
436 	archive_strncat(tgt, "\r\n", 2);
437 
438 	return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
439 }
440 
441 static int
442 _gen_uuid(warc_uuid_t *tgt)
443 {
444 	archive_random(tgt->u, sizeof(tgt->u));
445 	/* obey uuid version 4 rules */
446 	tgt->u[1U] &= 0xffff0fffU;
447 	tgt->u[1U] |= 0x4000U;
448 	tgt->u[2U] &= 0x3fffffffU;
449 	tgt->u[2U] |= 0x80000000U;
450 	return 0;
451 }
452 
453 /* archive_write_set_format_warc.c ends here */
454