1 /*-
2  * Copyright (c) 2014 Sebastian Freundt
3  * Author: Sebastian Freundt  <devel@fresse.org>
4  *
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "archive_platform.h"
29 
30 #ifdef HAVE_ERRNO_H
31 #include <errno.h>
32 #endif
33 #include <stdio.h>
34 #ifdef HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #ifdef HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #ifdef HAVE_TIME_H
41 #include <time.h>
42 #endif
43 
44 #include "archive.h"
45 #include "archive_entry.h"
46 #include "archive_entry_locale.h"
47 #include "archive_private.h"
48 #include "archive_random_private.h"
49 #include "archive_write_private.h"
50 #include "archive_write_set_format_private.h"
51 
/*
 * Per-archive state of the WARC writer; installed as the archive's
 * format_data by archive_write_set_format_warc().
 */
struct warc_s {
	/* when set, no warcinfo record is emitted; _warc_header() also
	 * sets it after handling the warcinfo record so that the record
	 * is output at most once */
	unsigned int omit_warcinfo:1;

	/* time(NULL) taken at format setup, used for the date fields */
	time_t now;
	/* file type of the entry being written, 0 when there is none */
	mode_t typ;
	/* seeded from `now' at format setup */
	unsigned int rng;
	/* content length announced in the current entry's header,
	 * _warc_data() never writes more than this in one call */
	uint64_t populz;
};
61 
/* Payload of the warcinfo record that _warc_header() writes ahead of
 * the first entry unless omit_warcinfo is set. */
static const char warcinfo[] =
    "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
    "format: WARC file version 1.0\r\n";
65 
/*
 * WARC record types.  _popul_ehdr() only accepts WT_INFO, WT_META and
 * WT_RSRC; WT_NONE and everything past WT_RSRC is rejected there.
 */
typedef enum {
	WT_NONE,
	/* warcinfo */
	WT_INFO,
	/* metadata */
	WT_META,
	/* resource */
	WT_RSRC,
	/* request, unsupported */
	WT_REQ,
	/* response, unsupported */
	WT_RSP,
	/* revisit, unsupported */
	WT_RVIS,
	/* conversion, unsupported */
	WT_CONV,
	/* continuation, unsupported at the moment */
	WT_CONT,
	/* invalid type, also used as the size of the type name table */
	LAST_WT
} warc_type_t;
87 
/*
 * The fields _popul_ehdr() turns into a WARC record header.
 */
typedef struct {
	/* record type, emitted as WARC-Type */
	warc_type_t type;
	/* emitted as WARC-Target-URI, left out when NULL; "file://" is
	 * prepended unless the string already contains "://" */
	const char *tgturi;
	/* emitted as WARC-Record-ID; a urn:uuid is generated when NULL */
	const char *recid;
	/* emitted as WARC-Date */
	time_t rtime;
	/* emitted as Last-Modified */
	time_t mtime;
	/* emitted as Content-Type, left out when NULL */
	const char *cnttyp;
	/* emitted as Content-Length */
	uint64_t cntlen;
} warc_essential_hdr_t;
97 
/* Storage for a UUID as four words; _gen_uuid() fills it and
 * _popul_ehdr() formats it into the record id. */
typedef struct {
	unsigned int u[4U];
} warc_uuid_t;
101 
/* archive methods, installed as format callbacks by
 * archive_write_set_format_warc() */
static int _warc_options(struct archive_write*, const char *key, const char *v);
static int _warc_header(struct archive_write *a, struct archive_entry *entry);
static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
static int _warc_finish_entry(struct archive_write *a);
static int _warc_close(struct archive_write *a);
static int _warc_free(struct archive_write *a);

/* private routines */
static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
static int _gen_uuid(warc_uuid_t *tgt);
112 
113 
114 /*
115  * Set output format to ISO 28500 (aka WARC) format.
116  */
117 int
118 archive_write_set_format_warc(struct archive *_a)
119 {
120 	struct archive_write *a = (struct archive_write *)_a;
121 	struct warc_s *w;
122 
123 	archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
124 	    ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
125 
126 	/* If another format was already registered, unregister it. */
127 	if (a->format_free != NULL) {
128 		(a->format_free)(a);
129 	}
130 
131 	w = malloc(sizeof(*w));
132 	if (w == NULL) {
133 		archive_set_error(&a->archive, ENOMEM,
134 		    "Can't allocate warc data");
135 		return (ARCHIVE_FATAL);
136 	}
137 	/* by default we're emitting a file wide header */
138 	w->omit_warcinfo = 0U;
139 	/* obtain current time for date fields */
140 	w->now = time(NULL);
141 	/* reset file type info */
142 	w->typ = 0;
143 	/* also initialise our rng */
144 	w->rng = (unsigned int)w->now;
145 
146 	a->format_data = w;
147 	a->format_name = "WARC/1.0";
148 	a->format_options = _warc_options;
149 	a->format_write_header = _warc_header;
150 	a->format_write_data = _warc_data;
151 	a->format_close = _warc_close;
152 	a->format_free = _warc_free;
153 	a->format_finish_entry = _warc_finish_entry;
154 	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
155 	a->archive.archive_format_name = "WARC/1.0";
156 	return (ARCHIVE_OK);
157 }
158 
159 
160 /* archive methods */
161 static int
162 _warc_options(struct archive_write *a, const char *key, const char *val)
163 {
164 	struct warc_s *w = a->format_data;
165 
166 	if (strcmp(key, "omit-warcinfo") == 0) {
167 		if (val == NULL || strcmp(val, "true") == 0) {
168 			/* great */
169 			w->omit_warcinfo = 1U;
170 			return (ARCHIVE_OK);
171 		}
172 	}
173 
174 	/* Note: The "warn" return is just to inform the options
175 	 * supervisor that we didn't handle it.  It will generate
176 	 * a suitable error if no one used this option. */
177 	return (ARCHIVE_WARN);
178 }
179 
180 static int
181 _warc_header(struct archive_write *a, struct archive_entry *entry)
182 {
183 	struct warc_s *w = a->format_data;
184 	struct archive_string hdr;
185 #define MAX_HDR_SIZE 512
186 
187 	/* check whether warcinfo record needs outputting */
188 	if (!w->omit_warcinfo) {
189 		ssize_t r;
190 		warc_essential_hdr_t wi = {
191 			WT_INFO,
192 			/*uri*/NULL,
193 			/*urn*/NULL,
194 			/*rtm*/0,
195 			/*mtm*/0,
196 			/*cty*/"application/warc-fields",
197 			/*len*/sizeof(warcinfo) - 1U,
198 		};
199 		wi.rtime = w->now;
200 		wi.mtime = w->now;
201 
202 		archive_string_init(&hdr);
203 		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
204 		if (r >= 0) {
205 			/* jackpot! */
206 			/* now also use HDR buffer for the actual warcinfo */
207 			archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
208 
209 			/* append end-of-record indicator */
210 			archive_strncat(&hdr, "\r\n\r\n", 4);
211 
212 			/* write to output stream */
213 			__archive_write_output(a, hdr.s, archive_strlen(&hdr));
214 		}
215 		/* indicate we're done with file header writing */
216 		w->omit_warcinfo = 1U;
217 		archive_string_free(&hdr);
218 	}
219 
220 	if (archive_entry_pathname(entry) == NULL) {
221 		archive_set_error(&a->archive, EINVAL,
222 		    "Invalid filename");
223 		return (ARCHIVE_WARN);
224 	}
225 
226 	w->typ = archive_entry_filetype(entry);
227 	w->populz = 0U;
228 	if (w->typ == AE_IFREG) {
229 		warc_essential_hdr_t rh = {
230 			WT_RSRC,
231 			/*uri*/NULL,
232 			/*urn*/NULL,
233 			/*rtm*/0,
234 			/*mtm*/0,
235 			/*cty*/NULL,
236 			/*len*/0,
237 		};
238 		ssize_t r;
239 		rh.tgturi = archive_entry_pathname(entry);
240 		rh.rtime = w->now;
241 		rh.mtime = archive_entry_mtime(entry);
242 		rh.cntlen = (size_t)archive_entry_size(entry);
243 
244 		archive_string_init(&hdr);
245 		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
246 		if (r < 0) {
247 			/* don't bother */
248 			archive_set_error(
249 				&a->archive,
250 				ARCHIVE_ERRNO_FILE_FORMAT,
251 				"cannot archive file");
252 			return (ARCHIVE_WARN);
253 		}
254 		/* otherwise append to output stream */
255 		__archive_write_output(a, hdr.s, r);
256 		/* and let subsequent calls to _data() know about the size */
257 		w->populz = rh.cntlen;
258 		archive_string_free(&hdr);
259 		return (ARCHIVE_OK);
260 	}
261 	/* just resort to erroring as per Tim's advice */
262 	__archive_write_entry_filetype_unsupported(
263 	    &a->archive, entry, "WARC");
264 	return (ARCHIVE_FAILED);
265 }
266 
267 static ssize_t
268 _warc_data(struct archive_write *a, const void *buf, size_t len)
269 {
270 	struct warc_s *w = a->format_data;
271 
272 	if (w->typ == AE_IFREG) {
273 		int rc;
274 
275 		/* never write more bytes than announced */
276 		if (len > w->populz) {
277 			len = (size_t)w->populz;
278 		}
279 
280 		/* now then, out we put the whole shebang */
281 		rc = __archive_write_output(a, buf, len);
282 		if (rc != ARCHIVE_OK) {
283 			return rc;
284 		}
285 	}
286 	return len;
287 }
288 
289 static int
290 _warc_finish_entry(struct archive_write *a)
291 {
292 	static const char _eor[] = "\r\n\r\n";
293 	struct warc_s *w = a->format_data;
294 
295 	if (w->typ == AE_IFREG) {
296 		int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
297 
298 		if (rc != ARCHIVE_OK) {
299 			return rc;
300 		}
301 	}
302 	/* reset type info */
303 	w->typ = 0;
304 	return (ARCHIVE_OK);
305 }
306 
307 static int
308 _warc_close(struct archive_write *a)
309 {
310 	(void)a; /* UNUSED */
311 	return (ARCHIVE_OK);
312 }
313 
314 static int
315 _warc_free(struct archive_write *a)
316 {
317 	struct warc_s *w = a->format_data;
318 
319 	free(w);
320 	a->format_data = NULL;
321 	return (ARCHIVE_OK);
322 }
323 
324 
325 /* private routines */
/*
 * Like strftime(3) but for time_t objects: T is broken down as UTC,
 * formatted according to FMT and the result is appended to AS.
 * When T cannot be broken down nothing is appended.
 */
static void
xstrftime(struct archive_string *as, const char *fmt, time_t t)
{
#if defined(HAVE_GMTIME_R) || defined(HAVE_GMTIME_S)
	struct tm tmbuf;
#endif
	struct tm *utc;
	char out[100];
	size_t nout;

	/* pick whichever gmtime flavour the platform offers */
#if defined(HAVE_GMTIME_S)
	if (gmtime_s(&tmbuf, &t)) {
		utc = NULL;
	} else {
		utc = &tmbuf;
	}
#elif defined(HAVE_GMTIME_R)
	utc = gmtime_r(&t, &tmbuf);
#else
	utc = gmtime(&t);
#endif
	if (utc == NULL) {
		return;
	}

	/* strftime() does the actual formatting */
	nout = strftime(out, sizeof(out)-1, fmt, utc);
	archive_strncat(as, out, nout);
}
350 
351 static ssize_t
352 _popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
353 {
354 	static const char _ver[] = "WARC/1.0\r\n";
355 	static const char * const _typ[LAST_WT] = {
356 		NULL, "warcinfo", "metadata", "resource", NULL
357 	};
358 	char std_uuid[48U];
359 
360 	if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
361 		/* brilliant, how exactly did we get here? */
362 		return -1;
363 	}
364 
365 	archive_strcpy(tgt, _ver);
366 
367 	archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
368 
369 	if (hdr.tgturi != NULL) {
370 		/* check if there's a xyz:// */
371 		static const char _uri[] = "";
372 		static const char _fil[] = "file://";
373 		const char *u;
374 		char *chk = strchr(hdr.tgturi, ':');
375 
376 		if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
377 			/* yep, it's definitely a URI */
378 			u = _uri;
379 		} else {
380 			/* hm, best to prepend file:// then */
381 			u = _fil;
382 		}
383 		archive_string_sprintf(tgt,
384 			"WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
385 	}
386 
387 	/* record time is usually when the http is sent off,
388 	 * just treat the archive writing as such for a moment */
389 	xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
390 
391 	/* while we're at it, record the mtime */
392 	xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
393 
394 	if (hdr.recid == NULL) {
395 		/* generate one, grrrr */
396 		warc_uuid_t u;
397 
398 		_gen_uuid(&u);
399 		/* Unfortunately, archive_string_sprintf does not
400 		 * handle the minimum number following '%'.
401 		 * So we have to use snprintf function here instead
402 		 * of archive_string_snprintf function. */
403 #if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
404 #define snprintf _snprintf
405 #endif
406 		snprintf(
407 			std_uuid, sizeof(std_uuid),
408 			"<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
409 			u.u[0U],
410 			u.u[1U] >> 16U, u.u[1U] & 0xffffU,
411 			u.u[2U] >> 16U, u.u[2U] & 0xffffU,
412 			u.u[3U]);
413 		hdr.recid = std_uuid;
414 	}
415 
416 	/* record-id is mandatory, fingers crossed we won't fail */
417 	archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
418 
419 	if (hdr.cnttyp != NULL) {
420 		archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
421 	}
422 
423 	/* next one is mandatory */
424 	archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
425 	/**/
426 	archive_strncat(tgt, "\r\n", 2);
427 
428 	return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
429 }
430 
431 static int
432 _gen_uuid(warc_uuid_t *tgt)
433 {
434 	archive_random(tgt->u, sizeof(tgt->u));
435 	/* obey uuid version 4 rules */
436 	tgt->u[1U] &= 0xffff0fffU;
437 	tgt->u[1U] |= 0x4000U;
438 	tgt->u[2U] &= 0x3fffffffU;
439 	tgt->u[2U] |= 0x80000000U;
440 	return 0;
441 }
442 
443 /* archive_write_set_format_warc.c ends here */
444