1/**
2
3	MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
4
5	@file xml.c
6
7	@brief Utilities to help parse XML files
8
9
10	@author	Fletcher T. Penney
11	@bug
12
13**/
14
15/*
16
17	Copyright © 2016 - 2019 Fletcher T. Penney.
18
19
20	The `MultiMarkdown 6` project is released under the MIT License..
21
22	GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
23
24		https://github.com/fletcher/MultiMarkdown-4/
25
26	MMD 4 is released under both the MIT License and GPL.
27
28
29	CuTest is released under the zlib/libpng license. See CuTest.c for the
30	text of the license.
31
32	uthash library:
33		Copyright (c) 2005-2016, Troy D. Hanson
34
35		Licensed under Revised BSD license
36
37	miniz library:
38		Copyright 2013-2014 RAD Game Tools and Valve Software
39		Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
40
41		Licensed under the MIT license
42
43	argtable3 library:
44		Copyright (C) 1998-2001,2003-2011,2013 Stewart Heitmann
45		<sheitmann@users.sourceforge.net>
46		All rights reserved.
47
48		Licensed under the Revised BSD License
49
50
51	## The MIT License ##
52
53	Permission is hereby granted, free of charge, to any person obtaining
54	a copy of this software and associated documentation files (the
55	"Software"), to deal in the Software without restriction, including
56	without limitation the rights to use, copy, modify, merge, publish,
57	distribute, sublicense, and/or sell copies of the Software, and to
58	permit persons to whom the Software is furnished to do so, subject to
59	the following conditions:
60
61	The above copyright notice and this permission notice shall be
62	included in all copies or substantial portions of the Software.
63
64	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
65	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
66	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
67	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
68	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
69	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
70	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
71
72
73	## Revised BSD License ##
74
75	Redistribution and use in source and binary forms, with or without
76	modification, are permitted provided that the following conditions are
77	met:
78	    * Redistributions of source code must retain the above copyright
79	      notice, this list of conditions and the following disclaimer.
80	    * Redistributions in binary form must reproduce the above
81	      copyright notice, this list of conditions and the following
82	      disclaimer in the documentation and/or other materials provided
83	      with the distribution.
84	    * Neither the name of the <organization> nor the
85	      names of its contributors may be used to endorse or promote
86	      products derived from this software without specific prior
87	      written permission.
88
89	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
90	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
91	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
92	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
93	HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
94	EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
95	PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF USE, DATA, OR
96	PROFITS OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
97	LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
98	NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
99	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
100
101
102*/
103
104#include <ctype.h>
105#include <stdlib.h>
106#include <string.h>
107
108#include "d_string.h"
109#include "xml.h"
110
111
// Output helpers. Every macro passes a variable named `out` as the first
// argument of the call it expands to, so `out` must be in scope wherever
// one of them is used.
//
//   print(x)        expands to d_string_append(out, x)
//   print_const(x)  expands to d_string_append_c_array(out, x, sizeof(x) - 1);
//                   presumably intended for string literals, where
//                   sizeof(x) - 1 is the length without the trailing NUL
//   print_char(x)   expands to d_string_append_c(out, x)
//   printf(...)     expands to d_string_append_printf(out, ...); after this
//                   definition the name `printf` in this file refers to the
//                   macro, not to the C library function
112#define print(x) d_string_append(out, x)
113#define print_const(x) d_string_append_c_array(out, x, sizeof(x) - 1)
114#define print_char(x) d_string_append_c(out, x)
115#define printf(...) d_string_append_printf(out, __VA_ARGS__)
116
117
/// Duplicate at most `n` bytes of a string (strndup is not available on all
/// platforms).
///
/// @param source  String to copy; may be NULL.
/// @param n       Maximum number of bytes to copy from `source`.
/// @return        Newly allocated, NUL-terminated copy that the caller must
///                release with free(), or NULL when `source` is NULL or the
///                allocation fails.
static char * my_strndup(const char * source, size_t n) {
	if (source == NULL) {
		return NULL;
	}

	// Count by hand instead of calling strlen(): strlen is too slow when
	// strlen(source) >> n.  The terminator test must look at the character,
	// not at the pointer, or shorter sources would be over-read below.
	size_t len = 0;

	while ((len < n) && (source[len] != '\0')) {
		++len;
	}

	char * result = malloc(len + 1);

	if (result) {
		memcpy(result, source, len);
		result[len] = '\0';
	}

	return result;
}
146
147
// Shared re2c settings and named patterns used by the scanner functions
// in this file.
//
// Settings: the scanners read input as `unsigned char`, use a local
// variable `c` as the cursor, and use a local variable `marker` for both
// the backtracking marker and the trailing-context marker.  Buffer
// refilling (yyfill) is disabled, so the scanners presumably rely on the
// terminating NUL of their input to stop -- TODO confirm against callers.
//
// Patterns: line endings and whitespace (NL, WS, WSNL), the '=' separator,
// quoted values (double or single quotes, with no NUL and no quote of the
// same kind inside), attribute names, and the encoded newlines "&#10;" and
// "&#13;".  `regular_attribute`, `boolean_attribute` and `attribute` are
// defined here but are not referenced by any rule visible in this file.
148/*!re2c
149
150	re2c:define:YYCTYPE = "unsigned char";
151	re2c:define:YYCURSOR = c;
152	re2c:define:YYMARKER = marker;
153	re2c:define:YYCTXMARKER = marker;
154	re2c:yyfill:enable = 0;
155
156	NL														= "\r\n" | '\n' | '\r';
157	WS														= [ \t]+;
158	WSNL													= (NL | WS)+;
159
160	EQUAL													= '=';
161
162	double_quoted											= '"' [^"\x00]* '"';
163	single_quoted											= "'" [^'\x00]* "'";
164	quoted_value											= double_quoted | single_quoted;
165
166	attribute_name											= [a-zA-Z_:] [a-zA-Z0-9_:.\-]*;
167	regular_attribute										= WSNL* attribute_name WSNL* EQUAL WSNL* quoted_value WSNL*;
168	boolean_attribute										= WSNL* attribute_name WSNL*;
169	attribute 												= regular_attribute | boolean_attribute;
170
171	contains_newline										= "&#10;" | "&#13;";
172
173*/
174
175
/// Measure the whitespace at the beginning of `c`.
///
/// Matches zero or more spaces, tabs and line endings (the WSNL pattern).
///
/// @param c  Text to scan; the scan is not length-bounded, so the text is
///           expected to be NUL-terminated.
/// @return   Number of bytes of leading whitespace, 0 when there is none.
size_t xml_scan_wsnl(const char * c) {
	const char * const origin = c;

/*!re2c
	WSNL* {
		return (size_t)(c - origin);
	}

	* {
		return 0;
	}
*/
}
185
186
/// Measure an attribute name at the beginning of `c`.
///
/// A name starts with a letter, '_' or ':' and continues with letters,
/// digits, '_', ':', '.' or '-' (the attribute_name pattern).
///
/// @param c  Text to scan; the scan is not length-bounded, so the text is
///           expected to be NUL-terminated.
/// @return   Length of the name in bytes, or 0 when `c` does not begin
///           with an attribute name.
size_t xml_scan_attribute_name(const char * c) {
	const char * const origin = c;

/*!re2c
	attribute_name {
		return (size_t)(c - origin);
	}

	* {
		return 0;
	}
*/
}
196
197
/// Measure the separator between an attribute name and its quoted value.
///
/// Matches optional whitespace, '=', and optional whitespace, but only
/// when a quoted value follows.  The value is trailing context: it must be
/// present for the rule to match, yet it is not part of the returned length.
///
/// @param c  Text to scan, positioned just after an attribute name.
/// @return   Number of bytes up to (not including) the opening quote, or 0
///           when no `= "value"` sequence starts here.
size_t xml_scan_until_value(const char * c) {
	const char * const origin = c;
	const char * marker = NULL;

/*!re2c
	WSNL* EQUAL WSNL* / quoted_value {
		return (size_t)(c - origin);
	}

	* {
		return 0;
	}
*/
}
208
209
/// Measure a quoted attribute value at the beginning of `c`.
///
/// A value is delimited by a pair of double quotes or a pair of single
/// quotes and contains neither a NUL nor the delimiting quote character.
///
/// @param c  Text to scan, positioned on the opening quote.
/// @return   Length of the value in bytes including both quote characters,
///           or 0 when `c` does not begin with a quoted value.
size_t xml_scan_value(const char * c) {
	const char * const origin = c;
	const char * marker = NULL;

/*!re2c
	quoted_value {
		return (size_t)(c - origin);
	}

	* {
		return 0;
	}
*/
}
220
221
/// Does the string include an encoded newline ("&#10;" or "&#13;")?
///
/// Only the first `len` bytes of `c` are considered: the search stops at
/// the terminating NUL or at offset `len`, whichever comes first, and an
/// entity that would run past offset `len` does not count as a match.
///
/// @param c    NUL-terminated text to search.
/// @param len  Number of bytes of `c` to search.
/// @return     Offset just past the first encoded newline, or (size_t) -1
///             when none is found (callers comparing the result against -1
///             keep working, since -1 converts to the same value).
size_t xml_scan_encoded_newline(const char * c, size_t len) {
	const char * marker = NULL;
	const char * start = c;

scan:

	// `>=` rather than `>`: offset `len` is already outside the range.
	if ((*c == '\0') || ((size_t)(c - start) >= len)) {
		// Not found
		return (size_t) -1;
	}

/*!re2c
	contains_newline {
		size_t end = (size_t)(c - start);

		// Reject an entity that extends beyond the requested range; any
		// later match would lie outside the range as well.
		return (end <= len) ? end : (size_t) -1;
	}

	* {
		goto scan;
	}
*/
}
239
240
241/// Decode XML encoded text and print to DString
242void print_xml_as_text(DString * out, const char * source, size_t start, size_t len) {
243	const char * s_start = &source[start];
244	const char * s_stop = &source[start + len];
245
246	char * c = (char *) s_start;
247
248	while (c < s_stop) {
249		switch (*c) {
250			case '&':
251				switch (*++c) {
252					case '#':
253						if (strncmp(c, "#10;", 4) == 0) {
254							print_char('\n');
255							c += 4;
256							continue;
257						}
258
259						if (strncmp(c, "#9;", 3) == 0) {
260							print_char('\t');
261							c += 3;
262							continue;
263						}
264
265						if (strncmp(c, "#13;", 4) == 0) {
266							print_char('\r');
267							c += 4;
268							continue;
269						}
270
271						break;
272
273					case 'a':
274						if (strncmp(c, "amp;", 4) == 0) {
275							print_char('&');
276							c += 4;
277							continue;
278						}
279
280						if (strncmp(c, "apos;", 5) == 0) {
281							print_char('\'');
282							c += 5;
283							continue;
284						}
285
286						break;
287
288					case 'l':
289						if (strncmp(c, "lt;", 3) == 0) {
290							print_char('<');
291							c += 3;
292							continue;
293						}
294
295						break;
296
297					case 'g':
298						if (strncmp(c, "gt;", 3) == 0) {
299							print_char('>');
300							c += 3;
301							continue;
302						}
303
304						break;
305
306					case 'q':
307						if (strncmp(c, "quot;", 5) == 0) {
308							print_char('"');
309							c += 5;
310							continue;
311						}
312
313						break;
314
315					default:
316						break;
317				}
318
319				print_char('&');
320				continue;
321				break;
322
323			default:
324				print_char(*c);
325				break;
326		}
327
328		c++;
329	}
330}
331
332
/// Parse XML text for the next attribute and its value.
///
/// Skips leading whitespace at `source[start]`, then reads an attribute
/// name and, when one follows, its quoted value.
///
/// @param source  Text to parse.
/// @param start   Offset in `source` at which to begin.
/// @param attr    In: previous name (freed here) or NULL.  Out: newly
///                allocated attribute name, or NULL when no name was found.
/// @param value   In: previous value (freed here) or NULL.  Out: newly
///                allocated value without its surrounding quotes, or NULL
///                when the attribute has no value.
/// @return        Number of bytes consumed, counted from `start`.
size_t xml_extract_attribute(const char * source, size_t start, char ** attr, char ** value) {
	// Release whatever an earlier call left behind; free(NULL) is a no-op.
	free(*attr);
	*attr = NULL;

	free(*value);
	*value = NULL;

	// Skip leading whitespace
	size_t pos = start + xml_scan_wsnl(&source[start]);

	const size_t name_len = xml_scan_attribute_name(&source[pos]);

	if (name_len == 0) {
		// No attribute here; only the whitespace was consumed.
		return pos - start;
	}

	*attr = my_strndup(&source[pos], name_len);
	pos += name_len;

	// Step over the `=` separator (if any) and look for a value
	pos += xml_scan_until_value(&source[pos]);

	const size_t value_len = xml_scan_value(&source[pos]);

	if (value_len != 0) {
		// The match includes both quote characters; keep what lies between.
		*value = my_strndup(&source[pos + 1], value_len - 2);
		pos += value_len;
	}

	return pos - start;
}
373
374
/// Compare two NUL-terminated names, ignoring case as defined by tolower().
/// Returns non-zero when the names are equal.
static int xml_name_equal_nocase(const char * a, const char * b) {
	while (*a && *b) {
		// <ctype.h> functions require a value representable as unsigned char
		if (tolower((unsigned char) *a) != tolower((unsigned char) *b)) {
			return 0;
		}

		a++;
		b++;
	}

	// Equal only if both strings ended together
	return *a == *b;
}


/// Extract attribute with specified name
///
/// Walks the attributes found in `source` from offset `start`, using
/// xml_extract_attribute(), until one whose name equals `name` (ignoring
/// case) is found or no further attribute can be read.
///
/// @param source  Text to parse.
/// @param start   Offset in `source` at which to begin.
/// @param name    Attribute name to look for.
/// @return        Newly allocated copy of the attribute's value, which the
///                caller must free(); NULL when `source` or `name` is NULL,
///                when the attribute is not found, or when it has no value.
char * xml_extract_named_attribute(const char * source, size_t start, const char * name) {
	char * attr = NULL;
	char * value = NULL;
	char * result = NULL;

	if ((source == NULL) || (name == NULL)) {
		return NULL;
	}

	do {
		start += xml_extract_attribute(source, start, &attr, &value);

		if (attr && xml_name_equal_nocase(attr, name)) {
			// Match -- hand ownership of the value to the caller
			result = value;
			value = NULL;
			break;
		}
	} while (attr);

	free(attr);
	free(value);

	return result;
}
417