1 /**
2 
3 	MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
4 
5 	@file itmz-reader.c
6 
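7 	@brief Read an ITMZ archive (a zip file containing `mapdata.xml`) and convert its topics into MultiMarkdown text.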
8 
9 
10 	@author	Fletcher T. Penney
11 	@bug
12 
13 **/
14 
15 /*
16 
17 	Copyright © 2016 - 2019 Fletcher T. Penney.
18 
19 
20 	The `MultiMarkdown 6` project is released under the MIT License.
21 
22 	GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
23 
24 		https://github.com/fletcher/MultiMarkdown-4/
25 
26 	MMD 4 is released under both the MIT License and GPL.
27 
28 
29 	CuTest is released under the zlib/libpng license. See CuTest.c for the
30 	text of the license.
31 
32 	uthash library:
33 		Copyright (c) 2005-2016, Troy D. Hanson
34 
35 		Licensed under Revised BSD license
36 
37 	miniz library:
38 		Copyright 2013-2014 RAD Game Tools and Valve Software
39 		Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
40 
41 		Licensed under the MIT license
42 
43 	argtable3 library:
44 		Copyright (C) 1998-2001,2003-2011,2013 Stewart Heitmann
45 		<sheitmann@users.sourceforge.net>
46 		All rights reserved.
47 
48 		Licensed under the Revised BSD License
49 
50 
51 	## The MIT License ##
52 
53 	Permission is hereby granted, free of charge, to any person obtaining
54 	a copy of this software and associated documentation files (the
55 	"Software"), to deal in the Software without restriction, including
56 	without limitation the rights to use, copy, modify, merge, publish,
57 	distribute, sublicense, and/or sell copies of the Software, and to
58 	permit persons to whom the Software is furnished to do so, subject to
59 	the following conditions:
60 
61 	The above copyright notice and this permission notice shall be
62 	included in all copies or substantial portions of the Software.
63 
64 	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
65 	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
66 	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
67 	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
68 	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
69 	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
70 	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
71 
72 
73 	## Revised BSD License ##
74 
75 	Redistribution and use in source and binary forms, with or without
76 	modification, are permitted provided that the following conditions are
77 	met:
78 	    * Redistributions of source code must retain the above copyright
79 	      notice, this list of conditions and the following disclaimer.
80 	    * Redistributions in binary form must reproduce the above
81 	      copyright notice, this list of conditions and the following
82 	      disclaimer in the documentation and/or other materials provided
83 	      with the distribution.
84 	    * Neither the name of the <organization> nor the
85 	      names of its contributors may be used to endorse or promote
86 	      products derived from this software without specific prior
87 	      written permission.
88 
89 	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
90 	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
91 	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
92 	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
93 	HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
94 	EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
95 	PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF USE, DATA, OR
96 	PROFITS OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
97 	LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
98 	NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
99 	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
100 
101 
102 */
103 
104 #include <stdio.h>
105 #include <stdlib.h>
106 
107 #include "d_string.h"
108 #include "mmd.h"
109 #include "itmz-reader.h"
110 #include "itmz-lexer.h"
111 #include "itmz-parser.h"
112 #include "token.h"
113 #include "xml.h"
114 #include "zip.h"
115 
116 
117 // Basic parser function declarations
// These four functions are not defined in this file; they are only declared
// here so parse_itmz_token_chain() can call them. As used in this file:
//   ITMZAlloc(malloc)          -- returns an opaque parser handle
//   ITMZ(parser, type, tok, e) -- feeds one token; type 0 with a NULL token
//                                 signals end of input
//   ITMZFree(parser, free)     -- releases the parser handle
//   ITMZTrace(stream, prefix)  -- called only when NDEBUG is not defined
// NOTE(review): presumably generated by Lemon from the ITMZ grammar -- confirm
// the exact parameter types against the generated parser.
118 void * ITMZAlloc(void *);
119 void ITMZ(void *, int, void *, void *);
120 void ITMZFree(void *, void *);
121 void ITMZTrace(FILE * stream, char * zPrefix);
122 
123 
// Output helpers. Each macro expands to a d_string_* call whose first argument
// is an identifier named `out`, so a variable called `out` must be in scope
// wherever one of them is used.
//
// print(x)       -- append x via d_string_append()
// print_const(x) -- append a string literal; the length is taken with
//                   sizeof(x) - 1, so x must be an array/literal, not a pointer
// print_char(x)  -- append a single character
// printf(...)    -- append formatted text
//
// NOTE: the last macro replaces the name `printf` for the rest of this file;
// after this point `printf(...)` appends to `out` instead of calling stdio's
// printf. (`fprintf` is unaffected.)
124 #define print(x) d_string_append(out, x)
125 #define print_const(x) d_string_append_c_array(out, x, sizeof(x) - 1)
126 #define print_char(x) d_string_append_c(out, x)
127 #define printf(...) d_string_append_printf(out, __VA_ARGS__)
128 
129 
130 /// Create a token chain from source ITMZ string
tokenize_itmz_string(mmd_engine * e,size_t start,size_t len)131 token * tokenize_itmz_string(mmd_engine * e, size_t start, size_t len) {
132 
133 	// Create a scanner (for re2c)
134 	Scanner s;
135 	s.start = &e->dstr->str[start];
136 	s.cur = s.start;
137 
138 	// Where do we stop parsing?
139 	const char * stop = &e->dstr->str[start] + len;
140 
141 	int type;								// TOKEN type
142 	token * t;								// Create tokens for incorporation
143 
144 	token * root = token_new(0, start, len);		// Store the final parse tree here
145 
146 	const char * last_stop = &e->dstr->str[start];	// Remember where last token ended
147 
148 	do {
149 		// Scan for next token (type of 0 means there is nothing left);
150 		type = itmz_scan(&s, stop);
151 
152 		//if (type && s.start != last_stop) {
153 		if (s.start != last_stop) {
154 			// We skipped characters between tokens
155 
156 			if (type) {
157 				// Create a default token type for the skipped characters
158 				//			t = token_new(TEXT_PLAIN, (size_t)(last_stop - e->dstr->str), (size_t)(s.start - last_stop));
159 			} else {
160 				if (stop > last_stop) {
161 					// Source text ends without newline
162 					//				t = token_new(TEXT_PLAIN, (size_t)(last_stop - e->dstr->str), (size_t)(stop - last_stop));
163 				}
164 			}
165 		} else if (type == 0 && stop > last_stop) {
166 			// Source text ends without newline
167 			//		t = token_new(TEXT_PLAIN, (size_t)(last_stop - e->dstr->str), (size_t)(stop - last_stop));
168 		}
169 
170 
171 		switch (type) {
172 			case 0:
173 				// 0 means we finished with input
174 				break;
175 
176 			case ITMZ_WSNL:
177 				// Ignore for now
178 				break;
179 
180 			default:
181 				t = token_new(type, (size_t)(s.start - e->dstr->str), (size_t)(s.cur - s.start));
182 				token_chain_append(root, t);
183 				break;
184 		}
185 
186 		// Remember where token ends to detect skipped characters
187 		last_stop = s.cur;
188 	} while (type != 0);
189 
190 	return root;
191 }
192 
193 
parse_itmz_token_chain(mmd_engine * e,token * chain)194 void parse_itmz_token_chain(mmd_engine * e, token * chain) {
195 
196 	void * pParser = ITMZAlloc (malloc);		// Create a parser (for lemon)
197 	token * walker = chain->next;				// Walk the existing tree
198 	token * remainder;							// Hold unparsed tail of chain
199 
200 #ifndef NDEBUG
201 	ITMZTrace(stderr, "parser >>");
202 #endif
203 
204 	// Remove existing token tree
205 	e->root = NULL;
206 
207 	while (walker != NULL) {
208 		remainder = walker->next;
209 
210 		ITMZ(pParser, walker->type, walker, e);
211 
212 		walker = remainder;
213 	}
214 
215 	// Signal finish to parser
216 #ifndef NDEBUG
217 	fprintf(stderr, "\nFinish parse\n");
218 #endif
219 	ITMZ(pParser, 0, NULL, e);
220 
221 	if (e->root) {
222 		// Successful parse -- process to new source document
223 		DString * final = d_string_new("");
224 		DString * metadata = d_string_new("");
225 		DString * out = final;
226 
227 		size_t header_level = -1;	// ITMZ has a dummy root note
228 		size_t start, len;
229 
230 		walker = chain->next;
231 
232 		while (walker) {
233 			switch (walker->type) {
234 				case ITMZ_TOPIC_PREAMBLE:
235 				case ITMZ_TOPIC_OPEN:
236 				case ITMZ_TOPIC_SELF_CLOSE:
237 					header_level++;
238 
239 					if (header_level == 0) {
240 						// ITMZ has a dummy parent node
241 						break;
242 					}
243 
244 					// Advance over `<topic`
245 					start = walker->start + 6;
246 
247 					char * text = xml_extract_named_attribute(e->dstr->str, start, "text");
248 
249 					if (text) {
250 						len = strlen(text);
251 
252 						if (strcmp("&gt;&gt;Preamble&lt;&lt;", text) != 0) {
253 							if (out == metadata) {
254 								print_xml_as_text(out, text, 0, len);
255 								print_const(":\t");
256 							} else {
257 								// Print header
258 
259 								if (xml_scan_encoded_newline(text, len) == -1) {
260 									// ATX header
261 									for (int i = 0; i < header_level; ++i) {
262 										print_char('#');
263 									}
264 
265 									print_char(' ');
266 								}
267 
268 								print_xml_as_text(out, text, 0, len);
269 
270 								if (xml_scan_encoded_newline(text, len) == -1) {
271 									// ATX header
272 									print_char(' ');
273 
274 									for (int i = 0; i < header_level; ++i) {
275 										print_char('#');
276 									}
277 								} else {
278 									// Setext Header
279 									switch (header_level) {
280 										case 1:
281 											print_const("\n======");
282 											break;
283 
284 										default:
285 											print_const("\n------");
286 											break;
287 									}
288 								}
289 
290 								print_const("\n");
291 							}
292 						}
293 
294 						free(text);
295 					}
296 
297 					// Print contents of topic
298 					text = xml_extract_named_attribute(e->dstr->str, start, "note");
299 
300 					if (text) {
301 						print_xml_as_text(out, text, 0, strlen(text));
302 
303 						free(text);
304 					}
305 
306 					if (out == metadata) {
307 						print_const("  \n");
308 					} else {
309 						// Ensure that contents end in newline
310 						if (out->currentStringLength) {
311 							switch (out->str[out->currentStringLength - 1]) {
312 								case '\n':
313 								case '\r':
314 									break;
315 
316 								default:
317 									d_string_append_c(out, '\n');
318 									break;
319 							}
320 						}
321 					}
322 
323 					if (walker->type == ITMZ_TOPIC_SELF_CLOSE) {
324 						header_level--;
325 					}
326 
327 					break;
328 
329 				case ITMZ_TOPIC_METADATA:
330 					// Now handle metadata
331 					out = metadata;
332 					header_level++;
333 					break;
334 
335 				case ITMZ_TOPIC_CLOSE:
336 					header_level--;
337 					break;
338 
339 				default:
340 					break;
341 			}
342 
343 			walker = walker->next;
344 		}
345 
346 		// Append body to metadata
347 		d_string_append_c_array(metadata, final->str, final->currentStringLength);
348 
349 		// TODO: How to safely swap the new text, given that we might not own e->dstr->str?
350 
351 		free(e->dstr->str);
352 		e->dstr->str = metadata->str;
353 		e->dstr->currentStringLength = metadata->currentStringLength;
354 
355 		d_string_free(metadata, false);
356 		d_string_free(final, true);
357 	} else {
358 		// Unsuccessful parse -- free token chain
359 	}
360 
361 	// Clean up token chain
362 	token_tree_free(chain);
363 
364 	ITMZFree(pParser, free);
365 }
366 
367 
368 /// Create a token chain from source OPML string
mmd_convert_itmz_string(mmd_engine * e,size_t start,size_t len)369 void mmd_convert_itmz_string(mmd_engine * e, size_t start, size_t len) {
370 	// Need to extract mapdata.xml file from the zip archive
371 	DString * text = d_string_new("");
372 
373 	mz_bool status = unzip_file_from_data(e->dstr->str, e->dstr->currentStringLength, "mapdata.xml", text);
374 
375 	if (status) {
376 		free(e->dstr->str);
377 		e->dstr->str = text->str;
378 		e->dstr->currentStringLength = text->currentStringLength;
379 
380 		d_string_free(text, false);
381 
382 		// Now convert mapdata.xml -> MMD text
383 		token * chain = tokenize_itmz_string(e, 0, e->dstr->currentStringLength);
384 		parse_itmz_token_chain(e, chain);
385 	} else {
386 		d_string_free(text, true);
387 	}
388 }
389