1 /**
2
3 MultiMarkdown -- Lightweight markup processor to produce HTML, LaTeX, and more.
4
5 @file itmz-reader.c
6
7 @brief
8
9
10 @author Fletcher T. Penney
11 @bug
12
13 **/
14
15 /*
16
17 Copyright © 2016 - 2019 Fletcher T. Penney.
18
19
20 The `MultiMarkdown 6` project is released under the MIT License..
21
22 GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
23
24 https://github.com/fletcher/MultiMarkdown-4/
25
26 MMD 4 is released under both the MIT License and GPL.
27
28
29 CuTest is released under the zlib/libpng license. See CuTest.c for the
30 text of the license.
31
32 uthash library:
33 Copyright (c) 2005-2016, Troy D. Hanson
34
35 Licensed under Revised BSD license
36
37 miniz library:
38 Copyright 2013-2014 RAD Game Tools and Valve Software
39 Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
40
41 Licensed under the MIT license
42
43 argtable3 library:
44 Copyright (C) 1998-2001,2003-2011,2013 Stewart Heitmann
45 <sheitmann@users.sourceforge.net>
46 All rights reserved.
47
48 Licensed under the Revised BSD License
49
50
51 ## The MIT License ##
52
53 Permission is hereby granted, free of charge, to any person obtaining
54 a copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, sublicense, and/or sell copies of the Software, and to
58 permit persons to whom the Software is furnished to do so, subject to
59 the following conditions:
60
61 The above copyright notice and this permission notice shall be
62 included in all copies or substantial portions of the Software.
63
64 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
65 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
66 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
67 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
68 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
69 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
70 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
71
72
73 ## Revised BSD License ##
74
75 Redistribution and use in source and binary forms, with or without
76 modification, are permitted provided that the following conditions are
77 met:
78 * Redistributions of source code must retain the above copyright
79 notice, this list of conditions and the following disclaimer.
80 * Redistributions in binary form must reproduce the above
81 copyright notice, this list of conditions and the following
82 disclaimer in the documentation and/or other materials provided
83 with the distribution.
84 * Neither the name of the <organization> nor the
85 names of its contributors may be used to endorse or promote
86 products derived from this software without specific prior
87 written permission.
88
89 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
90 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
91 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
92 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT
93 HOLDER> BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
94 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
95 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF USE, DATA, OR
96 PROFITS OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
97 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
98 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
99 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
100
101
102 */
103
104 #include <stdio.h>
105 #include <stdlib.h>
106
107 #include "d_string.h"
108 #include "mmd.h"
109 #include "itmz-reader.h"
110 #include "itmz-lexer.h"
111 #include "itmz-parser.h"
112 #include "token.h"
113 #include "xml.h"
114 #include "zip.h"
115
116
117 // Basic parser function declarations
118 void * ITMZAlloc(void *);
119 void ITMZ(void *, int, void *, void *);
120 void ITMZFree(void *, void *);
121 void ITMZTrace(FILE * stream, char * zPrefix);
122
123
124 #define print(x) d_string_append(out, x)
125 #define print_const(x) d_string_append_c_array(out, x, sizeof(x) - 1)
126 #define print_char(x) d_string_append_c(out, x)
127 #define printf(...) d_string_append_printf(out, __VA_ARGS__)
128
129
130 /// Create a token chain from source ITMZ string
tokenize_itmz_string(mmd_engine * e,size_t start,size_t len)131 token * tokenize_itmz_string(mmd_engine * e, size_t start, size_t len) {
132
133 // Create a scanner (for re2c)
134 Scanner s;
135 s.start = &e->dstr->str[start];
136 s.cur = s.start;
137
138 // Where do we stop parsing?
139 const char * stop = &e->dstr->str[start] + len;
140
141 int type; // TOKEN type
142 token * t; // Create tokens for incorporation
143
144 token * root = token_new(0, start, len); // Store the final parse tree here
145
146 const char * last_stop = &e->dstr->str[start]; // Remember where last token ended
147
148 do {
149 // Scan for next token (type of 0 means there is nothing left);
150 type = itmz_scan(&s, stop);
151
152 //if (type && s.start != last_stop) {
153 if (s.start != last_stop) {
154 // We skipped characters between tokens
155
156 if (type) {
157 // Create a default token type for the skipped characters
158 // t = token_new(TEXT_PLAIN, (size_t)(last_stop - e->dstr->str), (size_t)(s.start - last_stop));
159 } else {
160 if (stop > last_stop) {
161 // Source text ends without newline
162 // t = token_new(TEXT_PLAIN, (size_t)(last_stop - e->dstr->str), (size_t)(stop - last_stop));
163 }
164 }
165 } else if (type == 0 && stop > last_stop) {
166 // Source text ends without newline
167 // t = token_new(TEXT_PLAIN, (size_t)(last_stop - e->dstr->str), (size_t)(stop - last_stop));
168 }
169
170
171 switch (type) {
172 case 0:
173 // 0 means we finished with input
174 break;
175
176 case ITMZ_WSNL:
177 // Ignore for now
178 break;
179
180 default:
181 t = token_new(type, (size_t)(s.start - e->dstr->str), (size_t)(s.cur - s.start));
182 token_chain_append(root, t);
183 break;
184 }
185
186 // Remember where token ends to detect skipped characters
187 last_stop = s.cur;
188 } while (type != 0);
189
190 return root;
191 }
192
193
parse_itmz_token_chain(mmd_engine * e,token * chain)194 void parse_itmz_token_chain(mmd_engine * e, token * chain) {
195
196 void * pParser = ITMZAlloc (malloc); // Create a parser (for lemon)
197 token * walker = chain->next; // Walk the existing tree
198 token * remainder; // Hold unparsed tail of chain
199
200 #ifndef NDEBUG
201 ITMZTrace(stderr, "parser >>");
202 #endif
203
204 // Remove existing token tree
205 e->root = NULL;
206
207 while (walker != NULL) {
208 remainder = walker->next;
209
210 ITMZ(pParser, walker->type, walker, e);
211
212 walker = remainder;
213 }
214
215 // Signal finish to parser
216 #ifndef NDEBUG
217 fprintf(stderr, "\nFinish parse\n");
218 #endif
219 ITMZ(pParser, 0, NULL, e);
220
221 if (e->root) {
222 // Successful parse -- process to new source document
223 DString * final = d_string_new("");
224 DString * metadata = d_string_new("");
225 DString * out = final;
226
227 size_t header_level = -1; // ITMZ has a dummy root note
228 size_t start, len;
229
230 walker = chain->next;
231
232 while (walker) {
233 switch (walker->type) {
234 case ITMZ_TOPIC_PREAMBLE:
235 case ITMZ_TOPIC_OPEN:
236 case ITMZ_TOPIC_SELF_CLOSE:
237 header_level++;
238
239 if (header_level == 0) {
240 // ITMZ has a dummy parent node
241 break;
242 }
243
244 // Advance over `<topic`
245 start = walker->start + 6;
246
247 char * text = xml_extract_named_attribute(e->dstr->str, start, "text");
248
249 if (text) {
250 len = strlen(text);
251
252 if (strcmp(">>Preamble<<", text) != 0) {
253 if (out == metadata) {
254 print_xml_as_text(out, text, 0, len);
255 print_const(":\t");
256 } else {
257 // Print header
258
259 if (xml_scan_encoded_newline(text, len) == -1) {
260 // ATX header
261 for (int i = 0; i < header_level; ++i) {
262 print_char('#');
263 }
264
265 print_char(' ');
266 }
267
268 print_xml_as_text(out, text, 0, len);
269
270 if (xml_scan_encoded_newline(text, len) == -1) {
271 // ATX header
272 print_char(' ');
273
274 for (int i = 0; i < header_level; ++i) {
275 print_char('#');
276 }
277 } else {
278 // Setext Header
279 switch (header_level) {
280 case 1:
281 print_const("\n======");
282 break;
283
284 default:
285 print_const("\n------");
286 break;
287 }
288 }
289
290 print_const("\n");
291 }
292 }
293
294 free(text);
295 }
296
297 // Print contents of topic
298 text = xml_extract_named_attribute(e->dstr->str, start, "note");
299
300 if (text) {
301 print_xml_as_text(out, text, 0, strlen(text));
302
303 free(text);
304 }
305
306 if (out == metadata) {
307 print_const(" \n");
308 } else {
309 // Ensure that contents end in newline
310 if (out->currentStringLength) {
311 switch (out->str[out->currentStringLength - 1]) {
312 case '\n':
313 case '\r':
314 break;
315
316 default:
317 d_string_append_c(out, '\n');
318 break;
319 }
320 }
321 }
322
323 if (walker->type == ITMZ_TOPIC_SELF_CLOSE) {
324 header_level--;
325 }
326
327 break;
328
329 case ITMZ_TOPIC_METADATA:
330 // Now handle metadata
331 out = metadata;
332 header_level++;
333 break;
334
335 case ITMZ_TOPIC_CLOSE:
336 header_level--;
337 break;
338
339 default:
340 break;
341 }
342
343 walker = walker->next;
344 }
345
346 // Append body to metadata
347 d_string_append_c_array(metadata, final->str, final->currentStringLength);
348
349 // TODO: How to safely swap the new text, given that we might not own e->dstr->str?
350
351 free(e->dstr->str);
352 e->dstr->str = metadata->str;
353 e->dstr->currentStringLength = metadata->currentStringLength;
354
355 d_string_free(metadata, false);
356 d_string_free(final, true);
357 } else {
358 // Unsuccessful parse -- free token chain
359 }
360
361 // Clean up token chain
362 token_tree_free(chain);
363
364 ITMZFree(pParser, free);
365 }
366
367
368 /// Create a token chain from source OPML string
mmd_convert_itmz_string(mmd_engine * e,size_t start,size_t len)369 void mmd_convert_itmz_string(mmd_engine * e, size_t start, size_t len) {
370 // Need to extract mapdata.xml file from the zip archive
371 DString * text = d_string_new("");
372
373 mz_bool status = unzip_file_from_data(e->dstr->str, e->dstr->currentStringLength, "mapdata.xml", text);
374
375 if (status) {
376 free(e->dstr->str);
377 e->dstr->str = text->str;
378 e->dstr->currentStringLength = text->currentStringLength;
379
380 d_string_free(text, false);
381
382 // Now convert mapdata.xml -> MMD text
383 token * chain = tokenize_itmz_string(e, 0, e->dstr->currentStringLength);
384 parse_itmz_token_chain(e, chain);
385 } else {
386 d_string_free(text, true);
387 }
388 }
389