1 /*
2 * Markdown parsing definitions for HTMLDOC, a HTML document processing program.
3 *
4 * Copyright © 2017-2020 by Michael R Sweet.
5 *
6 * This program is free software. Distribution and use rights are outlined in
7 * the file "COPYING".
8 */
9
10 /*
11 * Include necessary headers...
12 */
13
14 # include "markdown.h"
15 # include "mmd.h"
16 # include "progress.h"
17
18
19 /*
20 * Local functions...
21 */
22
23 static void add_block(tree_t *hparent, mmd_t *parent);
24 static void add_leaf(tree_t *hparent, mmd_t *node);
25 static uchar *get_text(uchar *text);
26 static uchar *make_anchor(mmd_t *block);
27 static uchar *make_anchor(const uchar *text);
28
29
30 /*
31 * 'mdReadFile()' - Read a Markdown file.
32 */
33
34 tree_t * /* O - HTML document tree */
mdReadFile(tree_t * parent,FILE * fp,const char * base)35 mdReadFile(tree_t *parent, /* I - Parent node */
36 FILE *fp, /* I - File to read from */
37 const char *base) /* I - Base path/URL */
38 {
39 mmd_t *doc = mmdLoadFile(fp); /* Markdown document */
40 tree_t *html, /* HTML element */
41 *head, /* HEAD element */
42 *temp, /* META/TITLE element */
43 *body; /* BODY element */
44 const char *meta; /* Title, author, etc. */
45
46
47 html = htmlAddTree(parent, MARKUP_HTML, NULL);
48 if ((meta = mmdGetMetadata(doc, "lang")) != NULL)
49 htmlSetVariable(html, (uchar *)"lang", get_text((uchar *)meta));
50
51 head = htmlAddTree(html, MARKUP_HEAD, NULL);
52 if ((meta = mmdGetMetadata(doc, "title")) != NULL)
53 {
54 temp = htmlAddTree(head, MARKUP_TITLE, NULL);
55 htmlAddTree(temp, MARKUP_NONE, get_text((uchar *)meta));
56 }
57 if ((meta = mmdGetMetadata(doc, "author")) != NULL)
58 {
59 temp = htmlAddTree(head, MARKUP_META, NULL);
60 htmlSetVariable(temp, (uchar *)"name", (uchar *)"author");
61 htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
62 }
63 if ((meta = mmdGetMetadata(doc, "copyright")) != NULL)
64 {
65 temp = htmlAddTree(head, MARKUP_META, NULL);
66 htmlSetVariable(temp, (uchar *)"name", (uchar *)"copyright");
67 htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
68 }
69 if ((meta = mmdGetMetadata(doc, "version")) != NULL)
70 {
71 temp = htmlAddTree(head, MARKUP_META, NULL);
72 htmlSetVariable(temp, (uchar *)"name", (uchar *)"version");
73 htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
74 }
75 if ((meta = mmdGetMetadata(doc, "language")) != NULL)
76 {
77 htmlSetVariable(html, (uchar *)"lang", get_text((uchar *)meta));
78 }
79 if ((meta = mmdGetMetadata(doc, "subject")) != NULL)
80 {
81 temp = htmlAddTree(head, MARKUP_META, NULL);
82 htmlSetVariable(temp, (uchar *)"name", (uchar *)"keywords");
83 htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
84 }
85
86 body = htmlAddTree(html, MARKUP_BODY, NULL);
87 add_block(body, doc);
88
89 mmdFree(doc);
90
91 return (html);
92 }
93
94
95 /*
96 * 'add_block()' - Add a block node.
97 */
98
99 static void
add_block(tree_t * html,mmd_t * parent)100 add_block(tree_t *html, /* I - Parent HTML node */
101 mmd_t *parent) /* I - Parent node */
102 {
103 markup_t element; /* Enclosing element, if any */
104 mmd_t *node; /* Current child node */
105 mmd_type_t type; /* Node type */
106 tree_t *block; /* Block node */
107 const char *align = NULL; /* Alignment */
108
109
110 switch (type = mmdGetType(parent))
111 {
112 case MMD_TYPE_BLOCK_QUOTE :
113 element = MARKUP_BLOCKQUOTE;
114 break;
115
116 case MMD_TYPE_ORDERED_LIST :
117 element = MARKUP_OL;
118 break;
119
120 case MMD_TYPE_UNORDERED_LIST :
121 element = MARKUP_UL;
122 break;
123
124 case MMD_TYPE_LIST_ITEM :
125 element = MARKUP_LI;
126 break;
127
128 case MMD_TYPE_HEADING_1 :
129 element = MARKUP_H1;
130 break;
131
132 case MMD_TYPE_HEADING_2 :
133 element = MARKUP_H2;
134 break;
135
136 case MMD_TYPE_HEADING_3 :
137 element = MARKUP_H3;
138 break;
139
140 case MMD_TYPE_HEADING_4 :
141 element = MARKUP_H4;
142 break;
143
144 case MMD_TYPE_HEADING_5 :
145 element = MARKUP_H5;
146 break;
147
148 case MMD_TYPE_HEADING_6 :
149 element = MARKUP_H6;
150 break;
151
152 case MMD_TYPE_PARAGRAPH :
153 element = MARKUP_P;
154 break;
155
156 case MMD_TYPE_CODE_BLOCK :
157 block = htmlAddTree(html, MARKUP_PRE, NULL);
158
159 for (node = mmdGetFirstChild(parent); node; node = mmdGetNextSibling(node))
160 htmlAddTree(block, MARKUP_NONE, get_text((uchar *)mmdGetText(node)));
161 return;
162
163 case MMD_TYPE_THEMATIC_BREAK :
164 htmlAddTree(html, MARKUP_HR, NULL);
165 return;
166
167 case MMD_TYPE_TABLE :
168 element = MARKUP_TABLE;
169 break;
170
171 case MMD_TYPE_TABLE_HEADER :
172 element = MARKUP_THEAD;
173 break;
174
175 case MMD_TYPE_TABLE_BODY :
176 element = MARKUP_TBODY;
177 break;
178
179 case MMD_TYPE_TABLE_ROW :
180 element = MARKUP_TR;
181 break;
182
183 case MMD_TYPE_TABLE_HEADER_CELL :
184 element = MARKUP_TH;
185 break;
186
187 case MMD_TYPE_TABLE_BODY_CELL_LEFT :
188 element = MARKUP_TD;
189 break;
190
191 case MMD_TYPE_TABLE_BODY_CELL_CENTER :
192 element = MARKUP_TD;
193 align = "center";
194 break;
195
196 case MMD_TYPE_TABLE_BODY_CELL_RIGHT :
197 element = MARKUP_TD;
198 align = "right";
199 break;
200
201 default :
202 element = MARKUP_NONE;
203 break;
204 }
205
206 if (element != MARKUP_NONE)
207 block = htmlAddTree(html, element, NULL);
208 else
209 block = html;
210
211 if (align)
212 {
213 htmlSetVariable(block, (uchar *)"align", (uchar *)align);
214
215 if (!strcmp(align, "center"))
216 block->halignment = ALIGN_CENTER;
217 else
218 block->halignment = ALIGN_RIGHT;
219 }
220 else if (element == MARKUP_TH)
221 {
222 block->halignment = ALIGN_CENTER;
223 htmlSetVariable(block, (uchar *)"bgcolor", (uchar *)"#cccccc");
224 }
225 else if (element == MARKUP_TABLE)
226 {
227 htmlSetVariable(block, (uchar *)"border", (uchar *)"1");
228 htmlSetVariable(block, (uchar *)"cellpadding", (uchar *)"2");
229 }
230
231 if (type >= MMD_TYPE_HEADING_1 && type <= MMD_TYPE_HEADING_6)
232 {
233 /*
234 * Add an anchor for each heading...
235 */
236
237 block = htmlAddTree(block, MARKUP_A, NULL);
238 htmlSetVariable(block, (uchar *)"id", make_anchor(parent));
239 }
240
241 if (type >= MMD_TYPE_TABLE_BODY_CELL_LEFT && type <= MMD_TYPE_TABLE_BODY_CELL_RIGHT && !mmdGetFirstChild(parent))
242 {
243 // Make sure table body cells have at least a non-breaking space...
244 htmlAddTree(block, MARKUP_NONE, (uchar *)" ");
245 }
246 else
247 {
248 for (node = mmdGetFirstChild(parent); node; node = mmdGetNextSibling(node))
249 {
250 if (mmdIsBlock(node))
251 add_block(block, node);
252 else
253 add_leaf(block, node);
254 }
255 }
256 }
257
258
259 /*
260 * 'add_leaf()' - Add a leaf node.
261 */
262
263 static void
add_leaf(tree_t * html,mmd_t * node)264 add_leaf(tree_t *html, /* I - Parent HTML node */
265 mmd_t *node) /* I - Leaf node */
266 {
267 tree_t *parent; /* HTML node for this text */
268 markup_t element; /* HTML element for this text */
269 uchar buffer[1024], /* Text with any added whitespace */
270 *text, /* Text to write */
271 *url; /* URL to write */
272 int whitespace; /* Whitespace before text? */
273
274
275 text = get_text((uchar *)mmdGetText(node));
276 url = (uchar *)mmdGetURL(node);
277 whitespace = mmdGetWhitespace(node);
278
279 switch (mmdGetType(node))
280 {
281 case MMD_TYPE_EMPHASIZED_TEXT :
282 element = MARKUP_EM;
283 break;
284
285 case MMD_TYPE_STRONG_TEXT :
286 element = MARKUP_STRONG;
287 break;
288
289 case MMD_TYPE_STRUCK_TEXT :
290 element = MARKUP_DEL;
291 break;
292
293 case MMD_TYPE_LINKED_TEXT :
294 element = MARKUP_A;
295 break;
296
297 case MMD_TYPE_CODE_TEXT :
298 element = MARKUP_CODE;
299 break;
300
301 case MMD_TYPE_IMAGE :
302 if (mmdGetWhitespace(node))
303 htmlAddTree(html, MARKUP_NONE, (uchar *)" ");
304
305 parent = htmlAddTree(html, MARKUP_IMG, NULL);
306 htmlSetVariable(parent, (uchar *)"src", url);
307 if (text)
308 htmlSetVariable(parent, (uchar *)"alt", text);
309 return;
310
311 case MMD_TYPE_HARD_BREAK :
312 htmlAddTree(html, MARKUP_BR, NULL);
313 return;
314
315 case MMD_TYPE_SOFT_BREAK :
316 htmlAddTree(html, MARKUP_WBR, NULL);
317 return;
318
319 case MMD_TYPE_METADATA_TEXT :
320 return;
321
322 default :
323 element = MARKUP_NONE;
324 break;
325 }
326
327 if (element == MARKUP_NONE)
328 parent = html;
329 else if ((parent = html->last_child) == NULL || parent->markup != element)
330 {
331 if (whitespace)
332 {
333 htmlAddTree(html, MARKUP_NONE, (uchar *)" ");
334 whitespace = 0;
335 }
336
337 parent = htmlAddTree(html, element, NULL);
338
339 if (element == MARKUP_A && url)
340 {
341 if (!strcmp((char *)url, "@"))
342 htmlSetVariable(parent, (uchar *)"href", make_anchor(text));
343 else
344 htmlSetVariable(parent, (uchar *)"href", url);
345 }
346 }
347
348 if (whitespace)
349 {
350 buffer[0] = ' ';
351 strlcpy((char *)buffer + 1, (char *)text, sizeof(buffer) - 1);
352 text = buffer;
353 }
354
355 htmlAddTree(parent, MARKUP_NONE, text);
356 }
357
358
359 /*
360 * 'get_text()' - Get Markdown text in HTMLDOC's charset.
361 */
362
363 static uchar * /* O - Encoded text */
get_text(uchar * text)364 get_text(uchar *text) /* I - Markdown text */
365 {
366 uchar *bufptr, /* Pointer into buffer */
367 *bufend; /* End of buffer */
368 int unich; /* Unicode character */
369 static uchar buffer[8192]; /* Temporary buffer */
370
371
372 if (!_htmlUTF8)
373 return (text);
374
375 bufptr = buffer;
376 bufend = buffer + sizeof(buffer) - 1;
377
378 while (*text && bufptr < bufend)
379 {
380 if (*text & 0x80)
381 {
382 unich = 0;
383
384 if ((*text & 0xe0) == 0xc0)
385 {
386 if ((text[1] & 0xc0) != 0x80)
387 {
388 progress_error(HD_ERROR_READ_ERROR, "Bad UTF-8 character sequence %02X %02X.", *text, text[1]);
389 *bufptr++ = '?';
390 text ++;
391 }
392 else
393 {
394 unich = ((*text & 0x1f) << 6) | (text[1] & 0x3f);
395 text += 2;
396 }
397 }
398 else if ((*text & 0xf0) == 0xe0)
399 {
400 if ((text[1] & 0xc0) != 0x80 || (text[2] & 0xc0) != 0x80)
401 {
402 progress_error(HD_ERROR_READ_ERROR, "Bad UTF-8 character sequence %02X %02X %02X.", *text, text[1], text[2]);
403 *bufptr++ = '?';
404 text ++;
405 }
406 else
407 {
408 unich = ((*text & 0x0f) << 12) | ((text[1] & 0x3f) << 6) | (text[2] & 0x3f);
409 text += 3;
410 }
411 }
412 else
413 {
414 progress_error(HD_ERROR_READ_ERROR, "Bad UTF-8 character sequence %02X.", *text);
415 *bufptr++ = '?';
416 text ++;
417 }
418
419 if (unich)
420 {
421 uchar ch = htmlMapUnicode(unich);
422 /* 8-bit character */
423
424 *bufptr++ = ch ? ch : '?';
425 }
426 }
427 else
428 *bufptr++ = *text++;
429 }
430
431 *bufptr = '\0';
432
433 return (buffer);
434 }
435
436
437 /*
438 * 'make_anchor()' - Make an anchor for internal links from a block node.
439 */
440
441 static uchar * /* O - Anchor string */
make_anchor(mmd_t * block)442 make_anchor(mmd_t *block) /* I - Block node */
443 {
444 mmd_t *node; /* Current child node */
445 const char *text; /* Text from block */
446 uchar *bufptr; /* Pointer into buffer */
447 static uchar buffer[1024]; /* Buffer for anchor string */
448
449
450 for (bufptr = buffer, node = mmdGetFirstChild(block); node; node = mmdGetNextSibling(node))
451 {
452 if (mmdGetWhitespace(node) && bufptr < (buffer + sizeof(buffer) - 1))
453 *bufptr++ = '-';
454 for (text = mmdGetText(node); text && *text && bufptr < (buffer + sizeof(buffer) -1); text ++)
455 {
456 if ((*text >= '0' && *text <= '9') || (*text >= 'a' && *text <= 'z') || (*text >= 'A' && *text <= 'Z') || *text == '.' || *text == '-')
457 *bufptr++ = (uchar)tolower(*text);
458 else if (*text == ' ')
459 *bufptr++ = '-';
460 }
461 }
462
463 *bufptr = '\0';
464
465 return (buffer);
466 }
467
468
469 /*
470 * 'make_anchor()' - Make an anchor for internal links from text.
471 */
472
473 static uchar * /* O - Anchor string */
make_anchor(const uchar * text)474 make_anchor(const uchar *text) /* I - Text */
475 {
476 uchar *bufptr; /* Pointer into buffer */
477 static uchar buffer[1024]; /* Buffer for anchor string */
478
479
480 for (bufptr = buffer; *text && bufptr < (buffer + sizeof(buffer) - 1); text ++)
481 {
482 if ((*text >= '0' && *text <= '9') || (*text >= 'a' && *text <= 'z') || (*text >= 'A' && *text <= 'Z') || *text == '.' || *text == '-')
483 *bufptr++ = (uchar)tolower(*text);
484 else if (*text == ' ')
485 *bufptr++ = '-';
486 }
487
488 *bufptr = '\0';
489
490 return (buffer);
491 }
492