1 /*
2  * Markdown parsing definitions for HTMLDOC, a HTML document processing program.
3  *
4  * Copyright © 2017-2020 by Michael R Sweet.
5  *
6  * This program is free software.  Distribution and use rights are outlined in
7  * the file "COPYING".
8  */
9 
10 /*
11  * Include necessary headers...
12  */
13 
14 #  include "markdown.h"
15 #  include "mmd.h"
16 #  include "progress.h"
17 
18 
/*
 * Local functions...
 *
 * Note: make_anchor() is declared twice with different parameter types, so
 * this file relies on C++ function overloading.
 */

static void       add_block(tree_t *hparent, mmd_t *parent);
                                        /* Convert a block node and its children */
static void       add_leaf(tree_t *hparent, mmd_t *node);
                                        /* Convert an inline (leaf) node */
static uchar      *get_text(uchar *text);
                                        /* Convert Markdown text to HTMLDOC's charset */
static uchar      *make_anchor(mmd_t *block);
                                        /* Build an anchor from a block's child text */
static uchar      *make_anchor(const uchar *text);
                                        /* Build an anchor from a string */
28 
29 
30 /*
31  * 'mdReadFile()' - Read a Markdown file.
32  */
33 
34 tree_t *				/* O - HTML document tree */
mdReadFile(tree_t * parent,FILE * fp,const char * base)35 mdReadFile(tree_t     *parent,		/* I - Parent node */
36            FILE       *fp,		/* I - File to read from */
37            const char *base)		/* I - Base path/URL */
38 {
39   mmd_t       *doc = mmdLoadFile(fp);   /* Markdown document */
40   tree_t      *html,                    /* HTML element */
41               *head,                    /* HEAD element */
42               *temp,                    /* META/TITLE element */
43               *body;                    /* BODY element */
44   const char  *meta;                    /* Title, author, etc. */
45 
46 
47   html = htmlAddTree(parent, MARKUP_HTML, NULL);
48   if ((meta = mmdGetMetadata(doc, "lang")) != NULL)
49     htmlSetVariable(html, (uchar *)"lang", get_text((uchar *)meta));
50 
51   head = htmlAddTree(html, MARKUP_HEAD, NULL);
52   if ((meta = mmdGetMetadata(doc, "title")) != NULL)
53   {
54     temp = htmlAddTree(head, MARKUP_TITLE, NULL);
55     htmlAddTree(temp, MARKUP_NONE, get_text((uchar *)meta));
56   }
57   if ((meta = mmdGetMetadata(doc, "author")) != NULL)
58   {
59     temp = htmlAddTree(head, MARKUP_META, NULL);
60     htmlSetVariable(temp, (uchar *)"name", (uchar *)"author");
61     htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
62   }
63   if ((meta = mmdGetMetadata(doc, "copyright")) != NULL)
64   {
65     temp = htmlAddTree(head, MARKUP_META, NULL);
66     htmlSetVariable(temp, (uchar *)"name", (uchar *)"copyright");
67     htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
68   }
69   if ((meta = mmdGetMetadata(doc, "version")) != NULL)
70   {
71     temp = htmlAddTree(head, MARKUP_META, NULL);
72     htmlSetVariable(temp, (uchar *)"name", (uchar *)"version");
73     htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
74   }
75   if ((meta = mmdGetMetadata(doc, "language")) != NULL)
76   {
77     htmlSetVariable(html, (uchar *)"lang", get_text((uchar *)meta));
78   }
79   if ((meta = mmdGetMetadata(doc, "subject")) != NULL)
80   {
81     temp = htmlAddTree(head, MARKUP_META, NULL);
82     htmlSetVariable(temp, (uchar *)"name", (uchar *)"keywords");
83     htmlSetVariable(temp, (uchar *)"content", get_text((uchar *)meta));
84   }
85 
86   body = htmlAddTree(html, MARKUP_BODY, NULL);
87   add_block(body, doc);
88 
89   mmdFree(doc);
90 
91   return (html);
92 }
93 
94 
95 /*
96  * 'add_block()' - Add a block node.
97  */
98 
99 static void
add_block(tree_t * html,mmd_t * parent)100 add_block(tree_t *html,                 /* I - Parent HTML node */
101           mmd_t  *parent)               /* I - Parent node */
102 {
103   markup_t      element;                /* Enclosing element, if any */
104   mmd_t         *node;                  /* Current child node */
105   mmd_type_t    type;                   /* Node type */
106   tree_t        *block;                 /* Block node */
107   const char	*align = NULL;		/* Alignment */
108 
109 
110   switch (type = mmdGetType(parent))
111   {
112     case MMD_TYPE_BLOCK_QUOTE :
113         element = MARKUP_BLOCKQUOTE;
114         break;
115 
116     case MMD_TYPE_ORDERED_LIST :
117         element = MARKUP_OL;
118         break;
119 
120     case MMD_TYPE_UNORDERED_LIST :
121         element = MARKUP_UL;
122         break;
123 
124     case MMD_TYPE_LIST_ITEM :
125         element = MARKUP_LI;
126         break;
127 
128     case MMD_TYPE_HEADING_1 :
129         element = MARKUP_H1;
130         break;
131 
132     case MMD_TYPE_HEADING_2 :
133         element = MARKUP_H2;
134         break;
135 
136     case MMD_TYPE_HEADING_3 :
137         element = MARKUP_H3;
138         break;
139 
140     case MMD_TYPE_HEADING_4 :
141         element = MARKUP_H4;
142         break;
143 
144     case MMD_TYPE_HEADING_5 :
145         element = MARKUP_H5;
146         break;
147 
148     case MMD_TYPE_HEADING_6 :
149         element = MARKUP_H6;
150         break;
151 
152     case MMD_TYPE_PARAGRAPH :
153         element = MARKUP_P;
154         break;
155 
156     case MMD_TYPE_CODE_BLOCK :
157         block = htmlAddTree(html, MARKUP_PRE, NULL);
158 
159         for (node = mmdGetFirstChild(parent); node; node = mmdGetNextSibling(node))
160           htmlAddTree(block, MARKUP_NONE, get_text((uchar *)mmdGetText(node)));
161         return;
162 
163     case MMD_TYPE_THEMATIC_BREAK :
164         htmlAddTree(html, MARKUP_HR, NULL);
165         return;
166 
167     case MMD_TYPE_TABLE :
168         element = MARKUP_TABLE;
169         break;
170 
171     case MMD_TYPE_TABLE_HEADER :
172         element = MARKUP_THEAD;
173         break;
174 
175     case MMD_TYPE_TABLE_BODY :
176         element = MARKUP_TBODY;
177         break;
178 
179     case MMD_TYPE_TABLE_ROW :
180         element = MARKUP_TR;
181         break;
182 
183     case MMD_TYPE_TABLE_HEADER_CELL :
184         element = MARKUP_TH;
185         break;
186 
187     case MMD_TYPE_TABLE_BODY_CELL_LEFT :
188         element = MARKUP_TD;
189         break;
190 
191     case MMD_TYPE_TABLE_BODY_CELL_CENTER :
192         element = MARKUP_TD;
193         align   = "center";
194         break;
195 
196     case MMD_TYPE_TABLE_BODY_CELL_RIGHT :
197         element = MARKUP_TD;
198         align   = "right";
199         break;
200 
201     default :
202         element = MARKUP_NONE;
203         break;
204   }
205 
206   if (element != MARKUP_NONE)
207     block = htmlAddTree(html, element, NULL);
208   else
209     block = html;
210 
211   if (align)
212   {
213     htmlSetVariable(block, (uchar *)"align", (uchar *)align);
214 
215     if (!strcmp(align, "center"))
216       block->halignment = ALIGN_CENTER;
217     else
218       block->halignment = ALIGN_RIGHT;
219   }
220   else if (element == MARKUP_TH)
221   {
222     block->halignment = ALIGN_CENTER;
223     htmlSetVariable(block, (uchar *)"bgcolor", (uchar *)"#cccccc");
224   }
225   else if (element == MARKUP_TABLE)
226   {
227     htmlSetVariable(block, (uchar *)"border", (uchar *)"1");
228     htmlSetVariable(block, (uchar *)"cellpadding", (uchar *)"2");
229   }
230 
231   if (type >= MMD_TYPE_HEADING_1 && type <= MMD_TYPE_HEADING_6)
232   {
233    /*
234     * Add an anchor for each heading...
235     */
236 
237     block = htmlAddTree(block, MARKUP_A, NULL);
238     htmlSetVariable(block, (uchar *)"id", make_anchor(parent));
239   }
240 
241   if (type >= MMD_TYPE_TABLE_BODY_CELL_LEFT && type <= MMD_TYPE_TABLE_BODY_CELL_RIGHT && !mmdGetFirstChild(parent))
242   {
243     // Make sure table body cells have at least a non-breaking space...
244     htmlAddTree(block, MARKUP_NONE, (uchar *)" ");
245   }
246   else
247   {
248     for (node = mmdGetFirstChild(parent); node; node = mmdGetNextSibling(node))
249     {
250       if (mmdIsBlock(node))
251 	add_block(block, node);
252       else
253 	add_leaf(block, node);
254     }
255   }
256 }
257 
258 
259 /*
260  * 'add_leaf()' - Add a leaf node.
261  */
262 
263 static void
add_leaf(tree_t * html,mmd_t * node)264 add_leaf(tree_t *html,                  /* I - Parent HTML node */
265          mmd_t  *node)                  /* I - Leaf node */
266 {
267   tree_t        *parent;                /* HTML node for this text */
268   markup_t      element;                /* HTML element for this text */
269   uchar         buffer[1024],           /* Text with any added whitespace */
270                 *text,                  /* Text to write */
271                 *url;                   /* URL to write */
272   int		whitespace;		/* Whitespace before text? */
273 
274 
275   text       = get_text((uchar *)mmdGetText(node));
276   url        = (uchar *)mmdGetURL(node);
277   whitespace = mmdGetWhitespace(node);
278 
279   switch (mmdGetType(node))
280   {
281     case MMD_TYPE_EMPHASIZED_TEXT :
282         element = MARKUP_EM;
283         break;
284 
285     case MMD_TYPE_STRONG_TEXT :
286         element = MARKUP_STRONG;
287         break;
288 
289     case MMD_TYPE_STRUCK_TEXT :
290         element = MARKUP_DEL;
291         break;
292 
293     case MMD_TYPE_LINKED_TEXT :
294         element = MARKUP_A;
295         break;
296 
297     case MMD_TYPE_CODE_TEXT :
298         element = MARKUP_CODE;
299         break;
300 
301     case MMD_TYPE_IMAGE :
302         if (mmdGetWhitespace(node))
303           htmlAddTree(html, MARKUP_NONE, (uchar *)" ");
304 
305         parent = htmlAddTree(html, MARKUP_IMG, NULL);
306         htmlSetVariable(parent, (uchar *)"src", url);
307         if (text)
308           htmlSetVariable(parent, (uchar *)"alt", text);
309         return;
310 
311     case MMD_TYPE_HARD_BREAK :
312         htmlAddTree(html, MARKUP_BR, NULL);
313         return;
314 
315     case MMD_TYPE_SOFT_BREAK :
316         htmlAddTree(html, MARKUP_WBR, NULL);
317         return;
318 
319     case MMD_TYPE_METADATA_TEXT :
320         return;
321 
322     default :
323         element = MARKUP_NONE;
324         break;
325   }
326 
327   if (element == MARKUP_NONE)
328     parent = html;
329   else if ((parent = html->last_child) == NULL || parent->markup != element)
330   {
331     if (whitespace)
332     {
333       htmlAddTree(html, MARKUP_NONE, (uchar *)" ");
334       whitespace = 0;
335     }
336 
337     parent = htmlAddTree(html, element, NULL);
338 
339     if (element == MARKUP_A && url)
340     {
341       if (!strcmp((char *)url, "@"))
342         htmlSetVariable(parent, (uchar *)"href", make_anchor(text));
343       else
344         htmlSetVariable(parent, (uchar *)"href", url);
345     }
346   }
347 
348   if (whitespace)
349   {
350     buffer[0] = ' ';
351     strlcpy((char *)buffer + 1, (char *)text, sizeof(buffer) - 1);
352     text = buffer;
353   }
354 
355   htmlAddTree(parent, MARKUP_NONE, text);
356 }
357 
358 
359 /*
360  * 'get_text()' - Get Markdown text in HTMLDOC's charset.
361  */
362 
363 static uchar *                          /* O - Encoded text */
get_text(uchar * text)364 get_text(uchar *text)                   /* I - Markdown text */
365 {
366   uchar         *bufptr,                /* Pointer into buffer */
367                 *bufend;                /* End of buffer */
368   int           unich;                  /* Unicode character */
369   static uchar  buffer[8192];           /* Temporary buffer */
370 
371 
372   if (!_htmlUTF8)
373     return (text);
374 
375   bufptr = buffer;
376   bufend = buffer + sizeof(buffer) - 1;
377 
378   while (*text && bufptr < bufend)
379   {
380     if (*text & 0x80)
381     {
382       unich = 0;
383 
384       if ((*text & 0xe0) == 0xc0)
385       {
386         if ((text[1] & 0xc0) != 0x80)
387         {
388           progress_error(HD_ERROR_READ_ERROR, "Bad UTF-8 character sequence %02X %02X.", *text, text[1]);
389           *bufptr++ = '?';
390           text ++;
391         }
392         else
393         {
394           unich = ((*text & 0x1f) << 6) | (text[1] & 0x3f);
395           text += 2;
396         }
397       }
398       else if ((*text & 0xf0) == 0xe0)
399       {
400         if ((text[1] & 0xc0) != 0x80 || (text[2] & 0xc0) != 0x80)
401         {
402           progress_error(HD_ERROR_READ_ERROR, "Bad UTF-8 character sequence %02X %02X %02X.", *text, text[1], text[2]);
403           *bufptr++ = '?';
404           text ++;
405         }
406         else
407         {
408           unich = ((*text & 0x0f) << 12) | ((text[1] & 0x3f) << 6) | (text[2] & 0x3f);
409           text += 3;
410         }
411       }
412       else
413       {
414         progress_error(HD_ERROR_READ_ERROR, "Bad UTF-8 character sequence %02X.", *text);
415         *bufptr++ = '?';
416         text ++;
417       }
418 
419       if (unich)
420       {
421         uchar ch = htmlMapUnicode(unich);
422 					/* 8-bit character */
423 
424         *bufptr++ = ch ? ch : '?';
425       }
426     }
427     else
428       *bufptr++ = *text++;
429   }
430 
431   *bufptr = '\0';
432 
433   return (buffer);
434 }
435 
436 
437 /*
438  * 'make_anchor()' - Make an anchor for internal links from a block node.
439  */
440 
441 static uchar *                          /* O - Anchor string */
make_anchor(mmd_t * block)442 make_anchor(mmd_t *block)               /* I - Block node */
443 {
444   mmd_t         *node;                  /* Current child node */
445   const char    *text;                  /* Text from block */
446   uchar         *bufptr;                /* Pointer into buffer */
447   static uchar  buffer[1024];           /* Buffer for anchor string */
448 
449 
450   for (bufptr = buffer, node = mmdGetFirstChild(block); node; node = mmdGetNextSibling(node))
451   {
452     if (mmdGetWhitespace(node) && bufptr < (buffer + sizeof(buffer) - 1))
453       *bufptr++ = '-';
454     for (text = mmdGetText(node); text && *text && bufptr < (buffer + sizeof(buffer) -1); text ++)
455     {
456       if ((*text >= '0' && *text <= '9') || (*text >= 'a' && *text <= 'z') || (*text >= 'A' && *text <= 'Z') || *text == '.' || *text == '-')
457         *bufptr++ = (uchar)tolower(*text);
458       else if (*text == ' ')
459         *bufptr++ = '-';
460     }
461   }
462 
463   *bufptr = '\0';
464 
465   return (buffer);
466 }
467 
468 
469 /*
470  * 'make_anchor()' - Make an anchor for internal links from text.
471  */
472 
473 static uchar *                          /* O - Anchor string */
make_anchor(const uchar * text)474 make_anchor(const uchar *text)          /* I - Text */
475 {
476   uchar         *bufptr;                /* Pointer into buffer */
477   static uchar  buffer[1024];           /* Buffer for anchor string */
478 
479 
480   for (bufptr = buffer; *text && bufptr < (buffer + sizeof(buffer) - 1); text ++)
481   {
482     if ((*text >= '0' && *text <= '9') || (*text >= 'a' && *text <= 'z') || (*text >= 'A' && *text <= 'Z') || *text == '.' || *text == '-')
483       *bufptr++ = (uchar)tolower(*text);
484     else if (*text == ' ')
485       *bufptr++ = '-';
486   }
487 
488   *bufptr = '\0';
489 
490   return (buffer);
491 }
492