1 /*
2 *   Copyright (c) 2016, Jiri Techet
3 *
4 *   This source code is released for free distribution under the terms of the
5 *   GNU General Public License version 2 or (at your option) any later version.
6 *
7 *   This module contains functions for generating tags for HTML language
8 *   files.
9 */
10 
11 #include "general.h"
12 
13 #include <string.h>
14 #include <ctype.h>
15 
16 #include "entry.h"
17 #include "parse.h"
18 #include "read.h"
19 #include "routines.h"
20 #include "keyword.h"
21 #include "promise.h"
22 
/* The max. number of nested elements - prevents further recursion if the limit
 * is exceeded and avoids stack overflow for invalid input containing too many
 * open tags.  readTag() only descends into an element's content while its
 * depth argument is below this value. */
#define MAX_DEPTH 1000
27 
28 
/* Kinds of tags emitted by this parser.  The enumerators are listed in the
 * same order as the entries of HtmlKinds[]; keep both in sync. */
typedef enum {
	K_ANCHOR,
	K_CLASS,
	K_HEADING1,
	K_HEADING2,
	K_HEADING3,
	K_STYLESHEET,
	/* Misspelled historical name.  Kept as an alias with the same value so
	 * code using the old spelling keeps compiling; prefer K_STYLESHEET. */
	K_STYELSHEET = K_STYLESHEET,
	K_ID,
	K_SCRIPT,
} htmlKind;
39 
40 
/* Role passed to makeSimpleRefTag() for K_CLASS references
 * (class names found in a class="..." attribute). */
typedef enum {
	CLASS_KIND_ATTRIBUTE_ROLE,
} ClassRole;

/* Role passed to makeSimpleRefTag() for K_SCRIPT references
 * (the value of <script src="...">). */
typedef enum {
	SCRIPT_KIND_EXTERNAL_FILE_ROLE,
} ScriptRole;

/* Role passed to makeSimpleRefTag() for stylesheet references
 * (the href of a <link> whose rel contains "stylesheet"). */
typedef enum {
	STYLESHEET_KIND_EXTERNAL_FILE_ROLE,
} StylesheetRole;
52 
/* Role tables attached to the reference-only kinds in HtmlKinds[] through
 * ATTACH_ROLES().  Each table has one entry, matching the single enumerator
 * of the corresponding *Role enum.
 * NOTE(review): the initializer fields look like { enabled, name,
 * description } - confirm against the roleDefinition declaration. */
static roleDefinition ClassRoles [] = {
	{ true, "attribute", "assigned as attributes" },
};

static roleDefinition ScriptRoles [] = {
	{ true, "extFile", "referenced as external files" },
};

static roleDefinition StylesheetRoles [] = {
	{ true, "extFile", "referenced as external files" },
};
64 
/* Kind table registered in HtmlParser().  Entries appear in the same order as
 * the htmlKind enumerators.  The class, stylesheet and script kinds are
 * reference-only and carry a role table.
 * NOTE(review): the positional fields look like { enabled, letter, name,
 * description } - confirm against the kindDefinition declaration. */
static kindDefinition HtmlKinds [] = {
	{ true, 'a', "anchor",		"named anchors" },
	{ true, 'c', "class",		"classes",
	  .referenceOnly = true, ATTACH_ROLES (ClassRoles)},
	{ true, 'h', "heading1",	"H1 headings" },
	{ true, 'i', "heading2",	"H2 headings" },
	{ true, 'j', "heading3",	"H3 headings" },
	{ true, 'C', "stylesheet",	"stylesheets",
	  .referenceOnly = true, ATTACH_ROLES (StylesheetRoles)},
	{ true, 'I', "id",			"identifiers" },
	{ true, 'J', "script",		"scripts",
	  .referenceOnly = true, ATTACH_ROLES (ScriptRoles)},
};
78 
/* Keyword ids for the element and attribute names the parser cares about.
 *
 * readTag() decides whether an element is void with the range test
 * KEYWORD_area <= id <= KEYWORD_wbr, so ONLY void elements may be listed
 * between those two enumerators.  Attribute names (class, href, id, rel, src)
 * used to be interleaved alphabetically inside that range, which made
 * elements carrying those names be treated as void; they now form their own
 * group ahead of the range. */
typedef enum {
	/* elements with dedicated handling */
	KEYWORD_h1,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,

	/* attribute names */
	KEYWORD_name,
	KEYWORD_class,
	KEYWORD_href,
	KEYWORD_id,
	KEYWORD_rel,
	KEYWORD_src,

	/* void elements: KEYWORD_area must stay first and KEYWORD_wbr last */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_source,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
111 
/* Maps element and attribute names to their keywordId.  readToken()
 * lower-cases every NAME token before it is looked up, so all names here are
 * lower case.  Entries refer to the ids symbolically; whether an element is
 * void is decided by readTag() from the keywordId values, not from the
 * position of an entry in this table. */
static const keywordTable HtmlKeywordTable[] = {
	{"h1", KEYWORD_h1},
	{"h2", KEYWORD_h2},
	{"h3", KEYWORD_h3},
	{"a", KEYWORD_a},
	{"script", KEYWORD_script},
	{"style", KEYWORD_style},
	{"name", KEYWORD_name},

	/* void elements, listed alphabetically together with the attribute
	 * names class, href, id, rel and src */
	{"area", KEYWORD_area},
	{"base", KEYWORD_base},
	{"br", KEYWORD_br},
	{"class", KEYWORD_class},
	{"col", KEYWORD_col},
	{"command", KEYWORD_command},
	{"embed", KEYWORD_embed},
	{"hr", KEYWORD_hr},
	{"href", KEYWORD_href},
	{"id", KEYWORD_id},
	{"img", KEYWORD_img},
	{"input", KEYWORD_input},
	{"keygen", KEYWORD_keygen},
	{"link", KEYWORD_link},
	{"meta", KEYWORD_meta},
	{"param", KEYWORD_param},
	{"rel", KEYWORD_rel},
	{"source", KEYWORD_source},
	{"src", KEYWORD_src},
	{"track", KEYWORD_track},
	{"wbr", KEYWORD_wbr},
};
144 
/* Token types produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF,			/* end of input */
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,			/* text up to the next '<', from readTokenText() */
	TOKEN_TAG_START,	/* <  */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* >  */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,		/* =  */
	TOKEN_COMMENT,		/* <!-- ... -->, only when comments are not skipped */
	TOKEN_OTHER			/* "<!" not starting a comment, "<?", or a lone '/' */
} tokenType;
158 
#ifdef DEBUG
/* Printable names of the tokenType values, indexed by token type; built only
 * in DEBUG builds and referenced by the (currently disabled) dumpToken(). */
const char *tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
176 
/* The token most recently read.  A single instance is created in
 * findHtmlTags() and passed down through every reader function. */
typedef struct {
	tokenType type;		/* kind of the token */
	vString *string;	/* token text; cleared at the start of each read and
						 * filled for NAME, STRING and collected TEXT tokens */
} tokenInfo;
181 
182 
/* Language id of this parser, stored by initialize() and passed to
 * lookupKeyword(). */
static int Lang_html;
184 
185 
/* Forward declaration: readTagContent() and readTag() call each other. */
static void readTag (tokenInfo *token, vString *text, int depth);
187 
#ifdef DEBUG
#if 0
/* Debugging aid, compiled out by the "#if 0" above: prints the token type,
 * the token text and two caller-supplied context strings to stderr.
 * EXTRA_CONTEXT may be NULL, in which case "_" is printed instead.
 * NOTE(review): uses fprintf(); confirm <stdio.h> is reachable through the
 * project headers before enabling. */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 tokenTypes[token->type], vStringValue(token->string),
			 context, extra_context? extra_context: "_");
}
#endif
#endif
198 
/*
 * Consumes character data up to (not including) the next '<' or the end of
 * the input.
 *
 * When COLLECTTEXT is true the characters are stored in token->string with
 * every white space character turned into a blank and runs of blanks
 * collapsed into one; otherwise the data is discarded.
 *
 * On return token->type is TOKEN_TEXT if a '<' was found (it is pushed back
 * to the input) or TOKEN_EOF if the input ended first.
 */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int previous = 'X';  /* any non-blank seed does */

	vStringClear (token->string);

	for (;;)
	{
		int ch = getcFromInputFile ();

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}

		if (ch == '<')
		{
			ungetcToInputFile (ch);
			token->type = TOKEN_TEXT;
			return;
		}

		if (!collectText)
			continue;

		if (isspace (ch))
			ch = ' ';

		/* drop a blank that directly follows another blank */
		if (ch == ' ' && previous == ' ')
			continue;

		vStringPut (token->string, ch);
		previous = ch;
	}
}
236 
/*
 * Reads the next markup token into TOKEN, skipping leading white space.
 *
 * token->string is cleared first and receives text only for TOKEN_STRING
 * (the value without its quotes) and TOKEN_NAME (lower-cased).
 *
 * "<!-- ... -->" is consumed as a whole; with SKIPCOMMENTS set, reading
 * simply continues after it, otherwise TOKEN_COMMENT is reported.
 */
static void readToken (tokenInfo *const token, bool skipComments)
{
	vStringClear (token->string);

	for (;;)
	{
		int ch;

		do
			ch = getcFromInputFile ();
		while (isspace (ch));

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}

		if (ch == '>')
		{
			token->type = TOKEN_TAG_END;
			return;
		}

		if (ch == '=')
		{
			token->type = TOKEN_EQUAL;
			return;
		}

		if (ch == '/')
		{
			const int next = getcFromInputFile ();

			if (next == '>')
				token->type = TOKEN_TAG_END2;
			else
			{
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
			}
			return;
		}

		if (ch == '"' || ch == '\'')
		{
			/* quoted value: everything up to the matching quote or EOF */
			const int quote = ch;

			for (ch = getcFromInputFile ();
				 ch != EOF && ch != quote;
				 ch = getcFromInputFile ())
			{
				vStringPut (token->string, ch);
			}
			token->type = TOKEN_STRING;
			return;
		}

		if (ch != '<')
		{
			/* name: runs until white space, a markup character or EOF */
			do
			{
				vStringPut (token->string, tolower (ch));
				ch = getcFromInputFile ();
			}
			while (! (ch == EOF || isspace (ch) ||
					  ch == '<' || ch == '>' || ch == '/' ||
					  ch == '=' || ch == '\'' || ch == '"'));

			if (ch != EOF)
				ungetcToInputFile (ch);
			token->type = TOKEN_NAME;
			return;
		}

		/* from here on: ch == '<' */
		{
			int next = getcFromInputFile ();

			if (next == '/')
			{
				token->type = TOKEN_TAG_START2;
				return;
			}

			if (next == '?')
			{
				token->type = TOKEN_OTHER;
				return;
			}

			if (next != '!')
			{
				ungetcToInputFile (next);
				token->type = TOKEN_TAG_START;
				return;
			}

			/* "<!": a comment if followed by "--" */
			next = getcFromInputFile ();
			if (next == '-')
			{
				next = getcFromInputFile ();
				if (next == '-')
				{
					/* slide a three character window over the input until it
					 * holds "-->" or the input ends */
					int older = ' ';
					int newer = ' ';

					for (;;)
					{
						const int current = getcFromInputFile ();

						if (current == EOF ||
							(older == '-' && newer == '-' && current == '>'))
							break;

						older = newer;
						newer = current;
					}

					if (skipComments)
						continue;  /* read the token after the comment */

					token->type = TOKEN_COMMENT;
					return;
				}
			}

			ungetcToInputFile (next);
			token->type = TOKEN_OTHER;
			return;
		}
	}
}
350 
/*
 * Appends APPENDEDTEXT to TEXT.  Nothing happens when TEXT is NULL or
 * APPENDEDTEXT is empty.  If TEXT ends with a blank and APPENDEDTEXT starts
 * with one, vStringStripTrailing() is applied to TEXT first so that the
 * junction does not end up with a doubled blank.
 */
static void appendText (vString *text, vString *appendedText)
{
	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	if (vStringLength (text) > 0
		&& vStringLast (text) == ' '
		&& vStringChar (appendedText, 0) == ' ')
	{
		vStringStripTrailing (text);
	}

	vStringCat (text, appendedText);
}
363 
/*
 * Reads the content of an element: character data, comments and nested
 * elements (handled by readTag() with DEPTH + 1), until a token that is
 * neither a comment nor a tag start shows up.
 *
 * Character data is accumulated in TEXT when TEXT is not NULL.
 * *LINE and *LINEOFFSET receive the input position sampled just before the
 * last token was read.
 *
 * Returns true if that last token was "</".
 */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	tokenType last;

	for (;;)
	{
		/* character data in front of the next markup token */
		readTokenText (token, text != NULL);
		appendText (text, token->string);

		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();
		readToken (token, false);
		last = token->type;

		if (last == TOKEN_TAG_START)
			readTag (token, text, depth + 1);
		else if (last != TOKEN_COMMENT)
			break;
	}

	return last == TOKEN_TAG_START2;
}
389 
/*
 * Skips forward through the body of a <script> element until a "</" token
 * immediately followed by the name "script" is found, or the input ends.
 *
 * On success *LINE and *LINEOFFSET are set to the input position sampled
 * right before that "</" token was read, and true is returned.  If the end
 * of the input is reached first, false is returned and the two output
 * parameters are left untouched.
 */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterCloseStart = false;  /* previous token was "</" */
	long closeLine = 0;
	long closeOffset = 0;

	for (;;)
	{
		const long tokenLine = getInputLineNumber ();
		const long tokenOffset = getInputLineOffset ();

		readToken (token, false);

		if (token->type == TOKEN_TAG_START2)
		{
			/* remember where this candidate end tag begins */
			afterCloseStart = true;
			closeLine = tokenLine;
			closeOffset = tokenOffset;
		}
		else if (afterCloseStart
				 && token->type == TOKEN_NAME
				 && lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
		{
			*line = closeLine;
			*lineOffset = closeOffset;
			return true;
		}
		else
			afterCloseStart = false;

		if (token->type == TOKEN_EOF)
			return false;
	}
}
429 
makeClassRefTags(const char * classes)430 static void makeClassRefTags (const char *classes)
431 {
432 	vString *klass = vStringNew ();
433 
434 	do
435 	{
436 		if (*classes && !isspace (*classes))
437 			vStringPut (klass, *classes);
438 		else if (!vStringIsEmpty (klass))
439 		{
440 			makeSimpleRefTag (klass, K_CLASS,
441 							  CLASS_KIND_ATTRIBUTE_ROLE);
442 			vStringClear (klass);
443 		}
444 
445 		if (!*classes)
446 			break;
447 
448 		classes++;
449 	} while (1);
450 
451 	vStringDelete (klass);
452 }
453 
/*
 * Consumes the "= value" part that may follow an attribute name.
 *
 * Returns true if a '=' was read; TOKEN then holds the token following it
 * (the caller checks its type).  Returns false otherwise, with TOKEN holding
 * the token that was read in place of the '='.
 */
static bool readAttributeValue (tokenInfo *token)
{
	readToken (token, true);
	if (token->type != TOKEN_EQUAL)
		return false;

	readToken (token, true);
	return true;
}

/*
 * Parses one element; called right after its opening '<' has been read.
 *
 * Attributes are scanned up to '>', "/>" or the end of the input, emitting:
 *   - K_CLASS reference tags for class="...",
 *   - a K_ID tag for id="...",
 *   - a K_ANCHOR tag for <a name=...> (quoted or unquoted; an unquoted value
 *     arrives lower-cased because readToken() lower-cases NAME tokens),
 *   - a K_SCRIPT reference tag for <script src="...">,
 *   - a stylesheet reference tag for <link> once both a rel containing
 *     "stylesheet" and a non-empty href have been seen, in either order.
 *
 * For a non-void element closed by '>' and nested less than MAX_DEPTH deep,
 * the content is processed as well:
 *   - <script>: content is skipped up to "</script" and handed to the
 *     JavaScript parser through makePromise(),
 *   - <style>: content is handed to the CSS parser through makePromise(),
 *   - <h1>/<h2>/<h3>: the collected text becomes a heading tag, but only for
 *     the outermost heading (the one that allocated the text buffer) and only
 *     if the end tag matches.
 *
 * TEXT is the buffer of an enclosing heading collecting its text, or NULL.
 * DEPTH is the current nesting level.
 */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool textCreated = false;

	readToken (token, true);
	if (token->type == TOKEN_NAME)
	{
		keywordId startTag;
		bool isHeading;
		bool isVoid;
		vString *stylesheet = NULL;
		bool stylesheet_expectation = false;

		startTag = lookupKeyword (vStringValue (token->string), Lang_html);
		isHeading = (startTag == KEYWORD_h1 || startTag == KEYWORD_h2 || startTag == KEYWORD_h3);
		isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
		if (text == NULL && isHeading)
		{
			/* outermost heading: it owns the text buffer */
			text = vStringNew ();
			textCreated = true;
		}

		do
		{
			keywordId attribute = KEYWORD_NONE;

			readToken (token, true);
			if (token->type == TOKEN_NAME)
				attribute = lookupKeyword (vStringValue (token->string), Lang_html);

			if (attribute == KEYWORD_class)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeClassRefTags (vStringValue (token->string));
			}
			else if (attribute == KEYWORD_id)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleTag (token->string, K_ID);
			}
			else if (startTag == KEYWORD_a && attribute == KEYWORD_name)
			{
				if (readAttributeValue (token) &&
					(token->type == TOKEN_STRING || token->type == TOKEN_NAME))
					makeSimpleTag (token->string, K_ANCHOR);
			}
			else if (startTag == KEYWORD_script && attribute == KEYWORD_src)
			{
				if (readAttributeValue (token) && token->type == TOKEN_STRING)
					makeSimpleRefTag (token->string, K_SCRIPT,
									  SCRIPT_KIND_EXTERNAL_FILE_ROLE);
			}
			else if (startTag == KEYWORD_link)
			{
				if (attribute == KEYWORD_rel)
				{
					/* strcmp is not enough:
					 * e.g. <link href="fancy.css"
					 *            rel="alternate stylesheet" title="Fancy"> */
					if (readAttributeValue (token) &&
						token->type == TOKEN_STRING &&
						strstr (vStringValue (token->string), "stylesheet"))
						stylesheet_expectation = true;
				}
				else if (attribute == KEYWORD_href)
				{
					if (readAttributeValue (token) && token->type == TOKEN_STRING)
					{
						if (stylesheet == NULL)
							stylesheet = vStringNewCopy (token->string);
						else
							vStringCopy (stylesheet, token->string);
					}
				}

				/* rel and href may come in either order: emit as soon as
				 * both have been seen, then reset for a possible next pair */
				if (stylesheet_expectation && stylesheet && !vStringIsEmpty (stylesheet))
				{
					makeSimpleRefTag (stylesheet, K_STYELSHEET,
									  STYLESHEET_KIND_EXTERNAL_FILE_ROLE);
					stylesheet_expectation = false;
					vStringClear (stylesheet);
				}
			}
		}
		while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
			   token->type != TOKEN_EOF);

		vStringDelete (stylesheet);
		stylesheet = NULL;

		if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
		{
			/* position right after the start tag's '>' */
			long startSourceLineNumber = getSourceLineNumber ();
			long startLineNumber = getInputLineNumber ();
			long startLineOffset = getInputLineOffset ();
			long endLineNumber;
			long endLineOffset;

			if (startTag == KEYWORD_script)
			{
				if (skipScriptContent (token, &endLineNumber, &endLineOffset))
					makePromise ("JavaScript", startLineNumber, startLineOffset,
								 endLineNumber, endLineOffset, startSourceLineNumber);
				readToken (token, true);
				goto out;
			}

			if (readTagContent (token, text, &endLineNumber, &endLineOffset, depth))
			{
				/* "</" was read; next comes the end tag name */
				readToken (token, true);
				if (isHeading && textCreated && vStringLength (text) > 0)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
					{
						htmlKind headingKind;

						if (startTag == KEYWORD_h1)
							headingKind = K_HEADING1;
						else if (startTag == KEYWORD_h2)
							headingKind = K_HEADING2;
						else
							headingKind = K_HEADING3;

						vStringStripLeading (text);
						vStringStripTrailing (text);
						makeSimpleTag (text, headingKind);
					}
				}
				else if (startTag == KEYWORD_style)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
						makePromise ("CSS", startLineNumber, startLineOffset,
									 endLineNumber, endLineOffset, startSourceLineNumber);
				}

				readToken (token, true);
			}
		}
	}

 out:
	if (textCreated)
		vStringDelete (text);
}
632 
findHtmlTags(void)633 static void findHtmlTags (void)
634 {
635 	tokenInfo token;
636 
637 	token.string = vStringNew ();
638 
639 	do
640 	{
641 		readToken (&token, true);
642 		if (token.type == TOKEN_TAG_START)
643 			readTag (&token, NULL, 0);
644 	}
645 	while (token.type != TOKEN_EOF);
646 
647 	vStringDelete (token.string);
648 }
649 
/* Initialization callback registered in HtmlParser(): stores the language id
 * given to this parser so that lookupKeyword() calls can pass it. */
static void initialize (const langType language)
{
	Lang_html = language;
}
654 
655 /* parser definition */
HtmlParser(void)656 extern parserDefinition* HtmlParser (void)
657 {
658 	static const char *const extensions [] = { "htm", "html", NULL };
659 	parserDefinition* def = parserNew ("HTML");
660 	def->kindTable        = HtmlKinds;
661 	def->kindCount    = ARRAY_SIZE (HtmlKinds);
662 	def->extensions   = extensions;
663 	def->parser       = findHtmlTags;
664 	def->initialize   = initialize;
665 	def->keywordTable = HtmlKeywordTable;
666 	def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
667 	return def;
668 }
669