1 /*
2 *   Copyright (c) 2016, Jiri Techet
3 *
4 *   This source code is released for free distribution under the terms of the
5 *   GNU General Public License version 2 or (at your option) any later version.
6 *
7 *   This module contains functions for generating tags for HTML language
8 *   files.
9 */
10 
11 #include "general.h"
12 
13 #include <string.h>
14 #include <ctype.h>
15 
16 #include "entry.h"
17 #include "parse.h"
18 #include "read.h"
19 #include "routines.h"
20 #include "keyword.h"
21 #include "promise.h"
22 
/* Upper bound on element nesting. readTag() only descends into an element's
 * content while its depth is below this value, so input with a huge number of
 * unclosed tags cannot recurse until the stack overflows. */
#define MAX_DEPTH 1000
27 
28 
/* Tag kinds emitted by this parser; the values index HtmlKinds[]. */
typedef enum {
	K_ANCHOR = 0,	/* <a name=...> */
	K_HEADING1,		/* <h1> */
	K_HEADING2,		/* <h2> */
	K_HEADING3		/* <h3> */
} htmlKind;
35 
36 
37 static kindDefinition HtmlKinds [] = {
38 	{ true, 'a', "anchor",		"named anchors" },
39 	{ true, 'h', "heading1",	"H1 headings" },
40 	{ true, 'i', "heading2",	"H2 headings" },
41 	{ true, 'j', "heading3",	"H3 headings" }
42 };
43 
/* Identifiers for the element and attribute names the parser cares about. */
typedef enum {
	/* elements that produce tags or need special content handling */
	KEYWORD_h1 = 0, KEYWORD_h2, KEYWORD_h3,
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,

	/* attribute of <a> that yields an anchor tag */
	KEYWORD_name,

	/* void elements: readTag() tests the range KEYWORD_area..KEYWORD_wbr,
	 * so these members must stay contiguous, first to last */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_source,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
71 
72 static const keywordTable HtmlKeywordTable[] = {
73 	{"h1", KEYWORD_h1},
74 	{"h2", KEYWORD_h2},
75 	{"h3", KEYWORD_h3},
76 	{"a", KEYWORD_a},
77 	{"script", KEYWORD_script},
78 	{"style", KEYWORD_style},
79 	{"name", KEYWORD_name},
80 
81 	/* void elements */
82 	{"area", KEYWORD_area},
83 	{"base", KEYWORD_base},
84 	{"br", KEYWORD_br},
85 	{"col", KEYWORD_col},
86 	{"command", KEYWORD_command},
87 	{"embed", KEYWORD_embed},
88 	{"hr", KEYWORD_hr},
89 	{"img", KEYWORD_img},
90 	{"input", KEYWORD_input},
91 	{"keygen", KEYWORD_keygen},
92 	{"link", KEYWORD_link},
93 	{"meta", KEYWORD_meta},
94 	{"param", KEYWORD_param},
95 	{"source", KEYWORD_source},
96 	{"track", KEYWORD_track},
97 	{"wbr", KEYWORD_wbr},
98 };
99 
/* Lexical token categories produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF = 0,		/* end of input */
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,			/* character data up to the next '<' */
	TOKEN_TAG_START,	/* <  */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* >  */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,		/* =  */
	TOKEN_COMMENT,		/* <!-- ... --> */
	TOKEN_OTHER			/* "<!", "<?" and a '/' not followed by '>' */
} tokenType;
113 
#ifdef DEBUG
/* Printable token-type names indexed by tokenType, compiled in debug builds
 * only. The table is read-only and used solely inside this file, so it has
 * internal linkage: a generic name such as "tokenTypes" must not be exported
 * where it could clash with an identically named symbol in another
 * translation unit. */
static const char *const tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
131 
132 typedef struct {
133 	tokenType type;
134 	vString *string;
135 } tokenInfo;
136 
137 
138 static int Lang_html;
139 
140 
141 static void readTag (tokenInfo *token, vString *text, int depth);
142 
#ifdef DEBUG
/* Debug helper: prints the token's type name and text to stderr, followed by
 * "context.extra_context"; a NULL extra_context is shown as "_". */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	const char *const typeName = tokenTypes[token->type];
	const char *const detail = (extra_context != NULL) ? extra_context : "_";

	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 typeName, vStringValue(token->string),
			 context, detail);
}
#endif
151 
/* Consumes character data up to, but not including, the next '<' or the end
 * of input.
 *
 * token->string is cleared first. When collectText is true the characters
 * read are stored in it, with every whitespace character turned into a blank
 * and runs of blanks collapsed to one; otherwise the input is only skipped.
 *
 * The token type becomes TOKEN_TEXT when a '<' stopped the scan (the '<' is
 * pushed back) and TOKEN_EOF when the input ended. */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int prev = 'X';  /* any non-space value, so a leading blank is kept */
	int ch;

	vStringClear (token->string);

	for (ch = getcFromInputFile (); ch != EOF && ch != '<'; ch = getcFromInputFile ())
	{
		if (!collectText)
			continue;

		if (isspace (ch))
			ch = ' ';

		/* drop a blank that directly follows another blank */
		if (ch == ' ' && prev == ' ')
			continue;

		vStringPut (token->string, ch);
		prev = ch;
	}

	if (ch == EOF)
		token->type = TOKEN_EOF;
	else
	{
		ungetcToInputFile (ch);
		token->type = TOKEN_TEXT;
	}
}
189 
/* Reads the next markup token into *token.
 *
 * Leading whitespace is skipped and token->string is cleared. The string is
 * filled only for TOKEN_STRING (the text between the quotes) and TOKEN_NAME
 * (lower-cased); for every other type it stays empty.
 *
 * A "<!-- ... -->" comment is consumed entirely. When skipComments is true
 * scanning simply resumes after it, otherwise TOKEN_COMMENT is reported. */
static void readToken (tokenInfo *const token, bool skipComments)
{
	vStringClear (token->string);

	for (;;)
	{
		int ch;

		/* skip whitespace in front of the token */
		do
			ch = getcFromInputFile ();
		while (isspace (ch));

		switch (ch)
		{
			case EOF:
				token->type = TOKEN_EOF;
				return;

			case '<':
			{
				int next = getcFromInputFile ();

				if (next == '/')
					token->type = TOKEN_TAG_START2;
				else if (next == '?')
					token->type = TOKEN_OTHER;
				else if (next != '!')
				{
					ungetcToInputFile (next);
					token->type = TOKEN_TAG_START;
				}
				else
				{
					/* "<!" seen: a comment needs "--" right after it */
					next = getcFromInputFile ();
					if (next == '-')
					{
						next = getcFromInputFile ();
						if (next == '-')
						{
							/* three-character sliding window looking for
							 * the closing "-->" */
							int first;
							int second = ' ';
							int third = ' ';

							do
							{
								first = second;
								second = third;
								third = getcFromInputFile ();
							}
							while (third != EOF &&
								   ! (first == '-' && second == '-' && third == '>'));

							if (skipComments)
								continue;	/* restart at the whitespace skip */

							token->type = TOKEN_COMMENT;
							return;
						}
					}
					/* not a comment; only the last character read is
					 * pushed back */
					ungetcToInputFile (next);
					token->type = TOKEN_OTHER;
				}
				return;
			}

			case '/':
			{
				const int next = getcFromInputFile ();

				if (next == '>')
					token->type = TOKEN_TAG_END2;
				else
				{
					ungetcToInputFile (next);
					token->type = TOKEN_OTHER;
				}
				return;
			}

			case '>':
				token->type = TOKEN_TAG_END;
				return;

			case '=':
				token->type = TOKEN_EQUAL;
				return;

			case '"':
			case '\'':
			{
				/* quoted value: runs to the matching quote or to EOF */
				const int quote = ch;

				for (ch = getcFromInputFile ();
					 ch != EOF && ch != quote;
					 ch = getcFromInputFile ())
					vStringPut (token->string, ch);

				token->type = TOKEN_STRING;
				return;
			}

			default:
				/* name: everything up to whitespace, a markup character,
				 * a quote or EOF, stored lower-cased */
				do
				{
					vStringPut (token->string, tolower (ch));
					ch = getcFromInputFile ();
				}
				while (! (isspace (ch) || ch == '<' || ch == '>' || ch == '/' ||
						  ch == '=' || ch == '\'' || ch == '"' || ch == EOF));

				if (ch != EOF)
					ungetcToInputFile (ch);
				token->type = TOKEN_NAME;
				return;
		}
	}
}
303 
/* Appends appendedText to text. Nothing happens when text is NULL or
 * appendedText is empty. If text ends with a blank and appendedText begins
 * with one, trailing whitespace is stripped from text first so the join does
 * not produce a doubled blank. */
static void appendText (vString *text, vString *appendedText)
{
	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	/* appendedText is known to be non-empty from here on */
	if (vStringLength (text) > 0
		&& vStringLast (text) == ' '
		&& vStringChar (appendedText, 0) == ' ')
	{
		vStringStripTrailing (text);
	}

	vStringCat (text, appendedText);
}
316 
/* Reads the content of an element whose start tag has just been closed.
 *
 * Character data is appended to text when text is non-NULL. Comments are
 * passed over and nested start tags are handled through readTag() with
 * depth + 1. Scanning stops at the first token that is neither a comment nor
 * a start tag.
 *
 * *line and *lineOffset receive the input position sampled just before that
 * final token was read.
 *
 * Returns true when the stopping token is "</" (TOKEN_TAG_START2). */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	const bool wantText = (text != NULL);
	tokenType kind;

	readTokenText (token, wantText);
	appendText (text, token->string);

	for (;;)
	{
		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();
		readToken (token, false);
		kind = token->type;

		if (kind == TOKEN_TAG_START)
			readTag (token, text, depth + 1);
		else if (kind != TOKEN_COMMENT)
			break;

		/* text following the comment or the nested element */
		readTokenText (token, wantText);
		appendText (text, token->string);
	}

	return kind == TOKEN_TAG_START2;
}
342 
/* Skips the body of a <script> element by reading tokens until a "</" token
 * is immediately followed by the name "script", or until the input ends.
 *
 * On success *line and *lineOffset are set to the input position sampled just
 * before the "</" token was read, and true is returned with the "script" name
 * as the current token. On end of input false is returned and the output
 * parameters are left untouched. */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterTagStart2 = false;	/* previous token was "</" */
	long pendingLine = 0;			/* position before that "</" */
	long pendingOffset = 0;

	for (;;)
	{
		const long curLine = getInputLineNumber ();
		const long curOffset = getInputLineOffset ();

		readToken (token, false);

		if (token->type == TOKEN_EOF)
			return false;

		if (token->type == TOKEN_TAG_START2)
		{
			afterTagStart2 = true;
			pendingLine = curLine;
			pendingOffset = curOffset;
		}
		else if (afterTagStart2
				 && token->type == TOKEN_NAME
				 && lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
		{
			*line = pendingLine;
			*lineOffset = pendingOffset;
			return true;
		}
		else
			afterTagStart2 = false;
	}
}
382 
/* Parses one element; the "<" has already been consumed by the caller.
 *
 * token  scratch token, overwritten as input is consumed
 * text   buffer collecting the text of an enclosing heading, or NULL when no
 *        heading is open; for an outermost h1/h2/h3 a buffer is created here
 *        and released before returning
 * depth  current nesting level; content is only descended into while this is
 *        below MAX_DEPTH
 *
 * Tags made: an anchor for each name=... attribute of an <a> element, and a
 * heading for an outermost h1/h2/h3 whose end tag matches and whose text is
 * not empty. The bodies of <script> and of a properly closed <style> are
 * passed to makePromise() for "JavaScript" and "CSS" respectively. */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool ownsText = false;
	keywordId openTag;
	keywordId closeTag;
	bool heading;
	bool voidElement;
	long srcLine;
	long firstLine;
	long firstOffset;
	long lastLine;
	long lastOffset;

	readToken (token, true);
	if (token->type != TOKEN_NAME)
		goto out;

	openTag = lookupKeyword (vStringValue (token->string), Lang_html);
	heading = (openTag == KEYWORD_h1 || openTag == KEYWORD_h2 || openTag == KEYWORD_h3);
	voidElement = (openTag >= KEYWORD_area && openTag <= KEYWORD_wbr);

	if (heading && text == NULL)
	{
		text = vStringNew ();
		ownsText = true;
	}

	/* attributes: read up to the end of the start tag */
	for (;;)
	{
		readToken (token, true);

		if (openTag == KEYWORD_a && token->type == TOKEN_NAME
			&& lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_name)
		{
			/* expect: name = value, with value quoted or bare */
			readToken (token, true);
			if (token->type == TOKEN_EQUAL)
			{
				readToken (token, true);
				if (token->type == TOKEN_STRING || token->type == TOKEN_NAME)
					makeSimpleTag (token->string, K_ANCHOR);
			}
		}

		if (token->type == TOKEN_TAG_END || token->type == TOKEN_TAG_END2 ||
			token->type == TOKEN_EOF)
			break;
	}

	/* no content to read for void elements, for tags not closed by a plain
	 * ">", or once the nesting limit is reached */
	if (voidElement || token->type != TOKEN_TAG_END || depth >= MAX_DEPTH)
		goto out;

	srcLine = getSourceLineNumber ();
	firstLine = getInputLineNumber ();
	firstOffset = getInputLineOffset ();

	if (openTag == KEYWORD_script)
	{
		if (skipScriptContent (token, &lastLine, &lastOffset))
			makePromise ("JavaScript", firstLine, firstOffset,
						 lastLine, lastOffset, srcLine);
		readToken (token, true);
		goto out;
	}

	if (!readTagContent (token, text, &lastLine, &lastOffset, depth))
		goto out;

	/* token after "</": normally the name of the end tag */
	readToken (token, true);

	if (heading && ownsText && vStringLength (text) > 0)
	{
		closeTag = lookupKeyword (vStringValue (token->string), Lang_html);
		if (closeTag == openTag)
		{
			htmlKind headingKind;

			switch (openTag)
			{
				case KEYWORD_h1:
					headingKind = K_HEADING1;
					break;
				case KEYWORD_h2:
					headingKind = K_HEADING2;
					break;
				default:
					headingKind = K_HEADING3;
					break;
			}

			vStringStripLeading (text);
			vStringStripTrailing (text);
			makeSimpleTag (text, headingKind);
		}
	}
	else if (openTag == KEYWORD_style)
	{
		closeTag = lookupKeyword (vStringValue (token->string), Lang_html);
		if (closeTag == openTag)
			makePromise ("CSS", firstLine, firstOffset,
						 lastLine, lastOffset, srcLine);
	}

	/* token following the end-tag name */
	readToken (token, true);

 out:
	if (ownsText)
		vStringDelete (text);
}
484 
findHtmlTags(void)485 static void findHtmlTags (void)
486 {
487 	tokenInfo token;
488 
489 	token.string = vStringNew ();
490 
491 	do
492 	{
493 		readToken (&token, true);
494 		if (token.type == TOKEN_TAG_START)
495 			readTag (&token, NULL, 0);
496 	}
497 	while (token.type != TOKEN_EOF);
498 
499 	vStringDelete (token.string);
500 }
501 
initialize(const langType language)502 static void initialize (const langType language)
503 {
504 	Lang_html = language;
505 }
506 
507 /* parser definition */
HtmlParser(void)508 extern parserDefinition* HtmlParser (void)
509 {
510 	static const char *const extensions [] = { "htm", "html", NULL };
511 	parserDefinition* def = parserNew ("HTML");
512 	def->kindTable        = HtmlKinds;
513 	def->kindCount    = ARRAY_SIZE (HtmlKinds);
514 	def->extensions   = extensions;
515 	def->parser       = findHtmlTags;
516 	def->initialize   = initialize;
517 	def->keywordTable = HtmlKeywordTable;
518 	def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
519 	return def;
520 }
521