1 /*
2 * Copyright (c) 2016, Jiri Techet
3 *
4 * This source code is released for free distribution under the terms of the
5 * GNU General Public License version 2 or (at your option) any later version.
6 *
7 * This module contains functions for generating tags for HTML language
8 * files.
9 */
10
11 #include "general.h"
12
13 #include <string.h>
14 #include <ctype.h>
15
16 #include "entry.h"
17 #include "parse.h"
18 #include "read.h"
19 #include "routines.h"
20 #include "keyword.h"
21 #include "promise.h"
22
/* Upper bound on element nesting.  readTag() stops descending into an
 * element's content once this depth is reached, so malformed input with a
 * huge number of unclosed tags cannot exhaust the stack through recursion. */
#define MAX_DEPTH 1000
27
28
/* Tag kinds emitted by this parser.  The values are passed to
 * makeSimpleTag() as the kind index, so they are expected to line up with
 * the entries of HtmlKinds. */
typedef enum {
	K_ANCHOR = 0,	/* <a name="..."> */
	K_HEADING1,		/* <h1> */
	K_HEADING2,		/* <h2> */
	K_HEADING3		/* <h3> */
} htmlKind;
35
36
37 static kindDefinition HtmlKinds [] = {
38 { true, 'a', "anchor", "named anchors" },
39 { true, 'h', "heading1", "H1 headings" },
40 { true, 'i', "heading2", "H2 headings" },
41 { true, 'j', "heading3", "H3 headings" }
42 };
43
/* Identifiers for the element and attribute names this parser cares about.
 * readTag() classifies void elements with a range test
 * (KEYWORD_area .. KEYWORD_wbr), so the void elements must stay contiguous
 * with KEYWORD_area first and KEYWORD_wbr last. */
typedef enum {
	/* elements with dedicated handling */
	KEYWORD_h1 = 0,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,

	/* attribute of <a> */
	KEYWORD_name,

	/* void elements: no content and no end tag */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_source,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
71
72 static const keywordTable HtmlKeywordTable[] = {
73 {"h1", KEYWORD_h1},
74 {"h2", KEYWORD_h2},
75 {"h3", KEYWORD_h3},
76 {"a", KEYWORD_a},
77 {"script", KEYWORD_script},
78 {"style", KEYWORD_style},
79 {"name", KEYWORD_name},
80
81 /* void elements */
82 {"area", KEYWORD_area},
83 {"base", KEYWORD_base},
84 {"br", KEYWORD_br},
85 {"col", KEYWORD_col},
86 {"command", KEYWORD_command},
87 {"embed", KEYWORD_embed},
88 {"hr", KEYWORD_hr},
89 {"img", KEYWORD_img},
90 {"input", KEYWORD_input},
91 {"keygen", KEYWORD_keygen},
92 {"link", KEYWORD_link},
93 {"meta", KEYWORD_meta},
94 {"param", KEYWORD_param},
95 {"source", KEYWORD_source},
96 {"track", KEYWORD_track},
97 {"wbr", KEYWORD_wbr},
98 };
99
/* Token classes produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF = 0,		/* end of input */
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,			/* character data up to the next '<' */
	TOKEN_TAG_START,	/* < */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* > */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,		/* = */
	TOKEN_COMMENT,		/* <!-- ... --> */
	TOKEN_OTHER			/* <?, <! that is not a comment, or a lone / */
} tokenType;
113
#ifdef DEBUG
/* Printable names of the tokenType values, indexed by token type.
 * Only used by dumpToken().  Kept file-local and read-only so that debug
 * builds do not export a generically named, writable global symbol. */
static const char *const tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
131
132 typedef struct {
133 tokenType type;
134 vString *string;
135 } tokenInfo;
136
137
138 static int Lang_html;
139
140
141 static void readTag (tokenInfo *token, vString *text, int depth);
142
#ifdef DEBUG
/* Debug helper: prints the token's type name and text to stderr, followed
 * by "context.extra_context".  A NULL extra_context is printed as "_". */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	const char *const typeName = tokenTypes[token->type];
	const char *const detail = (extra_context != NULL) ? extra_context : "_";

	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 typeName, vStringValue(token->string), context, detail);
}
#endif
151
/*
 * Consumes character data up to, but not including, the next '<'.
 *
 * The token becomes TOKEN_TEXT when a '<' is reached (the '<' is pushed back
 * to the input) or TOKEN_EOF when the input ends first.
 *
 * When collectText is true the consumed characters are stored in
 * token->string: every whitespace character is turned into a space and a
 * space directly following another stored space is dropped.  When it is
 * false the characters are discarded and token->string stays empty.
 */
static void readTokenText (tokenInfo *const token, bool collectText)
{
	int previous = 'X';  /* any non-space value: a leading space is kept */
	int ch;

	vStringClear (token->string);

	while ((ch = getcFromInputFile ()) != EOF && ch != '<')
	{
		if (!collectText)
			continue;

		if (isspace (ch))
			ch = ' ';

		/* collapse runs of whitespace into a single space */
		if (ch == ' ' && previous == ' ')
			continue;

		vStringPut (token->string, ch);
		previous = ch;
	}

	if (ch == '<')
	{
		ungetcToInputFile (ch);
		token->type = TOKEN_TEXT;
	}
	else
		token->type = TOKEN_EOF;
}
189
/*
 * Reads the next markup token into TOKEN, skipping leading whitespace.
 *
 * Resulting token types:
 *   TOKEN_EOF         end of input
 *   TOKEN_TAG_START   "<"  (the following character is pushed back)
 *   TOKEN_TAG_START2  "</"
 *   TOKEN_TAG_END     ">"
 *   TOKEN_TAG_END2    "/>"
 *   TOKEN_EQUAL       "="
 *   TOKEN_STRING      quoted text; token->string holds the content without
 *                     the quotes, read up to the matching quote or EOF
 *   TOKEN_COMMENT     "<!-- ... -->", only reported when skipComments is
 *                     false; otherwise the comment is dropped and scanning
 *                     continues with what follows it
 *   TOKEN_OTHER       "<?", "<!" that does not start a comment, or a '/'
 *                     not followed by '>'
 *   TOKEN_NAME        any other run of characters up to whitespace, one of
 *                     < > / = ' " or EOF; stored lower-cased in
 *                     token->string
 */
static void readToken (tokenInfo *const token, bool skipComments)
{
	vStringClear (token->string);

	for (;;)
	{
		int ch;

		/* whitespace between tokens is not significant */
		do
			ch = getcFromInputFile ();
		while (isspace (ch));

		if (ch == EOF)
		{
			token->type = TOKEN_EOF;
			return;
		}

		if (ch == '>')
		{
			token->type = TOKEN_TAG_END;
			return;
		}

		if (ch == '=')
		{
			token->type = TOKEN_EQUAL;
			return;
		}

		if (ch == '/')
		{
			const int next = getcFromInputFile ();

			if (next == '>')
				token->type = TOKEN_TAG_END2;
			else
			{
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
			}
			return;
		}

		if (ch == '"' || ch == '\'')
		{
			const int quote = ch;

			for (ch = getcFromInputFile ();
				 ch != EOF && ch != quote;
				 ch = getcFromInputFile ())
			{
				vStringPut (token->string, ch);
			}
			token->type = TOKEN_STRING;
			return;
		}

		if (ch == '<')
		{
			int next = getcFromInputFile ();
			bool isComment = false;
			int older, old, cur;

			if (next == '?')
			{
				token->type = TOKEN_OTHER;
				return;
			}
			if (next == '/')
			{
				token->type = TOKEN_TAG_START2;
				return;
			}
			if (next != '!')
			{
				ungetcToInputFile (next);
				token->type = TOKEN_TAG_START;
				return;
			}

			/* "<!" seen: it is a comment only when "--" follows */
			next = getcFromInputFile ();
			if (next == '-')
			{
				next = getcFromInputFile ();
				isComment = (next == '-');
			}
			if (!isComment)
			{
				/* only the last character read is pushed back */
				ungetcToInputFile (next);
				token->type = TOKEN_OTHER;
				return;
			}

			/* slide a three-character window until it holds "-->" */
			old = ' ';
			cur = ' ';
			do
			{
				older = old;
				old = cur;
				cur = getcFromInputFile ();
			}
			while (cur != EOF && (older != '-' || old != '-' || cur != '>'));

			if (skipComments)
				continue;

			token->type = TOKEN_COMMENT;
			return;
		}

		/* anything else starts a name */
		do
		{
			vStringPut (token->string, tolower (ch));
			ch = getcFromInputFile ();
		}
		while (! (ch == EOF || isspace (ch) ||
				  ch == '<' || ch == '>' || ch == '/' ||
				  ch == '=' || ch == '\'' || ch == '"'));

		/* the terminator belongs to the next token */
		if (ch != EOF)
			ungetcToInputFile (ch);
		token->type = TOKEN_NAME;
		return;
	}
}
303
/*
 * Appends appendedText to text.  Nothing happens when text is NULL or
 * appendedText is empty.  If text ends with a space and appendedText starts
 * with one, the trailing whitespace of text is stripped first so the joint
 * does not end up with two spaces.
 */
static void appendText (vString *text, vString *appendedText)
{
	if (text == NULL || vStringLength (appendedText) == 0)
		return;

	if (vStringLength (text) > 0 && vStringLast (text) == ' ' &&
		vStringChar (appendedText, 0) == ' ')
	{
		vStringStripTrailing (text);
	}

	vStringCat (text, appendedText);
}
316
/*
 * Reads the content of an element: character data, comments and nested
 * elements, until a token that is neither a comment nor "<" is read.
 *
 * text        if not NULL, character data found in the content (including
 *             that of nested elements, which receive the same buffer) is
 *             appended to it
 * line,
 * lineOffset  receive the input position sampled right before the last
 *             token was read
 * depth       nesting depth of the element owning this content; nested
 *             elements are parsed with depth + 1
 *
 * Returns true when the token that ended the content is "</".
 */
static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
{
	const bool collect = (text != NULL);
	tokenType type;

	readTokenText (token, collect);
	appendText (text, token->string);

	for (;;)
	{
		*line = getInputLineNumber ();
		*lineOffset = getInputLineOffset ();
		readToken (token, false);

		/* remember the type now: readTag() reuses and overwrites token */
		type = token->type;
		if (type != TOKEN_COMMENT && type != TOKEN_TAG_START)
			break;

		if (type == TOKEN_TAG_START)
			readTag (token, text, depth + 1);

		/* character data following the comment or the nested element */
		readTokenText (token, collect);
		appendText (text, token->string);
	}

	return type == TOKEN_TAG_START2;
}
342
/*
 * Consumes tokens until a "</" token directly followed by the name token
 * "script" is found, or the input ends.  Comments are read as tokens
 * rather than skipped.
 *
 * On success, *line and *lineOffset receive the input position sampled
 * right before the "</" token was read, and true is returned.  If the input
 * ends first, false is returned and both outputs are left untouched.
 */
static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
{
	bool afterEndTagStart = false;	/* previous token was "</" */
	long endTagLine = 0;
	long endTagOffset = 0;

	for (;;)
	{
		const long tokenLine = getInputLineNumber ();
		const long tokenOffset = getInputLineOffset ();

		readToken (token, false);

		switch (token->type)
		{
			case TOKEN_EOF:
				return false;

			case TOKEN_TAG_START2:
				afterEndTagStart = true;
				endTagLine = tokenLine;
				endTagOffset = tokenOffset;
				break;

			case TOKEN_NAME:
				if (afterEndTagStart &&
					lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
				{
					*line = endTagLine;
					*lineOffset = endTagOffset;
					return true;
				}
				afterEndTagStart = false;
				break;

			default:
				afterEndTagStart = false;
				break;
		}
	}
}
382
/*
 * Parses one element; called right after its opening "<" has been read.
 *
 * token  scratch token, reused for everything read here
 * text   buffer collecting the text of an enclosing heading, or NULL when
 *        there is none.  If it is NULL and this element is h1/h2/h3, a
 *        buffer is created here and released before returning.
 * depth  nesting depth of this element; content is only parsed while
 *        depth < MAX_DEPTH
 *
 * Tags and promises made here:
 *   - <a name=VALUE>  anchor tag named VALUE (quoted or bare)
 *   - h1/h2/h3        heading tag named after the collected text, only for
 *                     the element that created the text buffer and only if
 *                     the end tag name matches the start tag
 *   - script          a "JavaScript" promise for the area between the start
 *                     tag and "</script"
 *   - style           a "CSS" promise for the element's content, if the end
 *                     tag name matches
 *
 * Empty names are never passed to makeSimpleTag().
 */
static void readTag (tokenInfo *token, vString *text, int depth)
{
	bool textCreated = false;

	readToken (token, true);
	if (token->type == TOKEN_NAME)
	{
		keywordId startTag;
		bool isHeading;
		bool isVoid;

		startTag = lookupKeyword (vStringValue (token->string), Lang_html);
		isHeading = (startTag == KEYWORD_h1 || startTag == KEYWORD_h2 || startTag == KEYWORD_h3);
		/* relies on the void elements being contiguous in keywordId */
		isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
		if (text == NULL && isHeading)
		{
			text = vStringNew ();
			textCreated = true;
		}

		/* walk the attributes up to the end of the start tag */
		do
		{
			readToken (token, true);
			if (startTag == KEYWORD_a && token->type == TOKEN_NAME)
			{
				keywordId attribute = lookupKeyword (vStringValue (token->string), Lang_html);

				if (attribute == KEYWORD_name)
				{
					readToken (token, true);
					if (token->type == TOKEN_EQUAL)
					{
						readToken (token, true);
						/* name="" yields an empty string token: no tag */
						if ((token->type == TOKEN_STRING || token->type == TOKEN_NAME) &&
							vStringLength (token->string) > 0)
							makeSimpleTag (token->string, K_ANCHOR);
					}
				}
			}
		}
		while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
			   token->type != TOKEN_EOF);

		/* only a non-void element closed with a plain ">" has content */
		if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
		{
			long startSourceLineNumber = getSourceLineNumber ();
			long startLineNumber = getInputLineNumber ();
			long startLineOffset = getInputLineOffset ();
			long endLineNumber;
			long endLineOffset;
			bool tag_start2;

			if (startTag == KEYWORD_script)
			{
				/* script content is not markup: scan for "</script" only */
				bool script = skipScriptContent (token, &endLineNumber, &endLineOffset);
				if (script)
					makePromise ("JavaScript", startLineNumber, startLineOffset,
								 endLineNumber, endLineOffset, startSourceLineNumber);
				readToken (token, true);
				goto out;
			}

			tag_start2 = readTagContent (token, text, &endLineNumber, &endLineOffset, depth);
			if (tag_start2)
			{
				/* name of the end tag */
				readToken (token, true);
				if (isHeading && textCreated && vStringLength (text) > 0)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
					{
						htmlKind headingKind;

						if (startTag == KEYWORD_h1)
							headingKind = K_HEADING1;
						else if (startTag == KEYWORD_h2)
							headingKind = K_HEADING2;
						else
							headingKind = K_HEADING3;

						vStringStripLeading (text);
						vStringStripTrailing (text);
						/* the length check above ran before stripping, so
						 * re-check: the text may have become empty */
						if (vStringLength (text) > 0)
							makeSimpleTag (text, headingKind);
					}
				}
				else if (startTag == KEYWORD_style)
				{
					keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
					if (startTag == endTag)
						makePromise ("CSS", startLineNumber, startLineOffset,
									 endLineNumber, endLineOffset, startSourceLineNumber);
				}

				/* token following the end tag name */
				readToken (token, true);
			}
		}
	}

 out:
	/* free the buffer only if this call created it */
	if (textCreated)
		vStringDelete (text);
}
484
findHtmlTags(void)485 static void findHtmlTags (void)
486 {
487 tokenInfo token;
488
489 token.string = vStringNew ();
490
491 do
492 {
493 readToken (&token, true);
494 if (token.type == TOKEN_TAG_START)
495 readTag (&token, NULL, 0);
496 }
497 while (token.type != TOKEN_EOF);
498
499 vStringDelete (token.string);
500 }
501
initialize(const langType language)502 static void initialize (const langType language)
503 {
504 Lang_html = language;
505 }
506
507 /* parser definition */
HtmlParser(void)508 extern parserDefinition* HtmlParser (void)
509 {
510 static const char *const extensions [] = { "htm", "html", NULL };
511 parserDefinition* def = parserNew ("HTML");
512 def->kindTable = HtmlKinds;
513 def->kindCount = ARRAY_SIZE (HtmlKinds);
514 def->extensions = extensions;
515 def->parser = findHtmlTags;
516 def->initialize = initialize;
517 def->keywordTable = HtmlKeywordTable;
518 def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
519 return def;
520 }
521