1 /*
2  *	 $Id: tex.c 666 2008-05-15 17:47:31Z dfishburn $
3  *
4  *	 Copyright (c) 2008, David Fishburn
5  *
6  *	 This source code is released for free distribution under the terms of the
7  *	 GNU General Public License.
8  *
9  *	 This module contains functions for generating tags for TeX language files.
10  *
11  *	 Tex language reference:
12  *		 http://en.wikibooks.org/wiki/TeX#The_Structure_of_TeX
13  */
14 
15 /*
16  *	 INCLUDE FILES
17  */
18 #include "general.h"	/* must always come first */
19 #include <ctype.h>	/* to define isalpha () */
20 #include <setjmp.h>
21 #ifdef DEBUG
22 #include <stdio.h>
23 #endif
24 
25 #include "debug.h"
26 #include "entry.h"
27 #include "keyword.h"
28 #include "parse.h"
29 #include "read.h"
30 #include "routines.h"
31 #include "vstring.h"
32 
33 /*
34  *	 MACROS
35  */
/* Non-zero when the "type" field of the pointed-to token equals "t". */
#define isType(token,t)		((boolean) ((token)->type == (t)))
/* Non-zero when the "keyword" field of the pointed-to token equals "k". */
#define isKeyword(token,k)	((boolean) ((token)->keyword == (k)))
38 
39 /*
40  *	 DATA DECLARATIONS
41  */
42 
/* Values carried through setjmp ()/longjmp () on the "Exception" buffer. */
typedef enum eException {
	ExceptionNone,	/* direct return from setjmp (): no jump has happened */
	ExceptionEOF	/* readToken () hit the end of the input */
} exception_t;
44 
/*
 * Identifies which TeX command a keyword token stands for.
 * KEYWORD_NONE is the value a token carries when it is not a keyword.
 */
typedef enum eKeywordId {
	KEYWORD_NONE = -1,
	KEYWORD_chapter,		/* \chapter */
	KEYWORD_section,		/* \section */
	KEYWORD_subsection,		/* \subsection */
	KEYWORD_subsubsection,	/* \subsubsection */
	KEYWORD_part,			/* \part */
	KEYWORD_paragraph,		/* \paragraph */
	KEYWORD_subparagraph,	/* \subparagraph */
	KEYWORD_include			/* \include */
} keywordId;
59 
60 /*	Used to determine whether keyword is valid for the token language and
61  *	what its ID is.
62  */
63 typedef struct sKeywordDesc {
64 	const char *name;
65 	keywordId id;
66 } keywordDesc;
67 
/* Classification that readToken () stores in tokenInfo.type. */
typedef enum eTokenType {
	TOKEN_UNDEFINED,		/* anything not classified below */
	TOKEN_CHARACTER,
	TOKEN_CLOSE_PAREN,		/* ) */
	TOKEN_COMMA,			/* , */
	TOKEN_KEYWORD,			/* backslash command found in the keyword table */
	TOKEN_OPEN_PAREN,		/* ( */
	TOKEN_IDENTIFIER,		/* word, or backslash command that is no keyword */
	TOKEN_STRING,			/* text enclosed in ' or " */
	TOKEN_OPEN_CURLY,		/* { */
	TOKEN_CLOSE_CURLY,		/* } */
	TOKEN_OPEN_SQUARE,		/* [ */
	TOKEN_CLOSE_SQUARE,		/* ] */
	TOKEN_QUESTION_MARK,
	TOKEN_STAR				/* * */
} tokenType;
84 
85 typedef struct sTokenInfo {
86 	tokenType		type;
87 	keywordId		keyword;
88 	vString *		string;
89 	vString *		scope;
90 	unsigned long 	lineNumber;
91 	long   			filePosition;
92 } tokenInfo;
93 
94 /*
95  *	DATA DEFINITIONS
96  */
97 
98 static langType Lang_js;
99 
100 static jmp_buf Exception;
101 
/*
 * Tag kinds.  Each value is used as an index into TexKinds, so the two
 * must list their entries in the same order; TEXTAG_COUNT is the number
 * of kinds.
 */
typedef enum {
	TEXTAG_CHAPTER,
	TEXTAG_SECTION,
	TEXTAG_SUBSECTION,
	TEXTAG_SUBSUBSECTION,
	TEXTAG_PART,
	TEXTAG_PARAGRAPH,
	TEXTAG_SUBPARAGRAPH,
	TEXTAG_INCLUDE,
	TEXTAG_COUNT	/* not a kind: number of entries above */
} texKind;
113 
114 static kindOption TexKinds [] = {
115 	{ TRUE,  'c', "chapter",		  "chapters"		   },
116 	{ TRUE,  's', "section",		  "sections"		   },
117 	{ TRUE,  'u', "subsection",		  "subsections"		   },
118 	{ TRUE,  'b', "subsubsection",	  "subsubsections"	   },
119 	{ TRUE,  'p', "part",			  "parts"			   },
120 	{ TRUE,  'P', "paragraph",		  "paragraphs"		   },
121 	{ TRUE,  'G', "subparagraph",	  "subparagraphs"	   },
122 	{ TRUE,  'i', "include",	  	  "includes"		   }
123 };
124 
125 static const keywordDesc TexKeywordTable [] = {
126 	/* keyword			keyword ID */
127 	{ "chapter",		KEYWORD_chapter				},
128 	{ "section",		KEYWORD_section				},
129 	{ "subsection",		KEYWORD_subsection			},
130 	{ "subsubsection",	KEYWORD_subsubsection		},
131 	{ "part",			KEYWORD_part				},
132 	{ "paragraph",		KEYWORD_paragraph			},
133 	{ "subparagraph",	KEYWORD_subparagraph		},
134 	{ "include",		KEYWORD_include				}
135 };
136 
137 /*
138  *	 FUNCTION DEFINITIONS
139  */
140 
isIdentChar(const int c)141 static boolean isIdentChar (const int c)
142 {
143 	return (boolean)
144 		(isalpha (c) || isdigit (c) || c == '$' ||
145 		  c == '_' || c == '#' || c == '-' || c == '.');
146 }
147 
buildTexKeywordHash(void)148 static void buildTexKeywordHash (void)
149 {
150 	const size_t count = sizeof (TexKeywordTable) /
151 		sizeof (TexKeywordTable [0]);
152 	size_t i;
153 	for (i = 0	;  i < count  ;  ++i)
154 	{
155 		const keywordDesc* const p = &TexKeywordTable [i];
156 		addKeyword (p->name, Lang_js, (int) p->id);
157 	}
158 }
159 
newToken(void)160 static tokenInfo *newToken (void)
161 {
162 	tokenInfo *const token = xMalloc (1, tokenInfo);
163 
164 	token->type			= TOKEN_UNDEFINED;
165 	token->keyword		= KEYWORD_NONE;
166 	token->string		= vStringNew ();
167 	token->scope		= vStringNew ();
168 	token->lineNumber   = getSourceLineNumber ();
169 	token->filePosition = getInputFilePosition ();
170 
171 	return token;
172 }
173 
/*
 * Releases a token obtained from newToken (): both string buffers first,
 * then the token structure itself.
 */
static void deleteToken (tokenInfo *const token)
{
	vStringDelete (token->scope);
	vStringDelete (token->string);
	eFree (token);
}
180 
181 /*
182  *	 Tag generation functions
183  */
184 
/*
 * Emits a tag named after token->string, located at the line and file
 * position recorded in the token, with the name and letter of "kind".
 * Does nothing when that kind is disabled in TexKinds.
 */
static void makeConstTag (tokenInfo *const token, const texKind kind)
{
	tagEntryInfo entry;

	if (! TexKinds [kind].enabled)
		return;

	initTagEntry (&entry, vStringValue (token->string));

	entry.kindName	   = TexKinds [kind].name;
	entry.kind		   = TexKinds [kind].letter;
	entry.lineNumber   = token->lineNumber;
	entry.filePosition = token->filePosition;

	makeTagEntry (&entry);
}
201 
/*
 * Emits a tag for "token" of the given kind unless the kind is disabled.
 * When the token carries a non-empty scope, token->string is first
 * rewritten in place to "<scope>.<string>", so the caller's token is
 * modified.
 */
static void makeTexTag (tokenInfo *const token, texKind kind)
{
	if (! TexKinds [kind].enabled)
		return;

	if (vStringLength (token->scope) > 0)
	{
		/* Build the qualified name in a scratch buffer, then copy it back. */
		vString *const qualified = vStringNew ();

		vStringCopy (qualified, token->scope);
		vStringCatS (qualified, ".");
		vStringCatS (qualified, vStringValue (token->string));
		vStringTerminate (qualified);

		vStringCopy (token->string, qualified);
		vStringDelete (qualified);
	}

	makeConstTag (token, kind);
}
225 
226 /*
227  *	 Parsing functions
228  */
229 
/*
 * Reads the body of a quoted string into "string".  The opening quote has
 * already been consumed by the caller; reading stops at the matching
 * "delimiter" (which is consumed but not stored) or at end of input.
 *
 * A backslash is dropped and the character after it is stored verbatim,
 * so an escaped delimiter does not end the string.  If the input ends
 * right after a backslash, nothing more is stored: the EOF value must not
 * be appended to the string as if it were a character.
 */
static void parseString (vString *const string, const int delimiter)
{
	for (;;)
	{
		int c = fileGetc ();

		if (c == EOF)
			break;

		if (c == '\\')
		{
			c = fileGetc ();	/* This may be a ' or ". */
			if (c == EOF)
				break;
		}
		else if (c == delimiter)
			break;

		vStringPut (string, c);
	}
	vStringTerminate (string);
}
250 
251 /*
252  *	Read a C identifier beginning with "firstChar" and places it into
253  *	"name".
254  */
parseIdentifier(vString * const string,const int firstChar)255 static void parseIdentifier (vString *const string, const int firstChar)
256 {
257 	int c = firstChar;
258 	Assert (isIdentChar (c));
259 	do
260 	{
261 		vStringPut (string, c);
262 		c = fileGetc ();
263 	} while (isIdentChar (c));
264 
265 	vStringTerminate (string);
266 	if (!isspace (c))
267 		fileUngetc (c);		/* unget non-identifier character */
268 }
269 
/*
 * Reads the next token from the input into "token".
 *
 * Tabs, spaces and newlines are skipped, as is everything from an
 * unescaped '%' to the end of its line.  On end of input the function does
 * not return: it leaves through longjmp () on the "Exception" buffer with
 * the value ExceptionEOF.
 *
 * token->string is filled for strings, identifiers and backslash commands
 * and is left empty for every other token.  token->scope is not touched.
 */
static void readToken (tokenInfo *const token)
{
	int c;

	token->type			= TOKEN_UNDEFINED;
	token->keyword		= KEYWORD_NONE;
	vStringClear (token->string);

	/* Skip whitespace and % comments; remember where the token starts. */
	for (;;)
	{
		c = fileGetc ();
		token->lineNumber   = getSourceLineNumber ();
		token->filePosition = getInputFilePosition ();

		if (c == '\t'  ||  c == ' '  ||  c == '\n')
			continue;
		if (c != '%')
			break;
		fileSkipToCharacter ('\n');	/* % are single line comments */
	}

	switch (c)
	{
		case EOF: longjmp (Exception, (int)ExceptionEOF);	break;
		case '(': token->type = TOKEN_OPEN_PAREN;			break;
		case ')': token->type = TOKEN_CLOSE_PAREN;			break;
		case ',': token->type = TOKEN_COMMA;				break;
		case '{': token->type = TOKEN_OPEN_CURLY;			break;
		case '}': token->type = TOKEN_CLOSE_CURLY;			break;
		case '[': token->type = TOKEN_OPEN_SQUARE;			break;
		case ']': token->type = TOKEN_CLOSE_SQUARE;			break;
		case '*': token->type = TOKEN_STAR;					break;

		case '\'':
		case '"':
			token->type = TOKEN_STRING;
			parseString (token->string, c);
			token->lineNumber = getSourceLineNumber ();
			token->filePosition = getInputFilePosition ();
			break;

		case '\\':
			/*
			 * All Tex tags start with a backslash followed by a letter.
			 */
			c = fileGetc ();
			if (isalpha (c))
			{
				parseIdentifier (token->string, c);
				token->lineNumber = getSourceLineNumber ();
				token->filePosition = getInputFilePosition ();
				token->keyword = analyzeToken (token->string, Lang_js);
				if (isKeyword (token, KEYWORD_NONE))
					token->type = TOKEN_IDENTIFIER;
				else
					token->type = TOKEN_KEYWORD;
			}
			else if (c == EOF)
			{
				/* Push EOF back so that the next call reports it. */
				fileUngetc (c);
			}
			/*
			 * Otherwise this is a one-character control symbol (an
			 * escaped percent sign, brace, quote, backslash, ...).  The
			 * escaped character is consumed together with the backslash:
			 * pushing it back would let it be read again as the start of
			 * a comment, a string or a brace group.  The token stays
			 * TOKEN_UNDEFINED with an empty string.
			 */
			break;

		default:
			if (isIdentChar (c))
			{
				parseIdentifier (token->string, c);
				token->lineNumber = getSourceLineNumber ();
				token->filePosition = getInputFilePosition ();
				token->type = TOKEN_IDENTIFIER;
			}
			else
				token->type = TOKEN_UNDEFINED;
			break;
	}
}
347 
/*
 * Copies every field of "src" into "dest".  The two string buffers are
 * copied by content, so "dest" keeps its own vStrings.
 */
static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	/* classification */
	dest->type = src->type;
	dest->keyword = src->keyword;

	/* location */
	dest->lineNumber = src->lineNumber;
	dest->filePosition = src->filePosition;

	/* text */
	vStringCopy (dest->string, src->string);
	vStringCopy (dest->scope, src->scope);
}
357 
358 /*
359  *	 Scanning functions
360  */
361 
parseTag(tokenInfo * const token,texKind kind)362 static boolean parseTag (tokenInfo *const token, texKind kind)
363 {
364 	tokenInfo *const name = newToken ();
365 	vString *	fullname;
366 	boolean		useLongName = TRUE;
367 
368 	fullname = vStringNew ();
369 	vStringClear (fullname);
370 
371 	/*
372 	 * Tex tags are of these formats:
373 	 *   \keyword{any number of words}
374 	 *   \keyword[short desc]{any number of words}
375 	 *   \keyword*[short desc]{any number of words}
376 	 *
377 	 * When a keyword is found, loop through all words within
378 	 * the curly braces for the tag name.
379 	 */
380 
381 	if (isType (token, TOKEN_KEYWORD))
382 	{
383 		copyToken (name, token);
384 		readToken (token);
385 	}
386 
387 	if (isType (token, TOKEN_OPEN_SQUARE))
388 	{
389 		useLongName = FALSE;
390 
391 		readToken (token);
392 		while (! isType (token, TOKEN_CLOSE_SQUARE) )
393 		{
394 			if (isType (token, TOKEN_IDENTIFIER))
395 			{
396 				if (fullname->length > 0)
397 					vStringCatS (fullname, " ");
398 				vStringCatS (fullname, vStringValue (token->string));
399 			}
400 			readToken (token);
401 		}
402 		vStringTerminate (fullname);
403 		vStringCopy (name->string, fullname);
404 		makeTexTag (name, kind);
405 	}
406 
407 	if (isType (token, TOKEN_STAR))
408 	{
409 		readToken (token);
410 	}
411 
412 	if (isType (token, TOKEN_OPEN_CURLY))
413 	{
414 		readToken (token);
415 		while (! isType (token, TOKEN_CLOSE_CURLY) )
416 		{
417 			/* if (isType (token, TOKEN_IDENTIFIER) && useLongName) */
418 			if (useLongName)
419 			{
420 				if (fullname->length > 0)
421 					vStringCatS (fullname, " ");
422 				vStringCatS (fullname, vStringValue (token->string));
423 			}
424 			readToken (token);
425 		}
426 		if (useLongName)
427 		{
428 			vStringTerminate (fullname);
429 			vStringCopy (name->string, fullname);
430 			makeTexTag (name, kind);
431 		}
432 	}
433 
434 	deleteToken (name);
435 	vStringDelete (fullname);
436 	return TRUE;
437 }
438 
/*
 * Main scanning loop: reads tokens one after another and hands every
 * recognised keyword to parseTag () with the matching tag kind.
 *
 * The loop has no exit of its own; it ends when readToken () leaves
 * through longjmp () at end of input.
 */
static void parseTexFile (tokenInfo *const token)
{
	for (;;)
	{
		readToken (token);

		if (! isType (token, TOKEN_KEYWORD))
			continue;

		switch (token->keyword)
		{
			case KEYWORD_chapter:
				parseTag (token, TEXTAG_CHAPTER);
				break;
			case KEYWORD_section:
				parseTag (token, TEXTAG_SECTION);
				break;
			case KEYWORD_subsection:
				/* Was tagged as TEXTAG_SUBSUBSECTION by mistake. */
				parseTag (token, TEXTAG_SUBSECTION);
				break;
			case KEYWORD_subsubsection:
				parseTag (token, TEXTAG_SUBSUBSECTION);
				break;
			case KEYWORD_part:
				parseTag (token, TEXTAG_PART);
				break;
			case KEYWORD_paragraph:
				parseTag (token, TEXTAG_PARAGRAPH);
				break;
			case KEYWORD_subparagraph:
				parseTag (token, TEXTAG_SUBPARAGRAPH);
				break;
			case KEYWORD_include:
				parseTag (token, TEXTAG_INCLUDE);
				break;
			default:
				break;
		}
	}
}
479 
initialize(const langType language)480 static void initialize (const langType language)
481 {
482 	Assert (sizeof (TexKinds) / sizeof (TexKinds [0]) == TEXTAG_COUNT);
483 	Lang_js = language;
484 	buildTexKeywordHash ();
485 }
486 
findTexTags(void)487 static void findTexTags (void)
488 {
489 	tokenInfo *const token = newToken ();
490 	exception_t exception;
491 
492 	exception = (exception_t) (setjmp (Exception));
493 	while (exception == ExceptionNone)
494 		parseTexFile (token);
495 
496 	deleteToken (token);
497 }
498 
499 /* Create parser definition stucture */
TexParser(void)500 extern parserDefinition* TexParser (void)
501 {
502 	static const char *const extensions [] = { "tex", NULL };
503 	parserDefinition *const def = parserNew ("Tex");
504 	def->extensions = extensions;
505 	/*
506 	 * New definitions for parsing instead of regex
507 	 */
508 	def->kinds		= TexKinds;
509 	def->kindCount	= KIND_COUNT (TexKinds);
510 	def->parser		= findTexTags;
511 	def->initialize = initialize;
512 
513 	return def;
514 }
515 /* vi:set tabstop=4 shiftwidth=4 noexpandtab: */
516