1 /*
2  *   Copyright (c) 2000-2001, Jérôme Plût
3  *   Copyright (c) 2006, Enrico Tröger
4  *   Copyright (c) 2019, Mirco Schönfeld
5  *
6  *   This source code is released for free distribution under the terms of the
7  *   GNU General Public License.
8  *
9  *   This module contains functions for generating tags for source files
10  *   for the BibTeX formatting system.
11  *   https://en.wikipedia.org/wiki/BibTeX
12  */
13 
14 /*
15  *	 INCLUDE FILES
16  */
17 #include "general.h"	/* must always come first */
18 #include <ctype.h>	/* to define isalpha () */
19 #include <string.h>
20 
21 #include "debug.h"
22 #include "entry.h"
23 #include "keyword.h"
24 #include "parse.h"
25 #include "read.h"
26 #include "routines.h"
27 #include "vstring.h"
28 
29 /*
30  *	 MACROS
31  */
/* True when the token's type field equals t. */
#define isType(token,t)		((bool) ((token)->type == (t)))
/* True when the token's keyword field equals k. */
#define isKeyword(token,k)	((bool) ((token)->keyword == (k)))
/*
 * True when c may appear in an identifier: a letter, a digit, '_', '-'
 * or '+'.  The argument is expanded several times, so it must not have
 * side effects.
 */
#define isIdentChar(c) \
	(isalnum (c) || (c) == '_' || (c) == '-' || (c) == '+')
36 
37 /*
38  *	 DATA DECLARATIONS
39  */
40 
41 /*
42  * Used to specify type of keyword.
43  */
enum eKeywordId {
	/* one constant per entry-type keyword listed in BibKeywordTable */
	KEYWORD_article = 0,
	KEYWORD_book,
	KEYWORD_booklet,
	KEYWORD_conference,
	KEYWORD_inbook,
	KEYWORD_incollection,
	KEYWORD_inproceedings,
	KEYWORD_manual,
	KEYWORD_mastersthesis,
	KEYWORD_misc,
	KEYWORD_phdthesis,
	KEYWORD_proceedings,
	KEYWORD_string,
	KEYWORD_techreport,
	KEYWORD_unpublished,
};
/* plain int rather than the enum, to allow KEYWORD_NONE */
typedef int keywordId;
62 
enum eTokenType {
	/*
	 * Values 0..255 stand for the byte that was read; only the ones this
	 * parser needs to name get a constant.
	 */
	TOKEN_OPEN_CURLY = '{',

	/* values above the byte range are the special token types */
	TOKEN_UNDEFINED  = 256,
	TOKEN_KEYWORD    = 257,
	TOKEN_IDENTIFIER = 258
};
typedef int tokenType;
72 
73 typedef struct sTokenInfo {
74 	tokenType		type;
75 	keywordId		keyword;
76 	vString *		string;
77 	unsigned long 	lineNumber;
78 	MIOPos 			filePosition;
79 } tokenInfo;
80 
81 /*
82  *	DATA DEFINITIONS
83  */
84 
85 static langType Lang_bib;
86 
/*
 * Tag kinds; each value is used as an index into BibKinds.  BIBTAG_COUNT
 * is not a kind: it is the number of kinds and is checked against the
 * size of BibKinds in BibtexParser ().
 */
typedef enum {
	BIBTAG_ARTICLE = 0,
	BIBTAG_BOOK,
	BIBTAG_BOOKLET,
	BIBTAG_CONFERENCE,
	BIBTAG_INBOOK,
	BIBTAG_INCOLLECTION,
	BIBTAG_INPROCEEDINGS,
	BIBTAG_MANUAL,
	BIBTAG_MASTERSTHESIS,
	BIBTAG_MISC,
	BIBTAG_PHDTHESIS,
	BIBTAG_PROCEEDINGS,
	BIBTAG_STRING,
	BIBTAG_TECHREPORT,
	BIBTAG_UNPUBLISHED,
	BIBTAG_COUNT
} bibKind;
105 
106 static kindDefinition BibKinds [] = {
107 	{ true,  'a', "article",				"article"				},
108 	{ true,  'b', "book",						"book"					},
109 	{ true,  'B', "booklet",				"booklet"				},
110 	{ true,  'c', "conference",			"conference"		},
111 	{ true,  'i', "inbook",					"inbook"				},
112 	{ true,  'I', "incollection",		"incollection"	},
113 	{ true,  'j', "inproceedings",	"inproceedings"	},
114 	{ true,  'm', "manual",					"manual"				},
115 	{ true,  'M', "mastersthesis",	"mastersthesis"	},
116 	{ true,  'n', "misc",						"misc"					},
117 	{ true,  'p', "phdthesis",			"phdthesis"			},
118 	{ true,  'P', "proceedings",		"proceedings"		},
119 	{ true,  's', "string",					"string"				},
120 	{ true,  't', "techreport",			"techreport"		},
121 	{ true,  'u', "unpublished",		"unpublished"		}
122 };
123 
124 static const keywordTable BibKeywordTable [] = {
125 	/* keyword			  keyword ID */
126 	{ "article",	    KEYWORD_article				},
127 	{ "book",	        KEYWORD_book				  },
128 	{ "booklet",	    KEYWORD_booklet				},
129 	{ "conference",	  KEYWORD_conference		},
130 	{ "inbook",	      KEYWORD_inbook				},
131 	{ "incollection",	KEYWORD_incollection	},
132 	{ "inproceedings",KEYWORD_inproceedings	},
133 	{ "manual",	      KEYWORD_manual				},
134 	{ "mastersthesis",KEYWORD_mastersthesis	},
135 	{ "misc",	        KEYWORD_misc				  },
136 	{ "phdthesis",	  KEYWORD_phdthesis			},
137 	{ "proceedings",	KEYWORD_proceedings		},
138 	{ "string",				KEYWORD_string				},
139 	{ "techreport",	  KEYWORD_techreport		},
140 	{ "unpublished",	KEYWORD_unpublished		}
141 };
142 
143 /*
144  *	 FUNCTION DEFINITIONS
145  */
146 
newToken(void)147 static tokenInfo *newToken (void)
148 {
149 	tokenInfo *const token = xMalloc (1, tokenInfo);
150 
151 	token->type			= TOKEN_UNDEFINED;
152 	token->keyword		= KEYWORD_NONE;
153 	token->string		= vStringNew ();
154 	token->lineNumber   = getInputLineNumber ();
155 	token->filePosition = getInputFilePosition ();
156 
157 	return token;
158 }
159 
/*
 * Releases a token: first the string it holds, then the token itself.
 */
static void deleteToken (tokenInfo *const token)
{
	/* the string must go first; it is reached through the token */
	vStringDelete (token->string);
	eFree (token);
}
165 
166 /*
167  *	 Tag generation functions
168  */
/*
 * Emits a tag of the given kind, named after the token's string and
 * located at the token's line number and file position.  Nothing is
 * emitted when the kind is disabled in BibKinds.
 */
static void makeBibTag (tokenInfo *const token, bibKind kind)
{
	tagEntryInfo entry;

	if (! BibKinds [kind].enabled)
		return;

	initTagEntry (&entry, vStringValue (token->string), kind);
	entry.lineNumber   = token->lineNumber;
	entry.filePosition = token->filePosition;

	makeTagEntry (&entry);
}
183 
184 /*
185  *	 Parsing functions
186  */
187 
188 /*
189  *	Read a C identifier beginning with "firstChar" and places it into
190  *	"name".
191  */
parseIdentifier(vString * const string,const int firstChar)192 static void parseIdentifier (vString *const string, const int firstChar)
193 {
194 	int c = firstChar;
195 	Assert (isIdentChar (c));
196 	do
197 	{
198 		vStringPut (string, c);
199 		c = getcFromInputFile ();
200 	} while (c != EOF && isIdentChar (c));
201 	if (c != EOF)
202 		ungetcToInputFile (c);		/* unget non-identifier character */
203 }
204 
/*
 * Reads the next token from the input into "token".
 *
 * Blanks, tabs and newlines are skipped, as are '%' comments, which run
 * to the end of the line.  The token's line number and file position are
 * recorded after its first character has been read.  The resulting type
 * is:
 *   - TOKEN_KEYWORD or TOKEN_IDENTIFIER for '@' directly followed by a
 *     letter, depending on whether the word after '@' is a known keyword
 *     (looked up case-insensitively); the string holds the word with
 *     its leading '@';
 *   - '@' itself when no letter follows (the following character is
 *     pushed back);
 *   - TOKEN_IDENTIFIER for a run of identifier characters;
 *   - the byte value for any other character.
 *
 * Returns false when the end of the input is reached, true otherwise.
 */
static bool readToken (tokenInfo *const token)
{
	int ch;

	token->type    = TOKEN_UNDEFINED;
	token->keyword = KEYWORD_NONE;
	vStringClear (token->string);

	for (;;)
	{
		/* skip white space */
		do
			ch = getcFromInputFile ();
		while (ch == ' ' || ch == '\t' || ch == '\n');

		token->lineNumber   = getInputLineNumber ();
		token->filePosition = getInputFilePosition ();
		token->type         = (unsigned char) ch;

		if (ch != '%')
			break;

		/* % are single line comments: drop the rest of the line, retry */
		skipToCharacterInInputFile ('\n');
	}

	if (ch == EOF)
		return false;

	if (ch == '@')
	{
		/*
		 * All Bib entries start with an at symbol.  Only an '@' that is
		 * directly followed by a letter can start an entry.
		 */
		const int next = getcFromInputFile ();

		if (isalpha (next))
		{
			vStringPut (token->string, '@');
			parseIdentifier (token->string, next);
			/* the lookup must not see the leading '@' */
			token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
			token->type = isKeyword (token, KEYWORD_NONE)
				? TOKEN_IDENTIFIER
				: TOKEN_KEYWORD;
		}
		else
			ungetcToInputFile (next);
	}
	else if (isIdentChar (ch))
	{
		parseIdentifier (token->string, ch);
		token->type = TOKEN_IDENTIFIER;
	}

	return true;
}
263 
/*
 * Copies every field of "src" into "dest".  The string contents are
 * copied into the string "dest" already holds, so the two tokens never
 * share a string.
 */
static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	dest->type         = src->type;
	dest->keyword      = src->keyword;
	dest->lineNumber   = src->lineNumber;
	dest->filePosition = src->filePosition;
	vStringCopy (dest->string, src->string);
}
272 
273 /*
274  *	 Scanning functions
275  */
276 
/*
 * Handles one entry whose keyword token is in "token" and emits a tag of
 * the given kind for its key.
 *
 * Bib entries are of these formats:
 *   @article{identifier,
 *   author="John Doe"}
 *
 * The tag is named after the identifier that follows the opening brace
 * and carries the line number and file position of the keyword token.
 * No tag is made when the keyword is not followed by '{'.
 *
 * Returns true when the caller should stop scanning: either the input
 * ended, or the token after '{' is not an identifier.  Returns false
 * otherwise.
 * NOTE(review): a missing key therefore stops the scan of the rest of
 * the file as well -- confirm that this is intended.
 */
static bool parseTag (tokenInfo *const token, bibKind kind)
{
	tokenInfo *const entry = newToken ();
	vString *const key = vStringNew ();
	bool stop = true;		/* cleared only when the end is reached normally */

	if (isType (token, TOKEN_KEYWORD))
	{
		/* remember where the entry starts before moving on */
		copyToken (entry, token);
		if (! readToken (token))
			goto cleanup;
	}

	if (isType (token, TOKEN_OPEN_CURLY))
	{
		if (! readToken (token))
			goto cleanup;

		/* should find an identifier for bib item at first place */
		if (! isType (token, TOKEN_IDENTIFIER))
			goto cleanup;

		vStringCat (key, token->string);
		vStringStripTrailing (key);
		if (vStringLength (key) > 0)
		{
			vStringCopy (entry->string, key);
			makeBibTag (entry, kind);
		}
	}

	stop = false;

 cleanup:
	deleteToken (entry);
	vStringDelete (key);
	return stop;
}
332 
/*
 * Scans the whole input.  Every keyword token is handed to parseTag ()
 * together with the tag kind that belongs to its keyword; all other
 * tokens are skipped.  The scan ends when readToken () reports the end
 * of the input or when parseTag () returns true.
 */
static void parseBibFile (tokenInfo *const token)
{
	/* tag kind to emit for each entry-type keyword */
	static const bibKind kindForKeyword [] = {
		[KEYWORD_article]       = BIBTAG_ARTICLE,
		[KEYWORD_book]          = BIBTAG_BOOK,
		[KEYWORD_booklet]       = BIBTAG_BOOKLET,
		[KEYWORD_conference]    = BIBTAG_CONFERENCE,
		[KEYWORD_inbook]        = BIBTAG_INBOOK,
		[KEYWORD_incollection]  = BIBTAG_INCOLLECTION,
		[KEYWORD_inproceedings] = BIBTAG_INPROCEEDINGS,
		[KEYWORD_manual]        = BIBTAG_MANUAL,
		[KEYWORD_mastersthesis] = BIBTAG_MASTERSTHESIS,
		[KEYWORD_misc]          = BIBTAG_MISC,
		[KEYWORD_phdthesis]     = BIBTAG_PHDTHESIS,
		[KEYWORD_proceedings]   = BIBTAG_PROCEEDINGS,
		[KEYWORD_string]        = BIBTAG_STRING,
		[KEYWORD_techreport]    = BIBTAG_TECHREPORT,
		[KEYWORD_unpublished]   = BIBTAG_UNPUBLISHED
	};

	while (readToken (token))
	{
		if (! isType (token, TOKEN_KEYWORD))
			continue;

		const keywordId kw = token->keyword;

		/* a keyword without a table entry produces no tag */
		if (kw < 0 || kw >= (keywordId) ARRAY_SIZE (kindForKeyword))
			continue;

		if (parseTag (token, kindForKeyword [kw]))
			break;
	}
}
399 
initialize(const langType language)400 static void initialize (const langType language)
401 {
402 	Lang_bib = language;
403 }
404 
findBibTags(void)405 static void findBibTags (void)
406 {
407 	tokenInfo *const token = newToken ();
408 
409 	parseBibFile (token);
410 
411 	deleteToken (token);
412 }
413 
414 /* Create parser definition structure */
BibtexParser(void)415 extern parserDefinition* BibtexParser (void)
416 {
417 	Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
418 	static const char *const extensions [] = { "bib", NULL };
419 	parserDefinition *const def = parserNew ("BibTeX");
420 	def->extensions = extensions;
421 	/*
422 	 * New definitions for parsing instead of regex
423 	 */
424 	def->kindTable		= BibKinds;
425 	def->kindCount		= ARRAY_SIZE (BibKinds);
426 	def->parser				= findBibTags;
427 	def->initialize		= initialize;
428 	def->keywordTable	= BibKeywordTable;
429 	def->keywordCount	= ARRAY_SIZE (BibKeywordTable);
430 	return def;
431 }
432