1 /*=============================================================================
2    GNU UnRTF, a command-line program to convert RTF documents to other formats.
3    Copyright (C) 2000,2001 Zachary Thayer Smith
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software
17    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 
19    The author is reachable by electronic mail at tuorfa@yahoo.com.
20 =============================================================================*/
21 
22 
23 /*----------------------------------------------------------------------
24  * Module name:    parse
25  * Author name:    Zach Smith
26  * Create date:    01 Sep 00
27  * Purpose:        Parsing of the RTF file into a structure of Word objects.
28  *----------------------------------------------------------------------
29  * Changes:
30  * 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
31  * 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
32  * 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
33  * 03 Aug 01, tuorfa@yahoo.com: added input buffering
34  * 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
35  * 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
36  * 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks
37  * 08 Sep 03, daved@physiol.usyd.edu.au:  type fixes; ANSI C fixes
38  * 29 Mar 05, daved@physiol.usyd.edu.au: changes requested by ZT Smith
39  * 16 Dec 07, daved@physiol.usyd.edu.au: updated to GPL v3
40  *--------------------------------------------------------------------*/
41 
42 #ifdef HAVE_CONFIG_H
43 #include <config.h>
44 #endif
45 
46 #ifdef HAVE_STDIO_H
47 #include <stdio.h>
48 #endif
49 
50 #ifdef HAVE_STDLIB_H
51 #include <stdlib.h>
52 #endif
53 
54 #ifdef HAVE_CTYPE_H
55 #include <ctype.h>
56 #endif
57 
58 #ifdef HAVE_STRING_H
59 #include <string.h>
60 #endif
61 
62 #include "defs.h"
63 #include "parse.h"
64 #include "malloc.h"
65 #include "main.h"
66 #include "error.h"
67 #include "word.h"
68 #include "hash.h"
69 
70 
71 
/* Push-back stack shared by my_unget_char() and my_getchar().
 * A negative value marks an empty slot; ungot_char is the top of the
 * stack, i.e. the character that will be returned next.
 */
static int ungot_char = -1;
static int ungot_char2 = -1;
static int ungot_char3 = -1;


/*========================================================================
 * Name:	my_unget_char
 * Purpose:	Pushes a character back onto the three-slot unget stack so
 *		that the next my_getchar() call returns it.
 * Args:	Character to push back.
 * Returns:	None.
 * Note:	error_handler() is invoked when all three slots are
 *		already occupied.
 *=======================================================================*/

static void my_unget_char(int ch)
{
	const int all_slots_taken =
	    (ungot_char >= 0) && (ungot_char2 >= 0) && (ungot_char3 >= 0);

	if (all_slots_taken)
	{
		error_handler("More than 3 ungot chars");
	}

	/* Shift the existing entries one slot deeper, oldest first, and
	 * place the new character on top.
	 */
	ungot_char3 = ungot_char2;
	ungot_char2 = ungot_char;
	ungot_char = ch;
}
96 
97 
98 static int last_returned_ch = 0;
99 
100 
101 #define READ_BUF_LEN 2048
102 static int buffer_size = 0;
103 static char *read_buf = NULL;
104 static int read_buf_end = 0;
105 static int read_buf_index = 0;
106 
107 
108 
109 
110 
111 /*========================================================================
112  * Name:	my_getchar
113  * Purpose:	Gets a character: either an ungot one, or a buffered one.
114  * Args:	Input file.
115  * Returns:	Character, or EOF.
116  *=======================================================================*/
117 
my_getchar(FILE * f)118 static int my_getchar(FILE *f)
119 {
120 	int ch;
121 
122 	CHECK_PARAM_NOT_NULL(f);
123 
124 	if (ungot_char >= 0)
125 	{
126 		ch = ungot_char;
127 		ungot_char = ungot_char2;
128 		ungot_char2 = ungot_char3;
129 		ungot_char3 = -1;
130 		last_returned_ch = ch;
131 		if (ch > 255)
132 		{
133 			fprintf(stderr, "returning bad ch = '%c' (0%o)\n",
134 			        ch, ch);
135 		}
136 		return ch;
137 	}
138 	do
139 	{
140 		if (read_buf_index >= read_buf_end)
141 		{
142 			if (!read_buf)
143 			{
144 				buffer_size = READ_BUF_LEN;
145 				read_buf = my_malloc(buffer_size);
146 				if (!read_buf)
147 				{
148 					buffer_size /= 4;
149 					read_buf = my_malloc(buffer_size);
150 					if (!read_buf)
151 					{
152 						error_handler("Cannot allocate read buffer");
153 					}
154 				}
155 			}
156 			read_buf_end = fread(read_buf, 1, buffer_size, f);
157 			read_buf_index = 0;
158 			if (!read_buf_end)
159 			{
160 				return EOF;
161 			}
162 		}
163 		ch = read_buf [read_buf_index++];
164 
165 		if (ch == '\n')
166 		{
167 			lineno++;
168 			/* Convert \(newline) into \par here */
169 			if (last_returned_ch == '\\')
170 			{
171 				my_unget_char(' ');
172 				my_unget_char('r');
173 				my_unget_char('a');
174 				ch = 'p';
175 				break;
176 			}
177 		}
178 	}
179 	while (ch == '\r' /* || ch=='\n' */);
180 
181 	if (ch == '\t')
182 	{
183 		ch = ' ';
184 	}
185 
186 	last_returned_ch = ch;
187 	if (ch > 255)
188 	{
189 		fprintf(stderr, "returning bad ch '%c' (0%o)\n", ch, ch);
190 		exit(1);
191 	}
192 	return ch;
193 }
194 
195 
196 /*========================================================================
197  * Name:	my_skip
198  * Purpose:	Skips the given number of characters.
199  * Args:	Input file, number of characters.
200  * Returns:	None.
201  *=======================================================================*/
202 
my_skip(FILE * f,long n)203 static void my_skip(FILE *f, long n)
204 {
205 	n += read_buf_index;
206 	if (n >= 0 && n < read_buf_end)
207 	{
208 		read_buf_index = (int)n;
209 		return;
210 	}
211 	read_buf_end = read_buf_index = 0;
212 	if (fseek(f, n - read_buf_end, SEEK_CUR))
213 	{
214 		error_handler("Cannot seek");
215 	}
216 	return;
217 }
218 
219 /* local to read_word */
220 static char *input_str = NULL;
221 static unsigned long current_max_length = 1;
222 
223 
224 
225 /*========================================================================
226  * Name:	expand_word_buffer
227  * Purpose:	Expands the buffer used to store an incoming word.
228  *		This allows us to remove the limit on word length.
229  * Args:	None.
230  * Returns:	None.
231  *=======================================================================*/
232 
233 static int
expand_word_buffer()234 expand_word_buffer()
235 {
236 	char *new_ptr;
237 	unsigned long old_length;
238 	if (!input_str)
239 	{
240 		error_handler("No input buffer allocated");
241 	}
242 	old_length = current_max_length;
243 	current_max_length *= 2;
244 	new_ptr = my_malloc(current_max_length);
245 	if (!new_ptr)
246 	{
247 		error_handler("Out of memory while resizing buffer");
248 	}
249 
250 	memcpy(new_ptr, input_str, old_length);
251 	my_free(input_str);
252 	input_str = new_ptr;
253 	return TRUE;
254 }
255 
256 
257 
258 
259 /*========================================================================
260  * Name:	read_word
261  * Purpose:	The core of the parser, this reads a word.
262  * Args:	Input file.
263  * Returns:	Number of characters in the word, or zero.
264  * Note:	The word buffer is static and local to this file.
265  *=======================================================================*/
266 
267 static int
read_word(FILE * f)268 read_word(FILE *f)
269 {
270 	int ch, ch2;
271 	unsigned long ix = 0;
272 	int have_whitespace = FALSE;
273 	int is_control_word = FALSE;
274 	int has_numeric_param = FALSE; /* if is_control_word==TRUE */
275 	int need_unget = FALSE;
276 
277 	CHECK_PARAM_NOT_NULL(f);
278 
279 	if (input_str == NULL)
280 	{
281 		/* Get some storage for a word.
282 		 */
283 		current_max_length = 10; /* XX */
284 		input_str = my_malloc(current_max_length);
285 		if (!input_str)
286 		{
287 			error_handler("Cannot allocate word storage");
288 		}
289 	}
290 
291 	do
292 	{
293 		ch = my_getchar(f);
294 	}
295 	while (ch == '\n');
296 
297 	if (ch == ' ')
298 	{
299 		/* Compress multiple space chars down to one.
300 		 */
301 		while (ch == ' ')
302 		{
303 			ch = my_getchar(f);
304 			have_whitespace = TRUE;
305 		}
306 		if (have_whitespace)
307 		{
308 			my_unget_char(ch);
309 			input_str[0] = ' ';
310 			input_str[1] = 0;
311 			return 1;
312 		}
313 	}
314 
315 	switch (ch)
316 	{
317 	case EOF:
318 		return 0;
319 
320 	case '\\':
321 		ch2 = my_getchar(f);
322 
323 		/* Look for two-character command words.
324 		 */
325 		switch (ch2)
326 		{
327 		case '\n':
328 			strcpy(input_str, "\\par");
329 			return 4;
330 		case '~':
331 		case '{':
332 		case '}':
333 		case '\\':
334 		case '_':
335 		case '-':
336 			input_str[0] = '\\';
337 			input_str[1] = ch2;
338 			input_str[2] = 0;
339 			return 2;
340 		case '\'':
341 			/* Preserve \'## expressions (hex char exprs) for later.
342 			 */
343 			input_str[0] = '\\';
344 			input_str[1] = '\'';
345 			ix = 2;
346 			if (ix == current_max_length)
347 			{
348 				if (!expand_word_buffer())
349 				{
350 					error_handler("Word too long");
351 				}
352 			}
353 			ch = my_getchar(f);
354 			input_str[ix++] = ch;
355 			if (ix == current_max_length)
356 			{
357 				if (!expand_word_buffer())
358 				{
359 					error_handler("Word too long");
360 				}
361 			}
362 			ch = my_getchar(f);
363 			input_str[ix++] = ch;
364 			if (ix == current_max_length)
365 			{
366 				if (!expand_word_buffer())
367 				{
368 					error_handler("Word too long");
369 				}
370 			}
371 			input_str[ix] = 0;
372 			return ix;
373 		}
374 
375 		is_control_word = TRUE;
376 		ix = 1;
377 		input_str[0] = ch;
378 		ch = ch2;
379 		break;
380 
381 	case '\t':
382 		/* In RTF, a tab char is the same as \tab.
383 		 */
384 		strcpy(input_str, "\\tab");
385 		return 4;
386 
387 	case '{':
388 	case '}':
389 	case ';':
390 		input_str[0] = ch;
391 		input_str[1] = 0;
392 		return 1;
393 
394 	}
395 
396 	while (ch != EOF)
397 	{
398 		/* Several chars always ends a word, and we need to save them.
399 		 */
400 		if (ch == '\t' || ch == '{' || ch == '}' || ch == '\\')
401 		{
402 			need_unget = TRUE;
403 			break;
404 		}
405 
406 		/* A newline always ends a command word; we don't save it.
407 		 * A newline is ignored if this is not a command word.
408 		 */
409 		if (ch == '\n')
410 		{
411 			if (is_control_word)
412 			{
413 				break;
414 			}
415 			ch = my_getchar(f);
416 			continue;
417 		}
418 
419 		/* A semicolon always ends a command word; we do save it.
420 		 * A semicolon never ends a regular word.
421 		 */
422 		if (ch == ';')
423 		{
424 			if (is_control_word)
425 			{
426 				need_unget = TRUE;
427 				break;
428 			}
429 		}
430 
431 		/* In this parser, a space character terminates
432 		 * any word, and if it does not follow a command,
433 		 * then it is a word in itself.
434 		 */
435 		if (ch == ' ')
436 		{
437 			if (!is_control_word)
438 			{
439 				need_unget = TRUE;
440 			}
441 			break;
442 		}
443 
444 		/* Identify a control word's numeric parameter.
445 		 */
446 		if (is_control_word)
447 		{
448 			if (!has_numeric_param && (isdigit(ch) || ch == '-'))
449 			{
450 				has_numeric_param = TRUE;
451 			}
452 			else if (has_numeric_param && !isdigit(ch))
453 			{
454 				if (ch != ' ')
455 				{
456 					need_unget = TRUE;
457 				}
458 				break;
459 			}
460 		}
461 
462 		input_str[ix++] = ch;
463 		if (ix == current_max_length)
464 		{
465 			if (!expand_word_buffer())
466 			{
467 				error_handler("Word too long");
468 			}
469 		}
470 		ch = my_getchar(f);
471 	}
472 
473 	if (need_unget)
474 	{
475 		my_unget_char(ch);
476 	}
477 
478 	input_str[ix] = 0;
479 
480 	if (!memcmp(input_str, "\\bin", 4) && isdigit(input_str[4]))
481 	{
482 		my_skip(f, atoi(input_str + 4));
483 	}
484 
485 	return ix;
486 }
487 
488 
489 
490 /*========================================================================
491  * Name:	word_read
492  * Purpose:	This is the recursive metareader which pieces together the
493  *			structure of Word objects.
494  * Args:	Input file.
495  * Returns:	Tree of Word objects.
496  *=======================================================================*/
497 
498 Word *
word_read(FILE * f)499 word_read(FILE *f)
500 {
501 	Word *prev_word = NULL;
502 	Word *first_word = NULL;
503 	Word *new_word = NULL;  /* temp */
504 
505 	CHECK_PARAM_NOT_NULL(f);
506 
507 	do
508 	{
509 		if (!read_word(f))
510 		{
511 			return first_word;
512 		}
513 
514 		if (input_str[0] == '{')
515 		{
516 			/* Process subwords */
517 
518 			/* Create a dummy word to point to a sublist */
519 			new_word = word_new(NULL);
520 			if (!new_word)
521 			{
522 				error_handler("Cannot allocate word");
523 			}
524 
525 			/* Get the sublist */
526 			new_word->child = word_read(f);
527 
528 		}
529 		else if (input_str[0] == '}')
530 		{
531 			return first_word;
532 		}
533 		else
534 		{
535 			new_word = word_new(input_str);
536 		}
537 
538 		if (prev_word)
539 		{
540 			prev_word->next = new_word;
541 		}
542 
543 		if (!first_word)
544 		{
545 			first_word = new_word;
546 		}
547 
548 		prev_word = new_word;
549 
550 		/* Free up the memory allocated by read_word. */
551 		my_free(input_str);
552 		input_str = NULL;
553 	}
554 	while (1);
555 }
556