1 /*=============================================================================
2 GNU UnRTF, a command-line program to convert RTF documents to other formats.
3 Copyright (C) 2000,2001 Zachary Thayer Smith
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
19 The author is reachable by electronic mail at tuorfa@yahoo.com.
20 =============================================================================*/
21
22
23 /*----------------------------------------------------------------------
24 * Module name: parse
25 * Author name: Zach Smith
26 * Create date: 01 Sep 00
27 * Purpose: Parsing of the RTF file into a structure of Word objects.
28 *----------------------------------------------------------------------
29 * Changes:
30 * 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
31 * 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
32 * 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
33 * 03 Aug 01, tuorfa@yahoo.com: added input buffering
34 * 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
35 * 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
36 * 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks
37 * 08 Sep 03, daved@physiol.usyd.edu.au: type fixes; ANSI C fixes
38 * 29 Mar 05, daved@physiol.usyd.edu.au: changes requested by ZT Smith
39 * 16 Dec 07, daved@physiol.usyd.edu.au: updated to GPL v3
40 *--------------------------------------------------------------------*/
41
42 #ifdef HAVE_CONFIG_H
43 #include <config.h>
44 #endif
45
46 #ifdef HAVE_STDIO_H
47 #include <stdio.h>
48 #endif
49
50 #ifdef HAVE_STDLIB_H
51 #include <stdlib.h>
52 #endif
53
54 #ifdef HAVE_CTYPE_H
55 #include <ctype.h>
56 #endif
57
58 #ifdef HAVE_STRING_H
59 #include <string.h>
60 #endif
61
62 #include "defs.h"
63 #include "parse.h"
64 #include "malloc.h"
65 #include "main.h"
66 #include "error.h"
67 #include "word.h"
68 #include "hash.h"
69
70
71
/*
 * Pushback slots shared by my_unget_char() and my_getchar().
 * A value of -1 marks an empty slot.
 */
static int ungot_char = -1;     /* handed back first by my_getchar() */
static int ungot_char2 = -1;    /* handed back second */
static int ungot_char3 = -1;    /* handed back last */
76
77
78 /*========================================================================
79 * Name: my_unget_char
80 * Purpose: My own unget routine, handling up to 3 ungot characters.
81 * Args: Character.
82 * Returns: None.
83 *=======================================================================*/
84
my_unget_char(int ch)85 static void my_unget_char(int ch)
86 {
87 if (ungot_char >= 0 && ungot_char2 >= 0 && ungot_char3 >= 0)
88 {
89 error_handler("More than 3 ungot chars");
90 }
91
92 ungot_char3 = ungot_char2;
93 ungot_char2 = ungot_char;
94 ungot_char = ch;
95 }
96
97
/* Most recent character returned by my_getchar(). */
static int last_returned_ch = 0;


/* Input buffering state used by my_getchar() and my_skip(). */
#define READ_BUF_LEN 2048
static int buffer_size = 0;         /* size requested for read_buf */
static char *read_buf = NULL;       /* allocated on the first buffered read */
static int read_buf_end = 0;        /* number of bytes fread() put in read_buf */
static int read_buf_index = 0;      /* index of the next byte to hand out */
106
107
108
109
110
111 /*========================================================================
112 * Name: my_getchar
113 * Purpose: Gets a character: either an ungot one, or a buffered one.
114 * Args: Input file.
115 * Returns: Character, or EOF.
116 *=======================================================================*/
117
my_getchar(FILE * f)118 static int my_getchar(FILE *f)
119 {
120 int ch;
121
122 CHECK_PARAM_NOT_NULL(f);
123
124 if (ungot_char >= 0)
125 {
126 ch = ungot_char;
127 ungot_char = ungot_char2;
128 ungot_char2 = ungot_char3;
129 ungot_char3 = -1;
130 last_returned_ch = ch;
131 if (ch > 255)
132 {
133 fprintf(stderr, "returning bad ch = '%c' (0%o)\n",
134 ch, ch);
135 }
136 return ch;
137 }
138 do
139 {
140 if (read_buf_index >= read_buf_end)
141 {
142 if (!read_buf)
143 {
144 buffer_size = READ_BUF_LEN;
145 read_buf = my_malloc(buffer_size);
146 if (!read_buf)
147 {
148 buffer_size /= 4;
149 read_buf = my_malloc(buffer_size);
150 if (!read_buf)
151 {
152 error_handler("Cannot allocate read buffer");
153 }
154 }
155 }
156 read_buf_end = fread(read_buf, 1, buffer_size, f);
157 read_buf_index = 0;
158 if (!read_buf_end)
159 {
160 return EOF;
161 }
162 }
163 ch = read_buf [read_buf_index++];
164
165 if (ch == '\n')
166 {
167 lineno++;
168 /* Convert \(newline) into \par here */
169 if (last_returned_ch == '\\')
170 {
171 my_unget_char(' ');
172 my_unget_char('r');
173 my_unget_char('a');
174 ch = 'p';
175 break;
176 }
177 }
178 }
179 while (ch == '\r' /* || ch=='\n' */);
180
181 if (ch == '\t')
182 {
183 ch = ' ';
184 }
185
186 last_returned_ch = ch;
187 if (ch > 255)
188 {
189 fprintf(stderr, "returning bad ch '%c' (0%o)\n", ch, ch);
190 exit(1);
191 }
192 return ch;
193 }
194
195
196 /*========================================================================
197 * Name: my_skip
198 * Purpose: Skips the given number of characters.
199 * Args: Input file, number of characters.
200 * Returns: None.
201 *=======================================================================*/
202
my_skip(FILE * f,long n)203 static void my_skip(FILE *f, long n)
204 {
205 n += read_buf_index;
206 if (n >= 0 && n < read_buf_end)
207 {
208 read_buf_index = (int)n;
209 return;
210 }
211 read_buf_end = read_buf_index = 0;
212 if (fseek(f, n - read_buf_end, SEEK_CUR))
213 {
214 error_handler("Cannot seek");
215 }
216 return;
217 }
218
/* Word buffer filled by read_word(). */
static char *input_str = NULL;                  /* storage for the current word */
static unsigned long current_max_length = 1;    /* size of input_str, in bytes */
222
223
224
225 /*========================================================================
226 * Name: expand_word_buffer
227 * Purpose: Expands the buffer used to store an incoming word.
228 * This allows us to remove the limit on word length.
229 * Args: None.
230 * Returns: None.
231 *=======================================================================*/
232
233 static int
expand_word_buffer()234 expand_word_buffer()
235 {
236 char *new_ptr;
237 unsigned long old_length;
238 if (!input_str)
239 {
240 error_handler("No input buffer allocated");
241 }
242 old_length = current_max_length;
243 current_max_length *= 2;
244 new_ptr = my_malloc(current_max_length);
245 if (!new_ptr)
246 {
247 error_handler("Out of memory while resizing buffer");
248 }
249
250 memcpy(new_ptr, input_str, old_length);
251 my_free(input_str);
252 input_str = new_ptr;
253 return TRUE;
254 }
255
256
257
258
259 /*========================================================================
260 * Name: read_word
261 * Purpose: The core of the parser, this reads a word.
262 * Args: Input file.
263 * Returns: Number of characters in the word, or zero.
264 * Note: The word buffer is static and local to this file.
265 *=======================================================================*/
266
267 static int
read_word(FILE * f)268 read_word(FILE *f)
269 {
270 int ch, ch2;
271 unsigned long ix = 0;
272 int have_whitespace = FALSE;
273 int is_control_word = FALSE;
274 int has_numeric_param = FALSE; /* if is_control_word==TRUE */
275 int need_unget = FALSE;
276
277 CHECK_PARAM_NOT_NULL(f);
278
279 if (input_str == NULL)
280 {
281 /* Get some storage for a word.
282 */
283 current_max_length = 10; /* XX */
284 input_str = my_malloc(current_max_length);
285 if (!input_str)
286 {
287 error_handler("Cannot allocate word storage");
288 }
289 }
290
291 do
292 {
293 ch = my_getchar(f);
294 }
295 while (ch == '\n');
296
297 if (ch == ' ')
298 {
299 /* Compress multiple space chars down to one.
300 */
301 while (ch == ' ')
302 {
303 ch = my_getchar(f);
304 have_whitespace = TRUE;
305 }
306 if (have_whitespace)
307 {
308 my_unget_char(ch);
309 input_str[0] = ' ';
310 input_str[1] = 0;
311 return 1;
312 }
313 }
314
315 switch (ch)
316 {
317 case EOF:
318 return 0;
319
320 case '\\':
321 ch2 = my_getchar(f);
322
323 /* Look for two-character command words.
324 */
325 switch (ch2)
326 {
327 case '\n':
328 strcpy(input_str, "\\par");
329 return 4;
330 case '~':
331 case '{':
332 case '}':
333 case '\\':
334 case '_':
335 case '-':
336 input_str[0] = '\\';
337 input_str[1] = ch2;
338 input_str[2] = 0;
339 return 2;
340 case '\'':
341 /* Preserve \'## expressions (hex char exprs) for later.
342 */
343 input_str[0] = '\\';
344 input_str[1] = '\'';
345 ix = 2;
346 if (ix == current_max_length)
347 {
348 if (!expand_word_buffer())
349 {
350 error_handler("Word too long");
351 }
352 }
353 ch = my_getchar(f);
354 input_str[ix++] = ch;
355 if (ix == current_max_length)
356 {
357 if (!expand_word_buffer())
358 {
359 error_handler("Word too long");
360 }
361 }
362 ch = my_getchar(f);
363 input_str[ix++] = ch;
364 if (ix == current_max_length)
365 {
366 if (!expand_word_buffer())
367 {
368 error_handler("Word too long");
369 }
370 }
371 input_str[ix] = 0;
372 return ix;
373 }
374
375 is_control_word = TRUE;
376 ix = 1;
377 input_str[0] = ch;
378 ch = ch2;
379 break;
380
381 case '\t':
382 /* In RTF, a tab char is the same as \tab.
383 */
384 strcpy(input_str, "\\tab");
385 return 4;
386
387 case '{':
388 case '}':
389 case ';':
390 input_str[0] = ch;
391 input_str[1] = 0;
392 return 1;
393
394 }
395
396 while (ch != EOF)
397 {
398 /* Several chars always ends a word, and we need to save them.
399 */
400 if (ch == '\t' || ch == '{' || ch == '}' || ch == '\\')
401 {
402 need_unget = TRUE;
403 break;
404 }
405
406 /* A newline always ends a command word; we don't save it.
407 * A newline is ignored if this is not a command word.
408 */
409 if (ch == '\n')
410 {
411 if (is_control_word)
412 {
413 break;
414 }
415 ch = my_getchar(f);
416 continue;
417 }
418
419 /* A semicolon always ends a command word; we do save it.
420 * A semicolon never ends a regular word.
421 */
422 if (ch == ';')
423 {
424 if (is_control_word)
425 {
426 need_unget = TRUE;
427 break;
428 }
429 }
430
431 /* In this parser, a space character terminates
432 * any word, and if it does not follow a command,
433 * then it is a word in itself.
434 */
435 if (ch == ' ')
436 {
437 if (!is_control_word)
438 {
439 need_unget = TRUE;
440 }
441 break;
442 }
443
444 /* Identify a control word's numeric parameter.
445 */
446 if (is_control_word)
447 {
448 if (!has_numeric_param && (isdigit(ch) || ch == '-'))
449 {
450 has_numeric_param = TRUE;
451 }
452 else if (has_numeric_param && !isdigit(ch))
453 {
454 if (ch != ' ')
455 {
456 need_unget = TRUE;
457 }
458 break;
459 }
460 }
461
462 input_str[ix++] = ch;
463 if (ix == current_max_length)
464 {
465 if (!expand_word_buffer())
466 {
467 error_handler("Word too long");
468 }
469 }
470 ch = my_getchar(f);
471 }
472
473 if (need_unget)
474 {
475 my_unget_char(ch);
476 }
477
478 input_str[ix] = 0;
479
480 if (!memcmp(input_str, "\\bin", 4) && isdigit(input_str[4]))
481 {
482 my_skip(f, atoi(input_str + 4));
483 }
484
485 return ix;
486 }
487
488
489
490 /*========================================================================
491 * Name: word_read
492 * Purpose: This is the recursive metareader which pieces together the
493 * structure of Word objects.
494 * Args: Input file.
495 * Returns: Tree of Word objects.
496 *=======================================================================*/
497
498 Word *
word_read(FILE * f)499 word_read(FILE *f)
500 {
501 Word *prev_word = NULL;
502 Word *first_word = NULL;
503 Word *new_word = NULL; /* temp */
504
505 CHECK_PARAM_NOT_NULL(f);
506
507 do
508 {
509 if (!read_word(f))
510 {
511 return first_word;
512 }
513
514 if (input_str[0] == '{')
515 {
516 /* Process subwords */
517
518 /* Create a dummy word to point to a sublist */
519 new_word = word_new(NULL);
520 if (!new_word)
521 {
522 error_handler("Cannot allocate word");
523 }
524
525 /* Get the sublist */
526 new_word->child = word_read(f);
527
528 }
529 else if (input_str[0] == '}')
530 {
531 return first_word;
532 }
533 else
534 {
535 new_word = word_new(input_str);
536 }
537
538 if (prev_word)
539 {
540 prev_word->next = new_word;
541 }
542
543 if (!first_word)
544 {
545 first_word = new_word;
546 }
547
548 prev_word = new_word;
549
550 /* Free up the memory allocated by read_word. */
551 my_free(input_str);
552 input_str = NULL;
553 }
554 while (1);
555 }
556