1 /* Copyright 2010-2020 Free Software Foundation, Inc.
2
3 This program is free software: you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation, either version 3 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program. If not, see <http://www.gnu.org/licenses/>. */
15
16 #define _GNU_SOURCE
17
18 #include <config.h>
19
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <iconv.h>
24 #include <errno.h>
25 #include <sys/stat.h>
26
27 #include "errors.h"
28 #include "input.h"
29 #include "text.h"
30 #include "commands.h"
31
/* Kind of source an entry of the input stack reads from: an open file
   (IN_file) or a string held in memory (IN_text).  */
enum input_type { IN_file, IN_text };

/* Character encodings recognised by set_input_encoding.  The current
   value selects which iconv converter convert_to_utf8 uses.  */
enum character_encoding {
    ce_latin1,
    ce_latin2,
    ce_latin15,
    ce_utf8,
    ce_shiftjis,
    ce_koi8r,
    ce_koi8u
};

/* One entry of the input stack.  */
typedef struct {
    enum input_type type;

    FILE *file;       /* Stream that lines are read from when type is
                         IN_file; null for IN_text entries.  */
    LINE_NR line_nr;  /* File name, line number and macro name reported
                         for text coming from this source.  */

    char *text;  /* Input text to be parsed as Texinfo. */
    char *ptext; /* How far we are through 'text'.  Used to split 'text'
                    into lines. */
} INPUT;

/* Encoding that file input is converted from; changed by
   set_input_encoding.  As a zero-initialised global it starts out as
   ce_latin1, the first enumerator.  */
enum character_encoding input_encoding;
56
57 void
set_input_encoding(char * encoding)58 set_input_encoding (char *encoding)
59 {
60 if (!strcasecmp (encoding, "utf-8"))
61 input_encoding = ce_utf8;
62 else if (!strcmp (encoding, "iso-8859-1")
63 || !strcmp (encoding, "us-ascii"))
64 input_encoding = ce_latin1;
65 else if (!strcmp (encoding, "iso-8859-2"))
66 input_encoding = ce_latin2;
67 else if (!strcmp (encoding, "iso-8859-15"))
68 input_encoding = ce_latin15;
69 else if (!strcmp (encoding, "shift_jis"))
70 input_encoding = ce_shiftjis;
71 else if (!strcmp (encoding, "koi8-r"))
72 input_encoding = ce_koi8r;
73 else if (!strcmp (encoding, "koi8-u"))
74 input_encoding = ce_koi8u;
75 else
76 fprintf (stderr, "warning: unhandled encoding %s\n", encoding);
77 }
78
79
/* Stack of input sources.  The entry at index input_number - 1 is the
   one currently being read.  */
static INPUT *input_stack = 0;
int input_number = 0;  /* Number of entries in use on input_stack.  */
int input_space = 0;   /* Number of entries input_stack has room for.  */

/* Current filename and line number.  Used for reporting.  Updated by
   next_text each time it returns a line.  */
LINE_NR line_nr;
86
87 /* Change the line number of filename of the top input source. Used to
88 record a #line directive. If FILENAME is non-null, it should hbae
89 been returned from save_string. */
90 void
save_line_directive(int line_nr,char * filename)91 save_line_directive (int line_nr, char *filename)
92 {
93 INPUT *top = &input_stack[input_number - 1];
94 if (line_nr)
95 top->line_nr.line_nr = line_nr;
96 if (filename)
97 top->line_nr.file_name = filename;
98 }
99
100 /* Collect text from the input sources until a newline is found. This is used
101 instead of next_text when we need to be sure we get an entire line of
102 Texinfo input (for example as a line argument to a command), which might not
103 be the case if the input is the result of a macro expansion.
104
105 Return value should not be freed by caller, and becomes invalid after
106 a subsequent call. */
107 char *
new_line(void)108 new_line (void)
109 {
110 static TEXT t;
111 char *new = 0;
112
113 t.end = 0;
114
115 while (1)
116 {
117 new = next_text ();
118 if (!new)
119 break;
120 text_append (&t, new);
121 free (new);
122
123 if (t.text[t.end - 1] == '\n')
124 break;
125 }
126
127 if (t.end > 0)
128 return t.text;
129 else
130 return 0;
131 }
132
133
/* Conversion descriptors, one for each supported input encoding, all
   producing UTF-8.  They are opened on the first call of convert_to_utf8;
   the zero value they start with is taken to mean "not opened yet".  */
static iconv_t iconv_from_latin1;
static iconv_t iconv_from_latin2;
static iconv_t iconv_from_latin15;
static iconv_t iconv_from_shiftjis;
static iconv_t iconv_from_koi8u;
static iconv_t iconv_from_koi8r;
/* UTF-8 to UTF-8; used when the input is already declared to be UTF-8.  */
static iconv_t iconv_validate_utf8;
141
142 /* Run iconv using text buffer as output buffer. */
143 size_t
text_buffer_iconv(TEXT * buf,iconv_t iconv_state,ICONV_CONST char ** inbuf,size_t * inbytesleft)144 text_buffer_iconv (TEXT *buf, iconv_t iconv_state,
145 ICONV_CONST char **inbuf, size_t *inbytesleft)
146 {
147 size_t out_bytes_left;
148 char *outptr;
149 size_t iconv_ret;
150
151 outptr = buf->text + buf->end;
152 if (buf->end == buf->space - 1)
153 {
154 errno = E2BIG;
155 return (size_t) -1;
156 }
157 out_bytes_left = buf->space - buf->end - 1;
158 iconv_ret = iconv (iconv_state, inbuf, inbytesleft,
159 &outptr, &out_bytes_left);
160
161 buf->end = outptr - buf->text;
162
163 return iconv_ret;
164 }
165
166
167 /* Return conversion of S according to input_encoding. This function
168 frees S. */
169 static char *
convert_to_utf8(char * s)170 convert_to_utf8 (char *s)
171 {
172 iconv_t our_iconv = (iconv_t) -1;
173 static TEXT t;
174 ICONV_CONST char *inptr; size_t bytes_left;
175 size_t iconv_ret;
176 enum character_encoding enc;
177
178 /* Convert from @documentencoding to UTF-8.
179 It might be possible not to convert to UTF-8 and use an 8-bit encoding
180 throughout, but then we'd have to not set the UTF-8 flag on the Perl
181 strings in api.c. If multiple character encodings were used in a single
182 file, then we'd have to keep track of which strings needed the UTF-8 flag
183 and which didn't. */
184
185 /* Initialize conversions for the first time. */
186 if (iconv_validate_utf8 == (iconv_t) 0)
187 iconv_validate_utf8 = iconv_open ("UTF-8", "UTF-8");
188 if (iconv_from_latin1 == (iconv_t) 0)
189 iconv_from_latin1 = iconv_open ("UTF-8", "ISO-8859-1");
190 if (iconv_from_latin2 == (iconv_t) 0)
191 iconv_from_latin2 = iconv_open ("UTF-8", "ISO-8859-2");
192 if (iconv_from_latin15 == (iconv_t) 0)
193 iconv_from_latin15 = iconv_open ("UTF-8", "ISO-8859-15");
194 if (iconv_from_shiftjis == (iconv_t) 0)
195 iconv_from_shiftjis = iconv_open ("UTF-8", "SHIFT-JIS");
196 if (iconv_from_koi8r == (iconv_t) 0)
197 iconv_from_koi8r = iconv_open ("UTF-8", "KOI8-R");
198 if (iconv_from_koi8u == (iconv_t) 0)
199 iconv_from_koi8u = iconv_open ("UTF-8", "KOI8-U");
200
201 switch (input_encoding)
202 {
203 case ce_utf8:
204 our_iconv = iconv_validate_utf8;
205 break;
206 case ce_latin1:
207 our_iconv = iconv_from_latin1;
208 break;
209 case ce_latin2:
210 our_iconv = iconv_from_latin2;
211 break;
212 case ce_latin15:
213 our_iconv = iconv_from_latin15;
214 break;
215 case ce_shiftjis:
216 our_iconv = iconv_from_shiftjis;
217 break;
218 case ce_koi8r:
219 our_iconv = iconv_from_koi8r;
220 break;
221 case ce_koi8u:
222 our_iconv = iconv_from_koi8u;
223 break;
224 }
225
226 if (our_iconv == (iconv_t) -1)
227 {
228 /* In case the converter couldn't be initialised.
229 Danger: this will cause problems if the input is not in UTF-8 as
230 the Perl strings that are created are flagged as being UTF-8. */
231 return s;
232 }
233
234 t.end = 0;
235 inptr = s;
236 bytes_left = strlen (s);
237 text_alloc (&t, 10);
238
239 while (1)
240 {
241 iconv_ret = text_buffer_iconv (&t, our_iconv,
242 &inptr, &bytes_left);
243
244 /* Make sure libiconv flushes out the last converted character.
245 This is required when the conversion is stateful, in which
246 case libiconv might not output the last character, waiting to
247 see whether it should be combined with the next one. */
248 if (iconv_ret != (size_t) -1
249 && text_buffer_iconv (&t, our_iconv, 0, 0) != (size_t) -1)
250 /* Success: all of input converted. */
251 break;
252
253 if (bytes_left == 0)
254 break;
255
256 switch (errno)
257 {
258 case E2BIG:
259 text_alloc (&t, t.space + 20);
260 break;
261 case EILSEQ:
262 default:
263 fprintf(stderr, "%s:%d: encoding error at byte 0x%2x\n",
264 line_nr.file_name, line_nr.line_nr, *(unsigned char *)inptr);
265 inptr++; bytes_left--;
266 break;
267 }
268 }
269
270 free (s);
271 t.text[t.end] = '\0';
272 return strdup (t.text);
273 }
274
275 int
expanding_macro(char * macro)276 expanding_macro (char *macro)
277 {
278 int i;
279 for (i = 0; i < input_number; i++)
280 {
281 if (input_stack[i].line_nr.macro
282 && !strcmp (input_stack[i].line_nr.macro, macro))
283 {
284 return 1;
285 }
286 }
287 return 0;
288 }
289
/* Forward declaration: save_string is defined further down in this file
   but is called before that point.  */
char *save_string (char *string);
291
292 /* Return value to be freed by caller. Return null if we are out of input. */
293 char *
next_text(void)294 next_text (void)
295 {
296 ssize_t status;
297 char *line = 0;
298 size_t n;
299 FILE *input_file;
300
301 while (input_number > 0)
302 {
303 /* Check for pending input. */
304 INPUT *i = &input_stack[input_number - 1];
305
306 switch (i->type)
307 {
308 char *p, *new;
309 case IN_text:
310 if (!*i->ptext)
311 {
312 /* End of text reached. */
313 free (i->text);
314 break;
315 }
316 /* Split off a line of input. */
317 p = strchrnul (i->ptext, '\n');
318 new = strndup (i->ptext, p - i->ptext + 1);
319 if (*p)
320 i->ptext = p + 1;
321 else
322 i->ptext = p; /* The next time, we will pop the input source. */
323
324 if (!i->line_nr.macro)
325 i->line_nr.line_nr++;
326
327 line_nr = i->line_nr;
328
329 return new;
330
331 break;
332 case IN_file:
333 input_file = input_stack[input_number - 1].file;
334 status = getline (&line, &n, input_file);
335 if (status != -1)
336 {
337 char *comment;
338 if (feof (input_file))
339 {
340 /* Add a newline at the end of the file if one is missing. */
341 char *line2;
342 asprintf (&line2, "%s\n", line);
343 free (line);
344 line = line2;
345 }
346
347 /* Strip off a comment. */
348 comment = strchr (line, '\x7F');
349 if (comment)
350 *comment = '\0';
351
352 i->line_nr.line_nr++;
353 line_nr = i->line_nr;
354
355 return convert_to_utf8 (line);
356 }
357 free (line); line = 0;
358 break;
359 default:
360 fatal ("unknown input source type");
361 }
362
363 /* Top input source failed. Pop it and try the next one. */
364
365 if (input_stack[input_number - 1].type == IN_file)
366 {
367 FILE *file = input_stack[input_number - 1].file;
368
369 if (file != stdin)
370 {
371 if (fclose (input_stack[input_number - 1].file) == EOF)
372 fprintf (stderr, "error on closing %s: %s",
373 input_stack[input_number - 1].line_nr.file_name,
374 strerror (errno));
375 }
376 }
377
378 input_number--;
379 }
380 return 0;
381 }
382
383 void
input_push(char * text,char * macro,char * filename,int line_number)384 input_push (char *text, char *macro, char *filename, int line_number)
385 {
386 if (input_number == input_space)
387 {
388 input_space++; input_space *= 1.5;
389 input_stack = realloc (input_stack, input_space * sizeof (INPUT));
390 if (!input_stack)
391 fatal ("realloc failed");
392 }
393
394 input_stack[input_number].type = IN_text;
395 input_stack[input_number].file = 0;
396 input_stack[input_number].text = text;
397 input_stack[input_number].ptext = text;
398
399 if (!macro)
400 line_number--;
401 input_stack[input_number].line_nr.line_nr = line_number;
402 input_stack[input_number].line_nr.file_name = save_string (filename);
403 input_stack[input_number].line_nr.macro = save_string (macro);
404 input_number++;
405 }
406
/* For filenames and macro names, it is possible that they won't be referenced
   in the line number of any element.  It would be too much work to keep track,
   so just keep them all here, and free them all together at the end. */
static char **small_strings;        /* The saved strings.  */
static size_t small_strings_num;    /* How many strings are saved.  */
static size_t small_strings_space;  /* How many slots are allocated.  */
413
414 char *
save_string(char * string)415 save_string (char *string)
416 {
417 char *ret = string ? strdup (string) : 0;
418 if (ret)
419 {
420 if (small_strings_num == small_strings_space)
421 {
422 small_strings_space++;
423 small_strings_space += (small_strings_space >> 2);
424 small_strings = realloc (small_strings, small_strings_space
425 * sizeof (char *));
426 if (!small_strings)
427 fatal ("realloc failed");
428 }
429 small_strings[small_strings_num++] = ret;
430 }
431 return ret;
432 }
433
434 /* Called in reset_parser. */
435 void
free_small_strings(void)436 free_small_strings (void)
437 {
438 size_t i;
439 for (i = 0; i < small_strings_num; i++)
440 {
441 free (small_strings[i]);
442 }
443 small_strings_num = 0;
444 }
445
446
447 /* Store TEXT as a source for Texinfo content. TEXT should be a UTF-8
448 string. TEXT will be later free'd and must be allocated on the heap.
449 MACRO is the name of a macro that the text came from. */
450 void
input_push_text(char * text,char * macro)451 input_push_text (char *text, char *macro)
452 {
453 if (text)
454 {
455 char *filename = 0;
456 if (input_number > 0)
457 {
458 filename = input_stack[input_number - 1].line_nr.file_name;
459 }
460 input_push (text, macro, filename, line_nr.line_nr);
461 }
462 }
463
464 /* Used in tests - like input_push_text, but the lines from the text have
465 line numbers. */
466 void
input_push_text_with_line_nos(char * text,int starting)467 input_push_text_with_line_nos (char *text, int starting)
468 {
469 input_push (text, 0, 0, starting);
470 input_stack[input_number - 1].type = IN_text;
471 }
472
473 void
input_reset_input_stack(void)474 input_reset_input_stack (void)
475 {
476 int i;
477 for (i = 0; i < input_number; i++)
478 {
479 switch (input_stack[i].type)
480 {
481 case IN_file:
482 if (input_stack[i].file != stdin)
483 fclose (input_stack[i].file);
484 break;
485 case IN_text:
486 free (input_stack[i].text);
487 break;
488 }
489 }
490 input_number = 0;
491 }
492
493 int
top_file_index(void)494 top_file_index (void)
495 {
496 int i = input_number - 1;
497 while (i >= 0 && input_stack[i].type != IN_file)
498 i--;
499 return i;
500 }
501
502
/* Directories searched by locate_include_file, in the order they were
   added with add_include_directory.  */
static char **include_dirs;
static size_t include_dirs_number;  /* How many directories are listed.  */
static size_t include_dirs_space;   /* How many slots are allocated.  */
506
507 void
add_include_directory(char * filename)508 add_include_directory (char *filename)
509 {
510 int len;
511 if (include_dirs_number == include_dirs_space)
512 {
513 include_dirs = realloc (include_dirs,
514 sizeof (char *) * (include_dirs_space += 5));
515 }
516 filename = strdup (filename);
517 include_dirs[include_dirs_number++] = filename;
518 len = strlen (filename);
519 if (len > 0 && filename[len - 1] == '/')
520 filename[len - 1] = '\0';
521 }
522
523 /* Return value to be freed by caller. */
524 char *
locate_include_file(char * filename)525 locate_include_file (char *filename)
526 {
527 char *fullpath;
528 struct stat dummy;
529 int i, status;
530
531 /* Checks if filename is absolute or relative to current directory.
532 TODO: Could use macros in top-level config.h for this. */
533 /* TODO: The Perl code (in Common.pm, 'locate_include_file') handles
534 a volume in a path (like "A:"), possibly more general treatment
535 with File::Spec module. */
536 if (!memcmp (filename, "/", 1)
537 || !memcmp (filename, "../", 3)
538 || !memcmp (filename, "./", 2))
539 {
540 status = stat (filename, &dummy);
541 if (status == 0)
542 return strdup (filename);
543 }
544 else
545 {
546 for (i = 0; i < include_dirs_number; i++)
547 {
548 asprintf (&fullpath, "%s/%s", include_dirs[i], filename);
549 status = stat (fullpath, &dummy);
550 if (status == 0)
551 return fullpath;
552 free (fullpath);
553 }
554 }
555 return 0;
556 }
557
558 /* Try to open a file called FILENAME, looking for it in the list of include
559 directories. */
560 int
input_push_file(char * filename)561 input_push_file (char *filename)
562 {
563 FILE *stream;
564
565 stream = fopen (filename, "r");
566 if (!stream)
567 return errno;
568
569 if (input_number == input_space)
570 {
571 input_stack = realloc (input_stack, (input_space += 5) * sizeof (INPUT));
572 if (!input_stack)
573 fatal ("realloc failed");
574 }
575
576 /* Strip off a leading directory path. */
577 char *p, *q;
578 p = 0;
579 q = strchr (filename, '/');
580 while (q)
581 {
582 p = q;
583 q = strchr (q + 1, '/');
584 }
585 if (p)
586 filename = save_string (p+1);
587 else
588 filename = save_string (filename);
589
590 input_stack[input_number].type = IN_file;
591 input_stack[input_number].file = stream;
592 input_stack[input_number].line_nr.file_name = filename;
593 input_stack[input_number].line_nr.line_nr = 0;
594 input_stack[input_number].line_nr.macro = 0;
595 input_stack[input_number].text = 0;
596 input_stack[input_number].ptext = 0;
597 input_number++;
598
599 return 0;
600 }
601
602