xref: /openbsd/gnu/gcc/libcpp/lex.c (revision dd6081ec)
1 /* CPP Library - lexical analysis.
2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    Contributed by Per Bothner, 1994-95.
4    Based on CCCP program by Paul Rubin, June 1986
5    Adapted to ANSI C, Richard Stallman, Jan 1987
6    Broken out to separate file, Zack Weinberg, Mar 2000
7 
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26 
/* Spelling category stored in each token_spellings entry.  OP()
   table entries always use SPELL_OPERATOR; TK() entries choose one of
   the categories by name through their second argument (pasted onto
   the SPELL_ prefix).  NOTE(review): the exact meaning of each
   category is defined by the code that spells tokens, which is not
   part of this section -- confirm there.  */
27 enum spell_type
28 {
29   SPELL_OPERATOR = 0,
30   SPELL_IDENT,
31   SPELL_LITERAL,
32   SPELL_NONE
33 };
34 
/* One row of the token_spellings table: the spelling category of a
   token type together with a fixed string for it.  */
35 struct token_spelling
36 {
37   enum spell_type category;	/* How tokens of this type are spelled.  */
38   const unsigned char *name;	/* OP(): the operator text; TK(): the
				   stringified enumerator name.  */
39 };
40 
41 static const unsigned char *const digraph_spellings[] =
42 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
43 
/* Build the table of token spellings by expanding TTYPE_TABLE with
   OP and TK temporarily defined as initializer generators:
     OP (e, s) -> category SPELL_OPERATOR, name is the text S;
     TK (e, s) -> category SPELL_<S>, name is the enumerator E
		  turned into a string.
   Both helper macros are undefined again right after the table.  */
44 #define OP(e, s) { SPELL_OPERATOR, U s  },
45 #define TK(e, s) { SPELL_ ## s,    U #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49 
/* Accessors: look up the spelling category / name of TOKEN (a pointer
   to a token) using its type field as the table index.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 
/* Forward declarations of file-local helpers.  Each one is described
   at its definition.  */
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
59 			    unsigned int, enum cpp_ttype);
60 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
61 static int name_p (cpp_reader *, const cpp_string *);
62 static tokenrun *next_tokenrun (tokenrun *);
63 
/* NOTE(review): new_buff is not defined in this section of the file.  */
64 static _cpp_buff *new_buff (size_t);
65 
66 
67 /* Utility routine:
68 
69    Compares, the token TOKEN to the NUL-terminated string STRING.
70    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
71 int
cpp_ideq(const cpp_token * token,const char * string)72 cpp_ideq (const cpp_token *token, const char *string)
73 {
74   if (token->type != CPP_NAME)
75     return 0;
76 
77   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
78 }
79 
80 /* Record a note TYPE at byte POS into the current cleaned logical
81    line.  */
82 static void
add_line_note(cpp_buffer * buffer,const uchar * pos,unsigned int type)83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
84 {
85   if (buffer->notes_used == buffer->notes_cap)
86     {
87       buffer->notes_cap = buffer->notes_cap * 2 + 200;
88       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
89                                   buffer->notes_cap);
90     }
91 
92   buffer->notes[buffer->notes_used].pos = pos;
93   buffer->notes[buffer->notes_used].type = type;
94   buffer->notes_used++;
95 }
96 
97 /* Prepare the next logical line of pfile->buffer, starting at
   buffer->next_line, so that it contains no escaped newlines and
   (when the trigraphs option is enabled) no trigraphs.  This is a
   time-critical inner loop.

   The line is rewritten in place: S is the read pointer, D the write
   pointer, and text following a removed backslash-newline or a
   converted trigraph is copied down over it.  A line note is recorded
   for every escaped newline and every trigraph seen, so that
   diagnostics can be issued later.  On return the logical line is
   terminated by '\n', a sentinel note sits just past that terminator,
   buffer->cur and buffer->line_base point at the start of the line,
   and buffer->next_line points just past the consumed input.  */
99 void
_cpp_clean_line(cpp_reader * pfile)100 _cpp_clean_line (cpp_reader *pfile)
101 {
102   cpp_buffer *buffer;
103   const uchar *s;
104   uchar c, *d, *p;
105 
106   buffer = pfile->buffer;
  /* Forget the notes of the previous line and restart at next_line.  */
107   buffer->cur_note = buffer->notes_used = 0;
108   buffer->cur = buffer->line_base = buffer->next_line;
109   buffer->need_line = false;
  /* One before the line start: every loop below pre-increments S.  */
110   s = buffer->next_line - 1;
111 
112   if (!buffer->from_stage3)
113     {
114       /* Short circuit for the common case of an un-escaped line with
115 	 no trigraphs.  The primary win here is by not writing any
116 	 data back to memory until we have to.  */
117       for (;;)
118 	{
119 	  c = *++s;
120 	  if (c == '\n' || c == '\r')
121 	    {
	      /* D is where the final '\n' will be stored; the cast
		 drops const because the line is edited in place.  */
122 	      d = (uchar *) s;
123 
	      /* Terminator located at rlimit: nothing follows it.  */
124 	      if (s == buffer->rlimit)
125 		goto done;
126 
127 	      /* DOS line ending? */
128 	      if (c == '\r' && s[1] == '\n')
129 		s++;
130 
131 	      if (s == buffer->rlimit)
132 		goto done;
133 
134 	      /* check for escaped newline: step back over trailing
		 non-vertical whitespace and look for a backslash.  */
135 	      p = d;
136 	      while (p != buffer->next_line && is_nvspace (p[-1]))
137 		p--;
138 	      if (p == buffer->next_line || p[-1] != '\\')
139 		goto done;
140 
141 	      /* Have an escaped newline; process it and proceed to
142 		 the slow path.  The note type is ' ' when whitespace
		 separated the backslash from the newline (P != D),
		 '\\' otherwise.  */
143 	      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
	      /* Rewind D so that the next store overwrites the
		 backslash, and move next_line there: it is the lower
		 bound of the backward scans for later escapes.  */
144 	      d = p - 2;
145 	      buffer->next_line = p - 1;
146 	      break;
147 	    }
148 	  if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
149 	    {
150 	      /* Have a trigraph.  We may or may not have to convert
151 		 it.  Add a line note regardless, for -Wtrigraphs.  */
152 	      add_line_note (buffer, s, s[2]);
153 	      if (CPP_OPTION (pfile, trigraphs))
154 		{
155 		  /* We do, and that means we have to switch to the
156 		     slow path.  */
157 		  d = (uchar *) s;
158 		  *d = _cpp_trigraph_map[s[2]];
159 		  s += 2;
160 		  break;
161 		}
162 	    }
163 	}
164 
165 
      /* Slow path: S reads ahead while D writes behind it, copying
	 the remainder of the logical line down over the characters
	 that were removed.  */
166       for (;;)
167 	{
168 	  c = *++s;
169 	  *++d = c;
170 
171 	  if (c == '\n' || c == '\r')
172 	    {
173 		  /* Handle DOS line endings.  */
174 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
175 		s++;
176 	      if (s == buffer->rlimit)
177 		break;
178 
179 	      /* Escaped?  Same backward scan as above, applied to the
		 already-copied text ending at D.  */
180 	      p = d;
181 	      while (p != buffer->next_line && is_nvspace (p[-1]))
182 		p--;
183 	      if (p == buffer->next_line || p[-1] != '\\')
184 		break;
185 
186 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
187 	      d = p - 2;
188 	      buffer->next_line = p - 1;
189 	    }
190 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
191 	    {
192 	      /* Add a note regardless, for the benefit of -Wtrigraphs.
		 The note is attached to the output position D.  */
193 	      add_line_note (buffer, d, s[2]);
194 	      if (CPP_OPTION (pfile, trigraphs))
195 		{
		  /* Replace the '?' just copied and skip the other two
		     trigraph characters in the input.  */
196 		  *d = _cpp_trigraph_map[s[2]];
197 		  s += 2;
198 		}
199 	    }
200 	}
201     }
202   else
203     {
      /* from_stage3 buffers: only locate the line terminator; neither
	 escaped newlines nor trigraphs are looked for.  */
204       do
205 	s++;
206       while (*s != '\n' && *s != '\r');
207       d = (uchar *) s;
208 
209       /* Handle DOS line endings.  */
210       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
211 	s++;
212     }
213 
214  done:
  /* Terminate the cleaned line with '\n' whatever the input used.  */
215   *d = '\n';
216   /* A sentinel note that should never be processed.  */
217   add_line_note (buffer, d + 1, '\n');
218   buffer->next_line = s + 1;
219 }
220 
221 /* Return true if the trigraph indicated by NOTE should be warned
222    about in a comment.  */
223 static bool
warn_in_comment(cpp_reader * pfile,_cpp_line_note * note)224 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
225 {
226   const uchar *p;
227 
228   /* Within comments we don't warn about trigraphs, unless the
229      trigraph forms an escaped newline, as that may change
230      behavior.  */
231   if (note->type != '/')
232     return false;
233 
234   /* If -trigraphs, then this was an escaped newline iff the next note
235      is coincident.  */
236   if (CPP_OPTION (pfile, trigraphs))
237     return note[1].pos == note->pos;
238 
239   /* Otherwise, see if this forms an escaped newline.  */
240   p = note->pos + 3;
241   while (is_nvspace (*p))
242     p++;
243 
244   /* There might have been escaped newlines between the trigraph and the
245      newline we found.  Hence the position test.  */
246   return (*p == '\n' && p < note[1].pos);
247 }
248 
249 /* Process the notes created by add_line_note as far as the current
250    location.  */
251 void
_cpp_process_line_notes(cpp_reader * pfile,int in_comment)252 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
253 {
254   cpp_buffer *buffer = pfile->buffer;
255 
256   for (;;)
257     {
258       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
259       unsigned int col;
260 
261       if (note->pos > buffer->cur)
262 	break;
263 
264       buffer->cur_note++;
265       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
266 
267       if (note->type == '\\' || note->type == ' ')
268 	{
269 	  if (note->type == ' ' && !in_comment)
270 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
271 				 "backslash and newline separated by space");
272 
273 	  if (buffer->next_line > buffer->rlimit)
274 	    {
275 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
276 				   "backslash-newline at end of file");
277 	      /* Prevent "no newline at end of file" warning.  */
278 	      buffer->next_line = buffer->rlimit;
279 	    }
280 
281 	  buffer->line_base = note->pos;
282 	  CPP_INCREMENT_LINE (pfile, 0);
283 	}
284       else if (_cpp_trigraph_map[note->type])
285 	{
286 	  if (CPP_OPTION (pfile, warn_trigraphs)
287 	      && (!in_comment || warn_in_comment (pfile, note)))
288 	    {
289 	      if (CPP_OPTION (pfile, trigraphs))
290 		cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
291 				     "trigraph ??%c converted to %c",
292 				     note->type,
293 				     (int) _cpp_trigraph_map[note->type]);
294 	      else
295 		{
296 		  cpp_error_with_line
297 		    (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
298 		     "trigraph ??%c ignored, use -trigraphs to enable",
299 		     note->type);
300 		}
301 	    }
302 	}
303       else
304 	abort ();
305     }
306 }
307 
308 /* Skip a C-style block comment.  We find the end of the comment by
309    seeing if an asterisk is before every '/' we encounter.  Returns
310    nonzero if comment terminated by EOF, zero otherwise.
311 
312    Buffer->cur points to the initial asterisk of the comment.  */
313 bool
_cpp_skip_block_comment(cpp_reader * pfile)314 _cpp_skip_block_comment (cpp_reader *pfile)
315 {
316   cpp_buffer *buffer = pfile->buffer;
317   const uchar *cur = buffer->cur;
318   uchar c;
319 
320   cur++;
321   if (*cur == '/')
322     cur++;
323 
324   for (;;)
325     {
326       /* People like decorating comments with '*', so check for '/'
327 	 instead for efficiency.  */
328       c = *cur++;
329 
330       if (c == '/')
331 	{
332 	  if (cur[-2] == '*')
333 	    break;
334 
335 	  /* Warn about potential nested comments, but not if the '/'
336 	     comes immediately before the true comment delimiter.
337 	     Don't bother to get it right across escaped newlines.  */
338 	  if (CPP_OPTION (pfile, warn_comments)
339 	      && cur[0] == '*' && cur[1] != '/')
340 	    {
341 	      buffer->cur = cur;
342 	      cpp_error_with_line (pfile, CPP_DL_WARNING,
343 				   pfile->line_table->highest_line, CPP_BUF_COL (buffer),
344 				   "\"/*\" within comment");
345 	    }
346 	}
347       else if (c == '\n')
348 	{
349 	  unsigned int cols;
350 	  buffer->cur = cur - 1;
351 	  _cpp_process_line_notes (pfile, true);
352 	  if (buffer->next_line >= buffer->rlimit)
353 	    return true;
354 	  _cpp_clean_line (pfile);
355 
356 	  cols = buffer->next_line - buffer->line_base;
357 	  CPP_INCREMENT_LINE (pfile, cols);
358 
359 	  cur = buffer->cur;
360 	}
361     }
362 
363   buffer->cur = cur;
364   _cpp_process_line_notes (pfile, true);
365   return false;
366 }
367 
368 /* Skip a C++ line comment, leaving buffer->cur pointing to the
369    terminating newline.  Handles escaped newlines.  Returns nonzero
370    if a multiline comment.  */
371 static int
skip_line_comment(cpp_reader * pfile)372 skip_line_comment (cpp_reader *pfile)
373 {
374   cpp_buffer *buffer = pfile->buffer;
375   unsigned int orig_line = pfile->line_table->highest_line;
376 
377   while (*buffer->cur != '\n')
378     buffer->cur++;
379 
380   _cpp_process_line_notes (pfile, true);
381   return orig_line != pfile->line_table->highest_line;
382 }
383 
384 /* Skips whitespace, saving the next non-whitespace character.  */
385 static void
skip_whitespace(cpp_reader * pfile,cppchar_t c)386 skip_whitespace (cpp_reader *pfile, cppchar_t c)
387 {
388   cpp_buffer *buffer = pfile->buffer;
389   bool saw_NUL = false;
390 
391   do
392     {
393       /* Horizontal space always OK.  */
394       if (c == ' ' || c == '\t')
395 	;
396       /* Just \f \v or \0 left.  */
397       else if (c == '\0')
398 	saw_NUL = true;
399       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
400 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
401 			     CPP_BUF_COL (buffer),
402 			     "%s in preprocessing directive",
403 			     c == '\f' ? "form feed" : "vertical tab");
404 
405       c = *buffer->cur++;
406     }
407   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
408   while (is_nvspace (c));
409 
410   if (saw_NUL)
411     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
412 
413   buffer->cur--;
414 }
415 
416 /* See if the characters of a number token are valid in a name (no
417    '.', '+' or '-').  */
418 static int
name_p(cpp_reader * pfile,const cpp_string * string)419 name_p (cpp_reader *pfile, const cpp_string *string)
420 {
421   unsigned int i;
422 
423   for (i = 0; i < string->len; i++)
424     if (!is_idchar (string->text[i]))
425       return 0;
426 
427   return 1;
428 }
429 
430 /* After parsing an identifier or other sequence, produce a warning about
431    sequences not in NFC/NFKC.  */
432 static void
warn_about_normalization(cpp_reader * pfile,const cpp_token * token,const struct normalize_state * s)433 warn_about_normalization (cpp_reader *pfile,
434 			  const cpp_token *token,
435 			  const struct normalize_state *s)
436 {
437   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
438       && !pfile->state.skipping)
439     {
440       /* Make sure that the token is printed using UCNs, even
441 	 if we'd otherwise happily print UTF-8.  */
442       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
443       size_t sz;
444 
445       sz = cpp_spell_token (pfile, token, buf, false) - buf;
446       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
447 	cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
448 			     "`%.*s' is not in NFKC", (int) sz, buf);
449       else
450 	cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
451 			     "`%.*s' is not in NFC", (int) sz, buf);
452     }
453 }
454 
455 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
456    an identifier.  FIRST is TRUE if this starts an identifier.  */
457 static bool
forms_identifier_p(cpp_reader * pfile,int first,struct normalize_state * state)458 forms_identifier_p (cpp_reader *pfile, int first,
459 		    struct normalize_state *state)
460 {
461   cpp_buffer *buffer = pfile->buffer;
462 
463   if (*buffer->cur == '$')
464     {
465       if (!CPP_OPTION (pfile, dollars_in_ident))
466 	return false;
467 
468       buffer->cur++;
469       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
470 	{
471 	  CPP_OPTION (pfile, warn_dollars) = 0;
472 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
473 	}
474 
475       return true;
476     }
477 
478   /* Is this a syntactically valid UCN?  */
479   if (CPP_OPTION (pfile, extended_identifiers)
480       && *buffer->cur == '\\'
481       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
482     {
483       buffer->cur += 2;
484       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
485 			  state))
486 	return true;
487       buffer->cur -= 2;
488     }
489 
490   return false;
491 }
492 
493 /* Lex an identifier starting at BUFFER->CUR - 1.  */
494 static cpp_hashnode *
lex_identifier(cpp_reader * pfile,const uchar * base,bool starts_ucn,struct normalize_state * nst)495 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
496 		struct normalize_state *nst)
497 {
498   cpp_hashnode *result;
499   const uchar *cur;
500   unsigned int len;
501   unsigned int hash = HT_HASHSTEP (0, *base);
502 
503   cur = pfile->buffer->cur;
504   if (! starts_ucn)
505     while (ISIDNUM (*cur))
506       {
507 	hash = HT_HASHSTEP (hash, *cur);
508 	cur++;
509       }
510   pfile->buffer->cur = cur;
511   if (starts_ucn || forms_identifier_p (pfile, false, nst))
512     {
513       /* Slower version for identifiers containing UCNs (or $).  */
514       do {
515 	while (ISIDNUM (*pfile->buffer->cur))
516 	  {
517 	    pfile->buffer->cur++;
518 	    NORMALIZE_STATE_UPDATE_IDNUM (nst);
519 	  }
520       } while (forms_identifier_p (pfile, false, nst));
521       result = _cpp_interpret_identifier (pfile, base,
522 					  pfile->buffer->cur - base);
523     }
524   else
525     {
526       len = cur - base;
527       hash = HT_HASHFINISH (hash, len);
528 
529       result = (cpp_hashnode *)
530 	ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
531     }
532 
533   /* Rarely, identifiers require diagnostics when lexed.  */
534   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
535 			&& !pfile->state.skipping, 0))
536     {
537       /* It is allowed to poison the same identifier twice.  */
538       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
539 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
540 		   NODE_NAME (result));
541 
542       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
543 	 replacement list of a variadic macro.  */
544       if (result == pfile->spec_nodes.n__VA_ARGS__
545 	  && !pfile->state.va_args_ok)
546 	cpp_error (pfile, CPP_DL_PEDWARN,
547 		   "__VA_ARGS__ can only appear in the expansion"
548 		   " of a C99 variadic macro");
549     }
550 
551   return result;
552 }
553 
554 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
555 static void
lex_number(cpp_reader * pfile,cpp_string * number,struct normalize_state * nst)556 lex_number (cpp_reader *pfile, cpp_string *number,
557 	    struct normalize_state *nst)
558 {
559   const uchar *cur;
560   const uchar *base;
561   uchar *dest;
562 
563   base = pfile->buffer->cur - 1;
564   do
565     {
566       cur = pfile->buffer->cur;
567 
568       /* N.B. ISIDNUM does not include $.  */
569       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
570 	{
571 	  cur++;
572 	  NORMALIZE_STATE_UPDATE_IDNUM (nst);
573 	}
574 
575       pfile->buffer->cur = cur;
576     }
577   while (forms_identifier_p (pfile, false, nst));
578 
579   number->len = cur - base;
580   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
581   memcpy (dest, base, number->len);
582   dest[number->len] = '\0';
583   number->text = dest;
584 }
585 
586 /* Create a token of type TYPE with a literal spelling.  */
587 static void
create_literal(cpp_reader * pfile,cpp_token * token,const uchar * base,unsigned int len,enum cpp_ttype type)588 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
589 		unsigned int len, enum cpp_ttype type)
590 {
591   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
592 
593   memcpy (dest, base, len);
594   dest[len] = '\0';
595   token->type = type;
596   token->val.str.len = len;
597   token->val.str.text = dest;
598 }
599 
600 /* Lexes a string, character constant, or angle-bracketed header file
601    name.  The stored string contains the spelling, including opening
602    quote and leading any leading 'L'.  It returns the type of the
603    literal, or CPP_OTHER if it was not properly terminated.
604 
605    The spelling is NUL-terminated, but it is not guaranteed that this
606    is the first NUL since embedded NULs are preserved.  */
607 static void
lex_string(cpp_reader * pfile,cpp_token * token,const uchar * base)608 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
609 {
610   bool saw_NUL = false;
611   const uchar *cur;
612   cppchar_t terminator;
613   enum cpp_ttype type;
614 
615   cur = base;
616   terminator = *cur++;
617   if (terminator == 'L')
618     terminator = *cur++;
619   if (terminator == '\"')
620     type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
621   else if (terminator == '\'')
622     type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
623   else
624     terminator = '>', type = CPP_HEADER_NAME;
625 
626   for (;;)
627     {
628       cppchar_t c = *cur++;
629 
630       /* In #include-style directives, terminators are not escapable.  */
631       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
632 	cur++;
633       else if (c == terminator)
634 	break;
635       else if (c == '\n')
636 	{
637 	  cur--;
638 	  type = CPP_OTHER;
639 	  break;
640 	}
641       else if (c == '\0')
642 	saw_NUL = true;
643     }
644 
645   if (saw_NUL && !pfile->state.skipping)
646     cpp_error (pfile, CPP_DL_WARNING,
647 	       "null character(s) preserved in literal");
648 
649   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
650     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
651 	       (int) terminator);
652 
653   pfile->buffer->cur = cur;
654   create_literal (pfile, token, base, cur - base, type);
655 }
656 
657 /* The stored comment includes the comment start and any terminator.  */
658 static void
save_comment(cpp_reader * pfile,cpp_token * token,const unsigned char * from,cppchar_t type)659 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
660 	      cppchar_t type)
661 {
662   unsigned char *buffer;
663   unsigned int len, clen;
664 
665   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
666 
667   /* C++ comments probably (not definitely) have moved past a new
668      line, which we don't want to save in the comment.  */
669   if (is_vspace (pfile->buffer->cur[-1]))
670     len--;
671 
672   /* If we are currently in a directive, then we need to store all
673      C++ comments as C comments internally, and so we need to
674      allocate a little extra space in that case.
675 
676      Note that the only time we encounter a directive here is
677      when we are saving comments in a "#define".  */
678   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
679 
680   buffer = _cpp_unaligned_alloc (pfile, clen);
681 
682   token->type = CPP_COMMENT;
683   token->val.str.len = clen;
684   token->val.str.text = buffer;
685 
686   buffer[0] = '/';
687   memcpy (buffer + 1, from, len - 1);
688 
689   /* Finish conversion to a C comment, if necessary.  */
690   if (pfile->state.in_directive && type == '/')
691     {
692       buffer[1] = '*';
693       buffer[clen - 2] = '*';
694       buffer[clen - 1] = '/';
695     }
696 }
697 
698 /* Allocate COUNT tokens for RUN.  */
699 void
_cpp_init_tokenrun(tokenrun * run,unsigned int count)700 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
701 {
702   run->base = XNEWVEC (cpp_token, count);
703   run->limit = run->base + count;
704   run->next = NULL;
705 }
706 
707 /* Returns the next tokenrun, or creates one if there is none.  */
708 static tokenrun *
next_tokenrun(tokenrun * run)709 next_tokenrun (tokenrun *run)
710 {
711   if (run->next == NULL)
712     {
713       run->next = XNEW (tokenrun);
714       run->next->prev = run;
715       _cpp_init_tokenrun (run->next, 250);
716     }
717 
718   return run->next;
719 }
720 
721 /* Allocate a single token that is invalidated at the same time as the
722    rest of the tokens on the line.  Has its line and col set to the
723    same as the last lexed token, so that diagnostics appear in the
724    right place.  */
725 cpp_token *
_cpp_temp_token(cpp_reader * pfile)726 _cpp_temp_token (cpp_reader *pfile)
727 {
728   cpp_token *old, *result;
729 
730   old = pfile->cur_token - 1;
731   if (pfile->cur_token == pfile->cur_run->limit)
732     {
733       pfile->cur_run = next_tokenrun (pfile->cur_run);
734       pfile->cur_token = pfile->cur_run->base;
735     }
736 
737   result = pfile->cur_token++;
738   result->src_loc = old->src_loc;
739   return result;
740 }
741 
742 /* Lex a token into RESULT (external interface).  Takes care of issues
743    like directive handling, token lookahead, multiple include
744    optimization and skipping.  */
745 const cpp_token *
_cpp_lex_token(cpp_reader * pfile)746 _cpp_lex_token (cpp_reader *pfile)
747 {
748   cpp_token *result;
749 
750   for (;;)
751     {
752       if (pfile->cur_token == pfile->cur_run->limit)
753 	{
754 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
755 	  pfile->cur_token = pfile->cur_run->base;
756 	}
757 
758       if (pfile->lookaheads)
759 	{
760 	  pfile->lookaheads--;
761 	  result = pfile->cur_token++;
762 	}
763       else
764 	result = _cpp_lex_direct (pfile);
765 
766       if (result->flags & BOL)
767 	{
768 	  /* Is this a directive.  If _cpp_handle_directive returns
769 	     false, it is an assembler #.  */
770 	  if (result->type == CPP_HASH
771 	      /* 6.10.3 p 11: Directives in a list of macro arguments
772 		 gives undefined behavior.  This implementation
773 		 handles the directive as normal.  */
774 	      && pfile->state.parsing_args != 1)
775 	    {
776 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
777 		{
778 		  if (pfile->directive_result.type == CPP_PADDING)
779 		    continue;
780 		  result = &pfile->directive_result;
781 		}
782 	    }
783 	  else if (pfile->state.in_deferred_pragma)
784 	    result = &pfile->directive_result;
785 
786 	  if (pfile->cb.line_change && !pfile->state.skipping)
787 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
788 	}
789 
790       /* We don't skip tokens in directives.  */
791       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
792 	break;
793 
794       /* Outside a directive, invalidate controlling macros.  At file
795 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
796 	 get here and MI optimization works.  */
797       pfile->mi_valid = false;
798 
799       if (!pfile->state.skipping || result->type == CPP_EOF)
800 	break;
801     }
802 
803   return result;
804 }
805 
806 /* Returns true if a fresh line has been loaded.  */
807 bool
_cpp_get_fresh_line(cpp_reader * pfile)808 _cpp_get_fresh_line (cpp_reader *pfile)
809 {
810   int return_at_eof;
811 
812   /* We can't get a new line until we leave the current directive.  */
813   if (pfile->state.in_directive)
814     return false;
815 
816   for (;;)
817     {
818       cpp_buffer *buffer = pfile->buffer;
819 
820       if (!buffer->need_line)
821 	return true;
822 
823       if (buffer->next_line < buffer->rlimit)
824 	{
825 	  _cpp_clean_line (pfile);
826 	  return true;
827 	}
828 
829       /* First, get out of parsing arguments state.  */
830       if (pfile->state.parsing_args)
831 	return false;
832 
833       /* End of buffer.  Non-empty files should end in a newline.  */
834       if (buffer->buf != buffer->rlimit
835 	  && buffer->next_line > buffer->rlimit
836 	  && !buffer->from_stage3)
837 	{
838 	  /* Only warn once.  */
839 	  buffer->next_line = buffer->rlimit;
840 	}
841 
842       return_at_eof = buffer->return_at_eof;
843       _cpp_pop_buffer (pfile);
844       if (pfile->buffer == NULL || return_at_eof)
845 	return false;
846     }
847 }
848 
/* Helper for lexing two-character operators.  Sets result->type to
   ELSE_TYPE; then, if the next input character equals CHAR, consumes
   it and sets result->type to THEN_TYPE instead.  Expects variables
   named `buffer' and `result' to be in scope at the point of use.  */
849 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
850   do							\
851     {							\
852       result->type = ELSE_TYPE;				\
853       if (*buffer->cur == CHAR)				\
854 	buffer->cur++, result->type = THEN_TYPE;	\
855     }							\
856   while (0)
857 
858 /* Lex a token into pfile->cur_token, which is also incremented, to
859    get diagnostics pointing to the correct location.
860 
861    Does not handle issues such as token lookahead, multiple-include
862    optimization, directives, skipping etc.  This function is only
863    suitable for use by _cpp_lex_token, and in special cases like
864    lex_expansion_token which doesn't care for any of these issues.
865 
866    When meeting a newline, returns CPP_EOF if parsing a directive,
867    otherwise returns to the start of the token buffer if permissible.
868    Returns the location of the lexed token.  */
869 cpp_token *
_cpp_lex_direct(cpp_reader * pfile)870 _cpp_lex_direct (cpp_reader *pfile)
871 {
872   cppchar_t c;
873   cpp_buffer *buffer;
874   const unsigned char *comment_start;
875   cpp_token *result = pfile->cur_token++;
876 
877  fresh_line:
878   result->flags = 0;
879   buffer = pfile->buffer;
880   if (buffer->need_line)
881     {
882       if (pfile->state.in_deferred_pragma)
883 	{
884 	  result->type = CPP_PRAGMA_EOL;
885 	  pfile->state.in_deferred_pragma = false;
886 	  if (!pfile->state.pragma_allow_expansion)
887 	    pfile->state.prevent_expansion--;
888 	  return result;
889 	}
890       if (!_cpp_get_fresh_line (pfile))
891 	{
892 	  result->type = CPP_EOF;
893 	  if (!pfile->state.in_directive)
894 	    {
895 	      /* Tell the compiler the line number of the EOF token.  */
896 	      result->src_loc = pfile->line_table->highest_line;
897 	      result->flags = BOL;
898 	    }
899 	  return result;
900 	}
901       if (!pfile->keep_tokens)
902 	{
903 	  pfile->cur_run = &pfile->base_run;
904 	  result = pfile->base_run.base;
905 	  pfile->cur_token = result + 1;
906 	}
907       result->flags = BOL;
908       if (pfile->state.parsing_args == 2)
909 	result->flags |= PREV_WHITE;
910     }
911   buffer = pfile->buffer;
912  update_tokens_line:
913   result->src_loc = pfile->line_table->highest_line;
914 
915  skipped_white:
916   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
917       && !pfile->overlaid_buffer)
918     {
919       _cpp_process_line_notes (pfile, false);
920       result->src_loc = pfile->line_table->highest_line;
921     }
922   c = *buffer->cur++;
923 
924   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
925 			       CPP_BUF_COLUMN (buffer, buffer->cur));
926 
927   switch (c)
928     {
929     case ' ': case '\t': case '\f': case '\v': case '\0':
930       result->flags |= PREV_WHITE;
931       skip_whitespace (pfile, c);
932       goto skipped_white;
933 
934     case '\n':
935       if (buffer->cur < buffer->rlimit)
936 	CPP_INCREMENT_LINE (pfile, 0);
937       buffer->need_line = true;
938       goto fresh_line;
939 
940     case '0': case '1': case '2': case '3': case '4':
941     case '5': case '6': case '7': case '8': case '9':
942       {
943 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
944 	result->type = CPP_NUMBER;
945 	lex_number (pfile, &result->val.str, &nst);
946 	warn_about_normalization (pfile, result, &nst);
947 	break;
948       }
949 
950     case 'L':
951       /* 'L' may introduce wide characters or strings.  */
952       if (*buffer->cur == '\'' || *buffer->cur == '"')
953 	{
954 	  lex_string (pfile, result, buffer->cur - 1);
955 	  break;
956 	}
957       /* Fall through.  */
958 
959     case '_':
960     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
961     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
962     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
963     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
964     case 'y': case 'z':
965     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
966     case 'G': case 'H': case 'I': case 'J': case 'K':
967     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
968     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
969     case 'Y': case 'Z':
970       result->type = CPP_NAME;
971       {
972 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
973 	result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
974 					   &nst);
975 	warn_about_normalization (pfile, result, &nst);
976       }
977 
978       /* Convert named operators to their proper types.  */
979       if (result->val.node->flags & NODE_OPERATOR)
980 	{
981 	  result->flags |= NAMED_OP;
982 	  result->type = (enum cpp_ttype) result->val.node->directive_index;
983 	}
984       break;
985 
986     case '\'':
987     case '"':
988       lex_string (pfile, result, buffer->cur - 1);
989       break;
990 
991     case '/':
992       /* A potential block or line comment.  */
993       comment_start = buffer->cur;
994       c = *buffer->cur;
995 
996       if (c == '*')
997 	{
998 	  if (_cpp_skip_block_comment (pfile))
999 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
1000 	}
1001       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
1002 			    || cpp_in_system_header (pfile)))
1003 	{
1004 	  /* Warn about comments only if pedantically GNUC89, and not
1005 	     in system headers.  */
1006 	  if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
1007 	      && ! buffer->warned_cplusplus_comments)
1008 	    {
1009 	      cpp_error (pfile, CPP_DL_PEDWARN,
1010 			 "C++ style comments are not allowed in ISO C90");
1011 	      cpp_error (pfile, CPP_DL_PEDWARN,
1012 			 "(this will be reported only once per input file)");
1013 	      buffer->warned_cplusplus_comments = 1;
1014 	    }
1015 
1016 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
1017 	    cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
1018 	}
1019       else if (c == '=')
1020 	{
1021 	  buffer->cur++;
1022 	  result->type = CPP_DIV_EQ;
1023 	  break;
1024 	}
1025       else
1026 	{
1027 	  result->type = CPP_DIV;
1028 	  break;
1029 	}
1030 
1031       if (!pfile->state.save_comments)
1032 	{
1033 	  result->flags |= PREV_WHITE;
1034 	  goto update_tokens_line;
1035 	}
1036 
1037       /* Save the comment as a token in its own right.  */
1038       save_comment (pfile, result, comment_start, c);
1039       break;
1040 
1041     case '<':
1042       if (pfile->state.angled_headers)
1043 	{
1044 	  lex_string (pfile, result, buffer->cur - 1);
1045 	  break;
1046 	}
1047 
1048       result->type = CPP_LESS;
1049       if (*buffer->cur == '=')
1050 	buffer->cur++, result->type = CPP_LESS_EQ;
1051       else if (*buffer->cur == '<')
1052 	{
1053 	  buffer->cur++;
1054 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
1055 	}
1056       else if (CPP_OPTION (pfile, digraphs))
1057 	{
1058 	  if (*buffer->cur == ':')
1059 	    {
1060 	      buffer->cur++;
1061 	      result->flags |= DIGRAPH;
1062 	      result->type = CPP_OPEN_SQUARE;
1063 	    }
1064 	  else if (*buffer->cur == '%')
1065 	    {
1066 	      buffer->cur++;
1067 	      result->flags |= DIGRAPH;
1068 	      result->type = CPP_OPEN_BRACE;
1069 	    }
1070 	}
1071       break;
1072 
1073     case '>':
1074       result->type = CPP_GREATER;
1075       if (*buffer->cur == '=')
1076 	buffer->cur++, result->type = CPP_GREATER_EQ;
1077       else if (*buffer->cur == '>')
1078 	{
1079 	  buffer->cur++;
1080 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
1081 	}
1082       break;
1083 
1084     case '%':
1085       result->type = CPP_MOD;
1086       if (*buffer->cur == '=')
1087 	buffer->cur++, result->type = CPP_MOD_EQ;
1088       else if (CPP_OPTION (pfile, digraphs))
1089 	{
1090 	  if (*buffer->cur == ':')
1091 	    {
1092 	      buffer->cur++;
1093 	      result->flags |= DIGRAPH;
1094 	      result->type = CPP_HASH;
1095 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
1096 		buffer->cur += 2, result->type = CPP_PASTE;
1097 	    }
1098 	  else if (*buffer->cur == '>')
1099 	    {
1100 	      buffer->cur++;
1101 	      result->flags |= DIGRAPH;
1102 	      result->type = CPP_CLOSE_BRACE;
1103 	    }
1104 	}
1105       break;
1106 
1107     case '.':
1108       result->type = CPP_DOT;
1109       if (ISDIGIT (*buffer->cur))
1110 	{
1111 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1112 	  result->type = CPP_NUMBER;
1113 	  lex_number (pfile, &result->val.str, &nst);
1114 	  warn_about_normalization (pfile, result, &nst);
1115 	}
1116       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
1117 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
1118       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1119 	buffer->cur++, result->type = CPP_DOT_STAR;
1120       break;
1121 
1122     case '+':
1123       result->type = CPP_PLUS;
1124       if (*buffer->cur == '+')
1125 	buffer->cur++, result->type = CPP_PLUS_PLUS;
1126       else if (*buffer->cur == '=')
1127 	buffer->cur++, result->type = CPP_PLUS_EQ;
1128       break;
1129 
1130     case '-':
1131       result->type = CPP_MINUS;
1132       if (*buffer->cur == '>')
1133 	{
1134 	  buffer->cur++;
1135 	  result->type = CPP_DEREF;
1136 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
1137 	    buffer->cur++, result->type = CPP_DEREF_STAR;
1138 	}
1139       else if (*buffer->cur == '-')
1140 	buffer->cur++, result->type = CPP_MINUS_MINUS;
1141       else if (*buffer->cur == '=')
1142 	buffer->cur++, result->type = CPP_MINUS_EQ;
1143       break;
1144 
1145     case '&':
1146       result->type = CPP_AND;
1147       if (*buffer->cur == '&')
1148 	buffer->cur++, result->type = CPP_AND_AND;
1149       else if (*buffer->cur == '=')
1150 	buffer->cur++, result->type = CPP_AND_EQ;
1151       break;
1152 
1153     case '|':
1154       result->type = CPP_OR;
1155       if (*buffer->cur == '|')
1156 	buffer->cur++, result->type = CPP_OR_OR;
1157       else if (*buffer->cur == '=')
1158 	buffer->cur++, result->type = CPP_OR_EQ;
1159       break;
1160 
1161     case ':':
1162       result->type = CPP_COLON;
1163       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
1164 	buffer->cur++, result->type = CPP_SCOPE;
1165       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
1166 	{
1167 	  buffer->cur++;
1168 	  result->flags |= DIGRAPH;
1169 	  result->type = CPP_CLOSE_SQUARE;
1170 	}
1171       break;
1172 
1173     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
1174     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
1175     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
1176     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
1177     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;
1178 
1179     case '?': result->type = CPP_QUERY; break;
1180     case '~': result->type = CPP_COMPL; break;
1181     case ',': result->type = CPP_COMMA; break;
1182     case '(': result->type = CPP_OPEN_PAREN; break;
1183     case ')': result->type = CPP_CLOSE_PAREN; break;
1184     case '[': result->type = CPP_OPEN_SQUARE; break;
1185     case ']': result->type = CPP_CLOSE_SQUARE; break;
1186     case '{': result->type = CPP_OPEN_BRACE; break;
1187     case '}': result->type = CPP_CLOSE_BRACE; break;
1188     case ';': result->type = CPP_SEMICOLON; break;
1189 
1190       /* @ is a punctuator in Objective-C.  */
1191     case '@': result->type = CPP_ATSIGN; break;
1192 
1193     case '$':
1194     case '\\':
1195       {
1196 	const uchar *base = --buffer->cur;
1197 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1198 
1199 	if (forms_identifier_p (pfile, true, &nst))
1200 	  {
1201 	    result->type = CPP_NAME;
1202 	    result->val.node = lex_identifier (pfile, base, true, &nst);
1203 	    warn_about_normalization (pfile, result, &nst);
1204 	    break;
1205 	  }
1206 	buffer->cur++;
1207       }
1208 
1209     default:
1210       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
1211       break;
1212     }
1213 
1214   return result;
1215 }
1216 
1217 /* An upper bound on the number of bytes needed to spell TOKEN.
1218    Does not include preceding whitespace.  */
1219 unsigned int
cpp_token_len(const cpp_token * token)1220 cpp_token_len (const cpp_token *token)
1221 {
1222   unsigned int len;
1223 
1224   switch (TOKEN_SPELL (token))
1225     {
1226     default:		len = 4;				break;
1227     case SPELL_LITERAL:	len = token->val.str.len;		break;
1228     case SPELL_IDENT:	len = NODE_LEN (token->val.node) * 10;	break;
1229     }
1230 
1231   return len;
1232 }
1233 
/* Decode the single UTF-8 sequence that starts at NAME and store the
   equivalent "\UXXXXXXXX" escape (a backslash, 'U' and eight lowercase
   hexadecimal digits: always exactly ten bytes, no NUL) at BUFFER.
   Returns the number of bytes of NAME the sequence occupied.  Aborts
   if a continuation byte does not have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned int lead_bits;
  int seq_len = 0;
  int consumed;
  int digit;

  /* The count of leading one bits in the first byte gives the length
     of the whole sequence.  */
  for (lead_bits = *name; lead_bits & 0x80; lead_bits <<= 1)
    seq_len++;

  /* Whatever lies below the length marker is the top of the payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes its low six bits.  */
  for (consumed = 1; consumed < seq_len; consumed++)
    {
      unsigned char cont = *++name;

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (28 - 4 * digit)) & 0xF];

  return seq_len;
}
1267 
1268 
1269 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1270    already contain the enough space to hold the token's spelling.
1271    Returns a pointer to the character after the last character written.
1272    FORSTRING is true if this is to be the spelling after translation
1273    phase 1 (this is different for UCNs).
1274    FIXME: Would be nice if we didn't need the PFILE argument.  */
1275 unsigned char *
cpp_spell_token(cpp_reader * pfile,const cpp_token * token,unsigned char * buffer,bool forstring)1276 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1277 		 unsigned char *buffer, bool forstring)
1278 {
1279   switch (TOKEN_SPELL (token))
1280     {
1281     case SPELL_OPERATOR:
1282       {
1283 	const unsigned char *spelling;
1284 	unsigned char c;
1285 
1286 	if (token->flags & DIGRAPH)
1287 	  spelling
1288 	    = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1289 	else if (token->flags & NAMED_OP)
1290 	  goto spell_ident;
1291 	else
1292 	  spelling = TOKEN_NAME (token);
1293 
1294 	while ((c = *spelling++) != '\0')
1295 	  *buffer++ = c;
1296       }
1297       break;
1298 
1299     spell_ident:
1300     case SPELL_IDENT:
1301       if (forstring)
1302 	{
1303 	  memcpy (buffer, NODE_NAME (token->val.node),
1304 		  NODE_LEN (token->val.node));
1305 	  buffer += NODE_LEN (token->val.node);
1306 	}
1307       else
1308 	{
1309 	  size_t i;
1310 	  const unsigned char * name = NODE_NAME (token->val.node);
1311 
1312 	  for (i = 0; i < NODE_LEN (token->val.node); i++)
1313 	    if (name[i] & ~0x7F)
1314 	      {
1315 		i += utf8_to_ucn (buffer, name + i) - 1;
1316 		buffer += 10;
1317 	      }
1318 	    else
1319 	      *buffer++ = NODE_NAME (token->val.node)[i];
1320 	}
1321       break;
1322 
1323     case SPELL_LITERAL:
1324       memcpy (buffer, token->val.str.text, token->val.str.len);
1325       buffer += token->val.str.len;
1326       break;
1327 
1328     case SPELL_NONE:
1329       cpp_error (pfile, CPP_DL_ICE,
1330 		 "unspellable token %s", TOKEN_NAME (token));
1331       break;
1332     }
1333 
1334   return buffer;
1335 }
1336 
1337 /* Returns TOKEN spelt as a null-terminated string.  The string is
1338    freed when the reader is destroyed.  Useful for diagnostics.  */
1339 unsigned char *
cpp_token_as_text(cpp_reader * pfile,const cpp_token * token)1340 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1341 {
1342   unsigned int len = cpp_token_len (token) + 1;
1343   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1344 
1345   end = cpp_spell_token (pfile, token, start, false);
1346   end[0] = '\0';
1347 
1348   return start;
1349 }
1350 
1351 /* Used by C front ends, which really should move to using
1352    cpp_token_as_text.  */
1353 const char *
cpp_type2name(enum cpp_ttype type)1354 cpp_type2name (enum cpp_ttype type)
1355 {
1356   return (const char *) token_spellings[type].name;
1357 }
1358 
1359 /* Writes the spelling of token to FP, without any preceding space.
1360    Separated from cpp_spell_token for efficiency - to avoid stdio
1361    double-buffering.  */
1362 void
cpp_output_token(const cpp_token * token,FILE * fp)1363 cpp_output_token (const cpp_token *token, FILE *fp)
1364 {
1365   switch (TOKEN_SPELL (token))
1366     {
1367     case SPELL_OPERATOR:
1368       {
1369 	const unsigned char *spelling;
1370 	int c;
1371 
1372 	if (token->flags & DIGRAPH)
1373 	  spelling
1374 	    = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1375 	else if (token->flags & NAMED_OP)
1376 	  goto spell_ident;
1377 	else
1378 	  spelling = TOKEN_NAME (token);
1379 
1380 	c = *spelling;
1381 	do
1382 	  putc (c, fp);
1383 	while ((c = *++spelling) != '\0');
1384       }
1385       break;
1386 
1387     spell_ident:
1388     case SPELL_IDENT:
1389       {
1390 	size_t i;
1391 	const unsigned char * name = NODE_NAME (token->val.node);
1392 
1393 	for (i = 0; i < NODE_LEN (token->val.node); i++)
1394 	  if (name[i] & ~0x7F)
1395 	    {
1396 	      unsigned char buffer[10];
1397 	      i += utf8_to_ucn (buffer, name + i) - 1;
1398 	      fwrite (buffer, 1, 10, fp);
1399 	    }
1400 	  else
1401 	    fputc (NODE_NAME (token->val.node)[i], fp);
1402       }
1403       break;
1404 
1405     case SPELL_LITERAL:
1406       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1407       break;
1408 
1409     case SPELL_NONE:
1410       /* An error, most probably.  */
1411       break;
1412     }
1413 }
1414 
1415 /* Compare two tokens.  */
1416 int
_cpp_equiv_tokens(const cpp_token * a,const cpp_token * b)1417 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1418 {
1419   if (a->type == b->type && a->flags == b->flags)
1420     switch (TOKEN_SPELL (a))
1421       {
1422       default:			/* Keep compiler happy.  */
1423       case SPELL_OPERATOR:
1424 	return 1;
1425       case SPELL_NONE:
1426 	return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1427       case SPELL_IDENT:
1428 	return a->val.node == b->val.node;
1429       case SPELL_LITERAL:
1430 	return (a->val.str.len == b->val.str.len
1431 		&& !memcmp (a->val.str.text, b->val.str.text,
1432 			    a->val.str.len));
1433       }
1434 
1435   return 0;
1436 }
1437 
1438 /* Returns nonzero if a space should be inserted to avoid an
1439    accidental token paste for output.  For simplicity, it is
1440    conservative, and occasionally advises a space where one is not
1441    needed, e.g. "." and ".2".  */
1442 int
cpp_avoid_paste(cpp_reader * pfile,const cpp_token * token1,const cpp_token * token2)1443 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1444 		 const cpp_token *token2)
1445 {
1446   enum cpp_ttype a = token1->type, b = token2->type;
1447   cppchar_t c;
1448 
1449   if (token1->flags & NAMED_OP)
1450     a = CPP_NAME;
1451   if (token2->flags & NAMED_OP)
1452     b = CPP_NAME;
1453 
1454   c = EOF;
1455   if (token2->flags & DIGRAPH)
1456     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1457   else if (token_spellings[b].category == SPELL_OPERATOR)
1458     c = token_spellings[b].name[0];
1459 
1460   /* Quickly get everything that can paste with an '='.  */
1461   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1462     return 1;
1463 
1464   switch (a)
1465     {
1466     case CPP_GREATER:	return c == '>';
1467     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
1468     case CPP_PLUS:	return c == '+';
1469     case CPP_MINUS:	return c == '-' || c == '>';
1470     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
1471     case CPP_MOD:	return c == ':' || c == '>';
1472     case CPP_AND:	return c == '&';
1473     case CPP_OR:	return c == '|';
1474     case CPP_COLON:	return c == ':' || c == '>';
1475     case CPP_DEREF:	return c == '*';
1476     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
1477     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
1478     case CPP_NAME:	return ((b == CPP_NUMBER
1479 				 && name_p (pfile, &token2->val.str))
1480 				|| b == CPP_NAME
1481 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
1482     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
1483 				|| c == '.' || c == '+' || c == '-');
1484 				      /* UCNs */
1485     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
1486 				 && b == CPP_NAME)
1487 				|| (CPP_OPTION (pfile, objc)
1488 				    && token1->val.str.text[0] == '@'
1489 				    && (b == CPP_NAME || b == CPP_STRING)));
1490     default:		break;
1491     }
1492 
1493   return 0;
1494 }
1495 
1496 /* Output all the remaining tokens on the current line, and a newline
1497    character, to FP.  Leading whitespace is removed.  If there are
1498    macros, special token padding is not performed.  */
1499 void
cpp_output_line(cpp_reader * pfile,FILE * fp)1500 cpp_output_line (cpp_reader *pfile, FILE *fp)
1501 {
1502   const cpp_token *token;
1503 
1504   token = cpp_get_token (pfile);
1505   while (token->type != CPP_EOF)
1506     {
1507       cpp_output_token (token, fp);
1508       token = cpp_get_token (pfile);
1509       if (token->flags & PREV_WHITE)
1510 	putc (' ', fp);
1511     }
1512 
1513   putc ('\n', fp);
1514 }
1515 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the room left in BUFF.
   Every macro argument is parenthesized in the expansion so that an
   expression argument cannot re-associate with the surrounding
   arithmetic.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1530 
1531 /* Create a new allocation buffer.  Place the control block at the end
1532    of the buffer, so that buffer overflows will cause immediate chaos.  */
1533 static _cpp_buff *
new_buff(size_t len)1534 new_buff (size_t len)
1535 {
1536   _cpp_buff *result;
1537   unsigned char *base;
1538 
1539   if (len < MIN_BUFF_SIZE)
1540     len = MIN_BUFF_SIZE;
1541   len = CPP_ALIGN (len);
1542 
1543   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1544   result = (_cpp_buff *) (base + len);
1545   result->base = base;
1546   result->cur = base;
1547   result->limit = base + len;
1548   result->next = NULL;
1549   return result;
1550 }
1551 
1552 /* Place a chain of unwanted allocation buffers on the free list.  */
1553 void
_cpp_release_buff(cpp_reader * pfile,_cpp_buff * buff)1554 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1555 {
1556   _cpp_buff *end = buff;
1557 
1558   while (end->next)
1559     end = end->next;
1560   end->next = pfile->free_buffs;
1561   pfile->free_buffs = buff;
1562 }
1563 
1564 /* Return a free buffer of size at least MIN_SIZE.  */
1565 _cpp_buff *
_cpp_get_buff(cpp_reader * pfile,size_t min_size)1566 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1567 {
1568   _cpp_buff *result, **p;
1569 
1570   for (p = &pfile->free_buffs;; p = &(*p)->next)
1571     {
1572       size_t size;
1573 
1574       if (*p == NULL)
1575 	return new_buff (min_size);
1576       result = *p;
1577       size = result->limit - result->base;
1578       /* Return a buffer that's big enough, but don't waste one that's
1579          way too big.  */
1580       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1581 	break;
1582     }
1583 
1584   *p = result->next;
1585   result->next = NULL;
1586   result->cur = result->base;
1587   return result;
1588 }
1589 
1590 /* Creates a new buffer with enough space to hold the uncommitted
1591    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1592    the excess bytes to the new buffer.  Chains the new buffer after
1593    BUFF, and returns the new buffer.  */
1594 _cpp_buff *
_cpp_append_extend_buff(cpp_reader * pfile,_cpp_buff * buff,size_t min_extra)1595 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1596 {
1597   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1598   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1599 
1600   buff->next = new_buff;
1601   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1602   return new_buff;
1603 }
1604 
1605 /* Creates a new buffer with enough space to hold the uncommitted
1606    remaining bytes of the buffer pointed to by BUFF, and at least
1607    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1608    Chains the new buffer before the buffer pointed to by BUFF, and
1609    updates the pointer to point to the new buffer.  */
1610 void
_cpp_extend_buff(cpp_reader * pfile,_cpp_buff ** pbuff,size_t min_extra)1611 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1612 {
1613   _cpp_buff *new_buff, *old_buff = *pbuff;
1614   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1615 
1616   new_buff = _cpp_get_buff (pfile, size);
1617   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1618   new_buff->next = old_buff;
1619   *pbuff = new_buff;
1620 }
1621 
1622 /* Free a chain of buffers starting at BUFF.  */
1623 void
_cpp_free_buff(_cpp_buff * buff)1624 _cpp_free_buff (_cpp_buff *buff)
1625 {
1626   _cpp_buff *next;
1627 
1628   for (; buff; buff = next)
1629     {
1630       next = buff->next;
1631       free (buff->base);
1632     }
1633 }
1634 
1635 /* Allocate permanent, unaligned storage of length LEN.  */
1636 unsigned char *
_cpp_unaligned_alloc(cpp_reader * pfile,size_t len)1637 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1638 {
1639   _cpp_buff *buff = pfile->u_buff;
1640   unsigned char *result = buff->cur;
1641 
1642   if (len > (size_t) (buff->limit - result))
1643     {
1644       buff = _cpp_get_buff (pfile, len);
1645       buff->next = pfile->u_buff;
1646       pfile->u_buff = buff;
1647       result = buff->cur;
1648     }
1649 
1650   buff->cur = result + len;
1651   return result;
1652 }
1653 
1654 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1655    That buffer is used for growing allocations when saving macro
1656    replacement lists in a #define, and when parsing an answer to an
1657    assertion in #assert, #unassert or #if (and therefore possibly
1658    whilst expanding macros).  It therefore must not be used by any
1659    code that they might call: specifically the lexer and the guts of
1660    the macro expander.
1661 
1662    All existing other uses clearly fit this restriction: storing
1663    registered pragmas during initialization.  */
1664 unsigned char *
_cpp_aligned_alloc(cpp_reader * pfile,size_t len)1665 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1666 {
1667   _cpp_buff *buff = pfile->a_buff;
1668   unsigned char *result = buff->cur;
1669 
1670   if (len > (size_t) (buff->limit - result))
1671     {
1672       buff = _cpp_get_buff (pfile, len);
1673       buff->next = pfile->a_buff;
1674       pfile->a_buff = buff;
1675       result = buff->cur;
1676     }
1677 
1678   buff->cur = result + len;
1679   return result;
1680 }
1681 
1682 /* Say which field of TOK is in use.  */
1683 
1684 enum cpp_token_fld_kind
cpp_token_val_index(cpp_token * tok)1685 cpp_token_val_index (cpp_token *tok)
1686 {
1687   switch (TOKEN_SPELL (tok))
1688     {
1689     case SPELL_IDENT:
1690       return CPP_TOKEN_FLD_NODE;
1691     case SPELL_LITERAL:
1692       return CPP_TOKEN_FLD_STR;
1693     case SPELL_NONE:
1694       if (tok->type == CPP_MACRO_ARG)
1695 	return CPP_TOKEN_FLD_ARG_NO;
1696       else if (tok->type == CPP_PADDING)
1697 	return CPP_TOKEN_FLD_SOURCE;
1698       else if (tok->type == CPP_PRAGMA)
1699 	return CPP_TOKEN_FLD_PRAGMA;
1700       /* else fall through */
1701     default:
1702       return CPP_TOKEN_FLD_NONE;
1703     }
1704 }
1705