1 /* CPP Library - lexical analysis.
2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    Contributed by Per Bothner, 1994-95.
4    Based on CCCP program by Paul Rubin, June 1986
5    Adapted to ANSI C, Richard Stallman, Jan 1987
6    Broken out to separate file, Zack Weinberg, Mar 2000
7 
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26 
/* Category of spelling recorded for each token type in the
   token_spellings table below.  The OP table macro always selects
   SPELL_OPERATOR; the TK table macro selects SPELL_<s> by token
   pasting, so the enumerator names must keep the SPELL_ prefix.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT = 1,
  SPELL_LITERAL = 2,
  SPELL_NONE = 3
};
34 
/* One entry of the token_spellings table: the spelling category of a
   token type together with a fixed string for it (the operator text
   for OP entries, the stringified enumerator name for TK entries).  */
struct token_spelling
{
  enum spell_type category;	/* Read through TOKEN_SPELL.  */
  const unsigned char *name;	/* Read through TOKEN_NAME.  */
};
40 
/* Spellings of the digraph forms of punctuators.
   NOTE(review): the order presumably has to match the way the token
   speller indexes this table; confirm against its users.  */
static const unsigned char *const digraph_spellings[] =
{ U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
43 
/* Build the token_spellings table from TTYPE_TABLE (defined
   elsewhere; presumably a list of OP and TK invocations, one per
   token type).  OP entries get category SPELL_OPERATOR and the
   string S; TK entries get category SPELL_<S> and the stringified
   enumerator name E.  The helper macros are only needed while the
   table is expanded and are undefined straight afterwards.  */
#define OP(e, s) { SPELL_OPERATOR, U s  },
#define TK(e, s) { SPELL_ ## s,    U #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category / fixed name of TOKEN by its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 
/* Forward declarations of the file-local helpers defined below.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
			    unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
65 
66 
67 /* Utility routine:
68 
69    Compares, the token TOKEN to the NUL-terminated string STRING.
70    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
71 int
cpp_ideq(const cpp_token * token,const char * string)72 cpp_ideq (const cpp_token *token, const char *string)
73 {
74   if (token->type != CPP_NAME)
75     return 0;
76 
77   return !ustrcmp (NODE_NAME (token->val.node), (const uchar *) string);
78 }
79 
80 /* Record a note TYPE at byte POS into the current cleaned logical
81    line.  */
82 static void
add_line_note(cpp_buffer * buffer,const uchar * pos,unsigned int type)83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
84 {
85   if (buffer->notes_used == buffer->notes_cap)
86     {
87       buffer->notes_cap = buffer->notes_cap * 2 + 200;
88       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
89                                   buffer->notes_cap);
90     }
91 
92   buffer->notes[buffer->notes_used].pos = pos;
93   buffer->notes[buffer->notes_used].type = type;
94   buffer->notes_used++;
95 }
96 
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   On entry the physical line starts at buffer->next_line.  On exit
   buffer->cur and buffer->line_base point at the start of the cleaned
   line, the line is terminated by a '\n' stored at its end,
   buffer->next_line points just past the consumed input, and a line
   note has been recorded for every escaped newline and every trigraph
   that was seen, followed by a '\n' sentinel note.

   S is the read pointer into the raw text and D is the write pointer
   for the cleaned text; the line is rewritten in place.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  /* Start a fresh set of notes for this line.  */
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  /* One before the line, because both loops pre-increment S.  */
  s = buffer->next_line - 1;

  if (!buffer->from_stage3)
    {
      /* Short circuit for the common case of an un-escaped line with
	 no trigraphs.  The primary win here is by not writing any
	 data back to memory until we have to.  */
      for (;;)
	{
	  c = *++s;
	  if (c == '\n' || c == '\r')
	    {
	      d = (uchar *) s;

	      /* A terminator sitting at rlimit ends the buffer.  */
	      if (s == buffer->rlimit)
		goto done;

	      /* DOS line ending? */
	      if (c == '\r' && s[1] == '\n')
		s++;

	      if (s == buffer->rlimit)
		goto done;

	      /* check for escaped newline: step back over trailing
		 non-vertical whitespace and look for a backslash.  */
	      p = d;
	      while (p != buffer->next_line && is_nvspace (p[-1]))
		p--;
	      if (p == buffer->next_line || p[-1] != '\\')
		goto done;

	      /* Have an escaped newline; process it and proceed to
		 the slow path.  The note type is ' ' when whitespace
		 separated the backslash from the newline, '\\'
		 otherwise.  */
	      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
	      /* The slow loop pre-increments D, so its first store
		 lands on the backslash at p - 1.  */
	      d = p - 2;
	      buffer->next_line = p - 1;
	      break;
	    }
	  if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
	    {
	      /* Have a trigraph.  We may or may not have to convert
		 it.  Add a line note regardless, for -Wtrigraphs.  */
	      add_line_note (buffer, s, s[2]);
	      if (CPP_OPTION (pfile, trigraphs))
		{
		  /* We do, and that means we have to switch to the
		     slow path.  */
		  d = (uchar *) s;
		  *d = _cpp_trigraph_map[s[2]];
		  s += 2;
		  break;
		}
	    }
	}


      /* Slow path: copy every character from S down to D, removing
	 further escaped newlines and converting further trigraphs.  */
      for (;;)
	{
	  c = *++s;
	  *++d = c;

	  if (c == '\n' || c == '\r')
	    {
		  /* Handle DOS line endings.  */
	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
		s++;
	      if (s == buffer->rlimit)
		break;

	      /* Escaped?  */
	      p = d;
	      while (p != buffer->next_line && is_nvspace (p[-1]))
		p--;
	      if (p == buffer->next_line || p[-1] != '\\')
		break;

	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
	      d = p - 2;
	      buffer->next_line = p - 1;
	    }
	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
	    {
	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
	      add_line_note (buffer, d, s[2]);
	      if (CPP_OPTION (pfile, trigraphs))
		{
		  *d = _cpp_trigraph_map[s[2]];
		  s += 2;
		}
	    }
	}
    }
  else
    {
      /* from_stage3 buffers get no trigraph or escaped-newline
	 processing; just find the end of the line.  */
      do
	s++;
      while (*s != '\n' && *s != '\r');
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
	s++;
    }

 done:
  /* (TIGCC 20050212) Don't convert \r to \n, switch them instead.
     NOTE(review): d[1] is read here without a check against
     buffer->rlimit, so when the terminator at rlimit is a '\r' this
     reads one byte past rlimit.  Presumably the buffer is allocated
     with at least one readable byte there -- confirm.  */
  if (*d=='\r' && d[1]=='\n')
    {*d='\n'; d[1]='\r';}
  else
    *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
224 
225 /* Return true if the trigraph indicated by NOTE should be warned
226    about in a comment.  */
227 static bool
warn_in_comment(cpp_reader * pfile,_cpp_line_note * note)228 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
229 {
230   const uchar *p;
231 
232   /* Within comments we don't warn about trigraphs, unless the
233      trigraph forms an escaped newline, as that may change
234      behavior.  */
235   if (note->type != '/')
236     return false;
237 
238   /* If -trigraphs, then this was an escaped newline iff the next note
239      is coincident.  */
240   if (CPP_OPTION (pfile, trigraphs))
241     return note[1].pos == note->pos;
242 
243   /* Otherwise, see if this forms an escaped newline.  */
244   p = note->pos + 3;
245   while (is_nvspace (*p))
246     p++;
247 
248   /* There might have been escaped newlines between the trigraph and the
249      newline we found.  Hence the position test.  */
250   return (*p == '\n' && p < note[1].pos);
251 }
252 
253 /* Process the notes created by add_line_note as far as the current
254    location.  */
255 void
_cpp_process_line_notes(cpp_reader * pfile,int in_comment)256 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
257 {
258   cpp_buffer *buffer = pfile->buffer;
259 
260   for (;;)
261     {
262       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
263       unsigned int col;
264 
265       if (note->pos > buffer->cur)
266 	break;
267 
268       buffer->cur_note++;
269       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
270 
271       if (note->type == '\\' || note->type == ' ')
272 	{
273 	  if (note->type == ' ' && !in_comment)
274 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
275 				 "backslash and newline separated by space");
276 
277 	  if (buffer->next_line > buffer->rlimit)
278 	    {
279 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
280 				   "backslash-newline at end of file");
281 	      /* Prevent "no newline at end of file" warning.  */
282 	      buffer->next_line = buffer->rlimit;
283 	    }
284 
285 	  buffer->line_base = note->pos;
286 	  CPP_INCREMENT_LINE (pfile, 0);
287 	}
288       else if (_cpp_trigraph_map[note->type])
289 	{
290 	  if (CPP_OPTION (pfile, warn_trigraphs)
291 	      && (!in_comment || warn_in_comment (pfile, note)))
292 	    {
293 	      if (CPP_OPTION (pfile, trigraphs))
294 		cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
295 				     "trigraph ??%c converted to %c",
296 				     note->type,
297 				     (int) _cpp_trigraph_map[note->type]);
298 	      else
299 		{
300 		  cpp_error_with_line
301 		    (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
302 		     "trigraph ??%c ignored, use -trigraphs to enable",
303 		     note->type);
304 		}
305 	    }
306 	}
307       else
308 	abort ();
309     }
310 }
311 
/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
   true if the comment was terminated by the end of the buffer, false
   if its closing delimiter was found.

   Buffer->cur points to the initial asterisk of the comment.  On a
   false return buffer->cur points just past the closing '/'.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;

  /* Step over the opening asterisk.  If a '/' follows immediately,
     step over it too, so that the opening asterisk cannot be taken
     for the one that closes the comment.  */
  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
	 instead for efficiency.  */
      c = *cur++;

      if (c == '/')
	{
	  /* cur[-1] is the '/', so cur[-2] is the character before.  */
	  if (cur[-2] == '*')
	    break;

	  /* Warn about potential nested comments, but not if the '/'
	     comes immediately before the true comment delimiter.
	     Don't bother to get it right across escaped newlines.  */
	  if (CPP_OPTION (pfile, warn_comments)
	      && cur[0] == '*' && cur[1] != '/')
	    {
	      /* Publish the position so the column is reported right.  */
	      buffer->cur = cur;
	      cpp_error_with_line (pfile, CPP_DL_WARNING,
				   pfile->line_table->highest_line, CPP_BUF_COL (buffer),
				   "\"/*\" within comment");
	    }
	}
      else if (c == '\n')
	{
	  unsigned int cols;
	  /* End of the cleaned line: flush its pending notes, then
	     load the next line, unless the buffer is exhausted.  */
	  buffer->cur = cur - 1;
	  _cpp_process_line_notes (pfile, true);
	  if (buffer->next_line >= buffer->rlimit)
	    return true;
	  _cpp_clean_line (pfile);

	  cols = buffer->next_line - buffer->line_base;
	  CPP_INCREMENT_LINE (pfile, cols);

	  /* _cpp_clean_line reset buffer->cur to the new line.  */
	  cur = buffer->cur;
	}
    }

  buffer->cur = cur;
  _cpp_process_line_notes (pfile, true);
  return false;
}
371 
372 /* Skip a C++ line comment, leaving buffer->cur pointing to the
373    terminating newline.  Handles escaped newlines.  Returns nonzero
374    if a multiline comment.  */
375 static int
skip_line_comment(cpp_reader * pfile)376 skip_line_comment (cpp_reader *pfile)
377 {
378   cpp_buffer *buffer = pfile->buffer;
379   unsigned int orig_line = pfile->line_table->highest_line;
380 
381   while (*buffer->cur != '\n')
382     buffer->cur++;
383 
384   _cpp_process_line_notes (pfile, true);
385   return orig_line != pfile->line_table->highest_line;
386 }
387 
388 /* Skips whitespace, saving the next non-whitespace character.  */
389 static void
skip_whitespace(cpp_reader * pfile,cppchar_t c)390 skip_whitespace (cpp_reader *pfile, cppchar_t c)
391 {
392   cpp_buffer *buffer = pfile->buffer;
393   bool saw_NUL = false;
394 
395   do
396     {
397       /* Horizontal space always OK.  */
398       if (c == ' ' || c == '\t')
399 	;
400       /* Just \f \v or \0 left.  */
401       else if (c == '\0')
402 	saw_NUL = true;
403       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
404 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
405 			     CPP_BUF_COL (buffer),
406 			     "%s in preprocessing directive",
407 			     c == '\f' ? "form feed" : "vertical tab");
408 
409       c = *buffer->cur++;
410     }
411   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
412   while (is_nvspace (c));
413 
414   if (saw_NUL)
415     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
416 
417   buffer->cur--;
418 }
419 
420 /* See if the characters of a number token are valid in a name (no
421    '.', '+' or '-').  */
422 static int
name_p(cpp_reader * pfile,const cpp_string * string)423 name_p (cpp_reader *pfile, const cpp_string *string)
424 {
425   unsigned int i;
426 
427   for (i = 0; i < string->len; i++)
428     if (!is_idchar (string->text[i]))
429       return 0;
430 
431   return 1;
432 }
433 
434 /* After parsing an identifier or other sequence, produce a warning about
435    sequences not in NFC/NFKC.  */
436 static void
warn_about_normalization(cpp_reader * pfile,const cpp_token * token,const struct normalize_state * s)437 warn_about_normalization (cpp_reader *pfile,
438 			  const cpp_token *token,
439 			  const struct normalize_state *s)
440 {
441   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
442       && !pfile->state.skipping)
443     {
444       /* Make sure that the token is printed using UCNs, even
445 	 if we'd otherwise happily print UTF-8.  */
446       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
447       size_t sz;
448 
449       sz = cpp_spell_token (pfile, token, buf, false) - buf;
450       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
451 	cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
452 			     "`%.*s' is not in NFKC", (int) sz, buf);
453       else
454 	cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
455 			     "`%.*s' is not in NFC", (int) sz, buf);
456     }
457 }
458 
459 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
460    an identifier.  FIRST is TRUE if this starts an identifier.  */
461 static bool
forms_identifier_p(cpp_reader * pfile,int first,struct normalize_state * state)462 forms_identifier_p (cpp_reader *pfile, int first,
463 		    struct normalize_state *state)
464 {
465   cpp_buffer *buffer = pfile->buffer;
466 
467   if (*buffer->cur == '$')
468     {
469       if (!CPP_OPTION (pfile, dollars_in_ident))
470 	return false;
471 
472       buffer->cur++;
473       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
474 	{
475 	  CPP_OPTION (pfile, warn_dollars) = 0;
476 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
477 	}
478 
479       return true;
480     }
481 
482   /* Is this a syntactically valid UCN?  */
483   if (CPP_OPTION (pfile, extended_identifiers)
484       && *buffer->cur == '\\'
485       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
486     {
487       buffer->cur += 2;
488       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
489 			  state))
490 	return true;
491       buffer->cur -= 2;
492     }
493 
494   return false;
495 }
496 
497 /* Lex an identifier starting at BUFFER->CUR - 1.  */
498 static cpp_hashnode *
lex_identifier(cpp_reader * pfile,const uchar * base,bool starts_ucn,struct normalize_state * nst)499 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
500 		struct normalize_state *nst)
501 {
502   cpp_hashnode *result;
503   const uchar *cur;
504   unsigned int len;
505   unsigned int hash = HT_HASHSTEP (0, *base);
506 
507   cur = pfile->buffer->cur;
508   if (! starts_ucn)
509     while (ISIDNUM (*cur))
510       {
511 	hash = HT_HASHSTEP (hash, *cur);
512 	cur++;
513       }
514   pfile->buffer->cur = cur;
515   if (starts_ucn || forms_identifier_p (pfile, false, nst))
516     {
517       /* Slower version for identifiers containing UCNs (or $).  */
518       do {
519 	while (ISIDNUM (*pfile->buffer->cur))
520 	  {
521 	    pfile->buffer->cur++;
522 	    NORMALIZE_STATE_UPDATE_IDNUM (nst);
523 	  }
524       } while (forms_identifier_p (pfile, false, nst));
525       result = _cpp_interpret_identifier (pfile, base,
526 					  pfile->buffer->cur - base);
527     }
528   else
529     {
530       len = cur - base;
531       hash = HT_HASHFINISH (hash, len);
532 
533       result = (cpp_hashnode *)
534 	ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
535     }
536 
537   /* Rarely, identifiers require diagnostics when lexed.  */
538   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
539 			&& !pfile->state.skipping, 0))
540     {
541       /* It is allowed to poison the same identifier twice.  */
542       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
543 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
544 		   NODE_NAME (result));
545 
546       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
547 	 replacement list of a variadic macro.  */
548       if (result == pfile->spec_nodes.n__VA_ARGS__
549 	  && !pfile->state.va_args_ok)
550 	cpp_error (pfile, CPP_DL_PEDWARN,
551 		   "__VA_ARGS__ can only appear in the expansion"
552 		   " of a C99 variadic macro");
553     }
554 
555   return result;
556 }
557 
558 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
559 static void
lex_number(cpp_reader * pfile,cpp_string * number,struct normalize_state * nst)560 lex_number (cpp_reader *pfile, cpp_string *number,
561 	    struct normalize_state *nst)
562 {
563   const uchar *cur;
564   const uchar *base;
565   uchar *dest;
566 
567   base = pfile->buffer->cur - 1;
568   do
569     {
570       cur = pfile->buffer->cur;
571 
572       /* N.B. ISIDNUM does not include $.  */
573       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
574 	{
575 	  cur++;
576 	  NORMALIZE_STATE_UPDATE_IDNUM (nst);
577 	}
578 
579       pfile->buffer->cur = cur;
580     }
581   while (forms_identifier_p (pfile, false, nst));
582 
583   number->len = cur - base;
584   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
585   memcpy (dest, base, number->len);
586   dest[number->len] = '\0';
587   number->text = dest;
588 }
589 
590 /* Create a token of type TYPE with a literal spelling.  */
591 static void
create_literal(cpp_reader * pfile,cpp_token * token,const uchar * base,unsigned int len,enum cpp_ttype type)592 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
593 		unsigned int len, enum cpp_ttype type)
594 {
595   char *p;
596   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
597 
598   memcpy (dest, base, len);
599   dest[len] = '\0';
600   /* (TIGCC 20050206) Delete \r characters in multi-line strings. */
601   p = (char *)dest;
602   while (p < (char *)dest + len) {
603     if (*p == '\r') {
604       memmove (p, p + 1, (char *)dest + len - p);
605       len--;
606     } else p++;
607   }
608   token->type = type;
609   token->val.str.len = len;
610   token->val.str.text = dest;
611 }
612 
/* Lexes a string, character constant, or angle-bracketed header file
   name starting at BASE.  The stored string contains the spelling,
   including the opening quote and any leading 'L'.  The result is
   stored in TOKEN through create_literal; its type is the type of
   the literal, or CPP_OTHER if it was not properly terminated.  On
   return buffer->cur points just past the characters consumed.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.

   Multi-line strings are allowed as a TIGCC extension (removed in the FSF GCC
   since version 3.3).  */
static void
lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  bool saw_NUL = false;
  const uchar *cur;
  cppchar_t terminator;
  enum cpp_ttype type;
  cpp_buffer *buffer;
  unsigned int startcol;

  buffer = pfile->buffer;
  /* Column at entry, recorded below if the string spans lines.  */
  startcol = CPP_BUF_COL (buffer);
  cur = base;
  terminator = *cur++;
  /* A leading 'L' marks a wide literal; the quote follows it.  */
  if (terminator == 'L')
    terminator = *cur++;
  if (terminator == '\"')
    type = *base == 'L' ? CPP_WSTRING: CPP_STRING;
  else if (terminator == '\'')
    type = *base == 'L' ? CPP_WCHAR: CPP_CHAR;
  else
    /* Anything else is taken to open an angle-bracketed header name.  */
    terminator = '>', type = CPP_HEADER_NAME;

  for (;;)
    {
      cppchar_t c = *cur++;

      /* In #include-style directives, terminators are not escapable.
	 A backslash directly before the newline is left alone so the
	 newline is still seen by the branch below.  */
      if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
	cur++;
      else if (c == terminator)
	break;
      else if (c == '\n')
	{
	  unsigned int cols;

	  /* In assembly language, silently terminate string and
	     character literals at end of line.  This is a kludge
	     around not knowing where comments are.  */
	  if (CPP_OPTION (pfile, lang) == CLK_ASM && terminator != '>')
	    {
	      cur--;
	      break;
	    }

	  /* Character constants and header names may not extend over
	     multiple lines.  In Standard C, neither may strings.
	     In TIGCC, we accept multiline strings as an
	     extension, except in #include family directives.  */
	  if (terminator != '"' || pfile->state.angled_headers)
	    {
	      cur--;
	      type = CPP_OTHER;
	      break;
	    }

	  if (CPP_PEDANTIC (pfile))
	    cpp_error(pfile, CPP_DL_PEDWARN, "ISO C forbids newline in string literal");
	  /* Flush the notes of the line just finished, then pull in
	     the next line, unless the buffer is exhausted, in which
	     case the string is unterminated.  */
	  buffer->cur = cur - 1;
	  _cpp_process_line_notes (pfile, true);
	  if (buffer->next_line >= buffer->rlimit)
	    {
	      cur--;
	      type = CPP_OTHER;
	      break;
	    }
	  _cpp_clean_line (pfile);

	  cols = buffer->next_line - buffer->line_base;
	  CPP_INCREMENT_LINE (pfile, cols);

	  /* _cpp_clean_line reset buffer->cur to the new line.  */
	  cur = buffer->cur;

	  /* Record where the multi-line string started, unless a
	     start position is already recorded.  */
	  if (pfile->mls_line == 0)
	    {
	      pfile->mls_line = token->src_loc;
	      pfile->mls_col = startcol;
	    }
	}
      else if (c == '\0')
	saw_NUL = true;
    }

  if (saw_NUL && !pfile->state.skipping)
    cpp_error (pfile, CPP_DL_WARNING,
	       "null character(s) preserved in literal");

  pfile->buffer->cur = cur;
  create_literal (pfile, token, base, cur - base, type);
}
713 
714 /* The stored comment includes the comment start and any terminator.  */
715 static void
save_comment(cpp_reader * pfile,cpp_token * token,const unsigned char * from,cppchar_t type)716 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
717 	      cppchar_t type)
718 {
719   unsigned char *buffer;
720   unsigned int len, clen;
721 
722   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
723 
724   /* C++ comments probably (not definitely) have moved past a new
725      line, which we don't want to save in the comment.  */
726   if (is_vspace (pfile->buffer->cur[-1]))
727     len--;
728 
729   /* If we are currently in a directive, then we need to store all
730      C++ comments as C comments internally, and so we need to
731      allocate a little extra space in that case.
732 
733      Note that the only time we encounter a directive here is
734      when we are saving comments in a "#define".  */
735   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
736 
737   buffer = _cpp_unaligned_alloc (pfile, clen);
738 
739   token->type = CPP_COMMENT;
740   token->val.str.len = clen;
741   token->val.str.text = buffer;
742 
743   buffer[0] = '/';
744   memcpy (buffer + 1, from, len - 1);
745 
746   /* Finish conversion to a C comment, if necessary.  */
747   if (pfile->state.in_directive && type == '/')
748     {
749       buffer[1] = '*';
750       buffer[clen - 2] = '*';
751       buffer[clen - 1] = '/';
752     }
753 }
754 
755 /* Allocate COUNT tokens for RUN.  */
756 void
_cpp_init_tokenrun(tokenrun * run,unsigned int count)757 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
758 {
759   run->base = XNEWVEC (cpp_token, count);
760   run->limit = run->base + count;
761   run->next = NULL;
762 }
763 
764 /* Returns the next tokenrun, or creates one if there is none.  */
765 static tokenrun *
next_tokenrun(tokenrun * run)766 next_tokenrun (tokenrun *run)
767 {
768   if (run->next == NULL)
769     {
770       run->next = XNEW (tokenrun);
771       run->next->prev = run;
772       _cpp_init_tokenrun (run->next, 250);
773     }
774 
775   return run->next;
776 }
777 
778 /* Allocate a single token that is invalidated at the same time as the
779    rest of the tokens on the line.  Has its line and col set to the
780    same as the last lexed token, so that diagnostics appear in the
781    right place.  */
782 cpp_token *
_cpp_temp_token(cpp_reader * pfile)783 _cpp_temp_token (cpp_reader *pfile)
784 {
785   cpp_token *old, *result;
786 
787   old = pfile->cur_token - 1;
788   if (pfile->cur_token == pfile->cur_run->limit)
789     {
790       pfile->cur_run = next_tokenrun (pfile->cur_run);
791       pfile->cur_token = pfile->cur_run->base;
792     }
793 
794   result = pfile->cur_token++;
795   result->src_loc = old->src_loc;
796   return result;
797 }
798 
799 /* Lex a token into RESULT (external interface).  Takes care of issues
800    like directive handling, token lookahead, multiple include
801    optimization and skipping.  */
802 const cpp_token *
_cpp_lex_token(cpp_reader * pfile)803 _cpp_lex_token (cpp_reader *pfile)
804 {
805   cpp_token *result;
806 
807   for (;;)
808     {
809       if (pfile->cur_token == pfile->cur_run->limit)
810 	{
811 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
812 	  pfile->cur_token = pfile->cur_run->base;
813 	}
814 
815       if (pfile->lookaheads)
816 	{
817 	  pfile->lookaheads--;
818 	  result = pfile->cur_token++;
819 	}
820       else
821 	result = _cpp_lex_direct (pfile);
822 
823       if (result->flags & BOL)
824 	{
825 	  /* Is this a directive.  If _cpp_handle_directive returns
826 	     false, it is an assembler #.  */
827 	  if (result->type == CPP_HASH
828 	      /* 6.10.3 p 11: Directives in a list of macro arguments
829 		 gives undefined behavior.  This implementation
830 		 handles the directive as normal.  */
831 	      && pfile->state.parsing_args != 1
832 	      && _cpp_handle_directive (pfile, result->flags & PREV_WHITE))
833 	    {
834 	      if (pfile->directive_result.type == CPP_PADDING)
835 		continue;
836 	      else
837 		{
838 		  result = &pfile->directive_result;
839 		  break;
840 		}
841 	    }
842 
843 	  if (pfile->cb.line_change && !pfile->state.skipping)
844 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
845 	}
846 
847       /* We don't skip tokens in directives.  */
848       if (pfile->state.in_directive)
849 	break;
850 
851       /* Outside a directive, invalidate controlling macros.  At file
852 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
853 	 get here and MI optimization works.  */
854       pfile->mi_valid = false;
855 
856       if (!pfile->state.skipping || result->type == CPP_EOF)
857 	break;
858     }
859 
860   return result;
861 }
862 
863 /* Returns true if a fresh line has been loaded.  */
864 bool
_cpp_get_fresh_line(cpp_reader * pfile)865 _cpp_get_fresh_line (cpp_reader *pfile)
866 {
867   int return_at_eof;
868 
869   /* We can't get a new line until we leave the current directive.  */
870   if (pfile->state.in_directive)
871     return false;
872 
873   for (;;)
874     {
875       cpp_buffer *buffer = pfile->buffer;
876 
877       if (!buffer->need_line)
878 	return true;
879 
880       if (buffer->next_line < buffer->rlimit)
881 	{
882 	  _cpp_clean_line (pfile);
883 	  return true;
884 	}
885 
886       /* First, get out of parsing arguments state.  */
887       if (pfile->state.parsing_args)
888 	return false;
889 
890       /* End of buffer.  Non-empty files should end in a newline.  */
891       if (buffer->buf != buffer->rlimit
892 	  && buffer->next_line > buffer->rlimit
893 	  && !buffer->from_stage3)
894 	{
895 	  /* Only warn once.  */
896 	  buffer->next_line = buffer->rlimit;
897 	  cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
898 			       CPP_BUF_COLUMN (buffer, buffer->cur),
899 			       "no newline at end of file");
900 	}
901 
902       return_at_eof = buffer->return_at_eof;
903       _cpp_pop_buffer (pfile);
904       if (pfile->buffer == NULL || return_at_eof)
905 	return false;
906     }
907 }
908 
/* Set result->type to THEN_TYPE and consume one character if the
   next character in the buffer is CHAR; otherwise set it to
   ELSE_TYPE and consume nothing.  Relies on variables named `result'
   and `buffer' being in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      result->type = ELSE_TYPE;				\
      if (*buffer->cur == CHAR)				\
	buffer->cur++, result->type = THEN_TYPE;	\
    }							\
  while (0)
917 
918 /* Lex a token into pfile->cur_token, which is also incremented, to
919    get diagnostics pointing to the correct location.
920 
921    Does not handle issues such as token lookahead, multiple-include
922    optimization, directives, skipping etc.  This function is only
923    suitable for use by _cpp_lex_token, and in special cases like
924    lex_expansion_token which doesn't care for any of these issues.
925 
926    When meeting a newline, returns CPP_EOF if parsing a directive,
927    otherwise returns to the start of the token buffer if permissible.
928    Returns the location of the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next slot of the current token run; pfile->cur_token is
     left pointing just past it.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered on the first pass, and re-entered from the '\n' case below
     after a line has been used up.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* _cpp_get_fresh_line returned false (presumably: no more input
	 is available in this context), so hand back CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
	{
	  result->type = CPP_EOF;
	  if (!pfile->state.in_directive)
	    {
	      /* Tell the compiler the line number of the EOF token.  */
	      result->src_loc = pfile->line_table->highest_line;
	      result->flags = BOL;
	    }
	  return result;
	}
      /* Unless tokens are being kept, rewind to the first slot of the
	 base run so that its storage is reused for the new line.  */
      if (!pfile->keep_tokens)
	{
	  pfile->cur_run = &pfile->base_run;
	  result = pfile->base_run.base;
	  pfile->cur_token = result + 1;
	}
      /* The token about to be lexed is the first one on its line.  */
      result->flags = BOL;
      /* NOTE(review): parsing_args == 2 presumably means macro
	 arguments are being collected, where the line break must count
	 as white space - confirm against the macro expander.  */
      if (pfile->state.parsing_args == 2)
	result->flags |= PREV_WHITE;
    }
  /* Reload: pfile->buffer is read again after the fresh-line call.  */
  buffer = pfile->buffer;
  /* Re-entered after a discarded comment so that the token's location
     is taken from the line table again.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after a run of horizontal white space was skipped.  */
 skipped_white:
  /* The read position has reached the next pending line note: process
     it (unless the buffer is overlaid) and refresh the location.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  /* Fetch the character that starts the token.  buffer->cur now points
     one past it, which is why the cases below pass buffer->cur - 1 as
     the start of the token.  */
  c = *buffer->cur++;

  /* NOTE(review): presumably folds the current column into
     result->src_loc - confirm against the line-map macros.  */
  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
			       CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
      /* Horizontal white space (a NUL byte is handled here too): flag
	 the token as preceded by white space and try again.  */
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* The line count is only advanced when the newline is not the
	 last byte before rlimit.  */
      if (buffer->cur < buffer->rlimit)
	CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->type = CPP_NUMBER;
	lex_number (pfile, &result->val.str, &nst);
	warn_about_normalization (pfile, result, &nst);
	break;
      }

    case 'L':
      /* 'L' may introduce wide characters or strings.  */
      if (*buffer->cur == '\'' || *buffer->cur == '"')
	{
	  lex_string (pfile, result, buffer->cur - 1);
	  break;
	}
      /* Fall through.  */

      /* Identifier start characters.  'L' is missing from the list
	 below because it has its own case above, which falls through
	 to here when no quote follows.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
					   &nst);
	warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      /* The identifier node stays in val.node; NAMED_OP records that
	 the operator was written as a name.  */
      if (result->val.node->flags & NODE_OPERATOR)
	{
	  result->flags |= NAMED_OP;
	  result->type = (enum cpp_ttype) result->val.node->directive_index;
	}
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
	{
	  /* A nonzero return is reported as an unterminated comment.  */
	  if (_cpp_skip_block_comment (pfile))
	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
	}
      /* "//" is only a comment when the cplusplus_comments option is
	 set or the code comes from a system header.  */
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
			    || cpp_in_system_header (pfile)))
	{
	  /* Warn about comments only if pedantically GNUC89, and not
	     in system headers.  */
	  if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
	      && ! buffer->warned_cplusplus_comments)
	    {
	      cpp_error (pfile, CPP_DL_PEDWARN,
			 "C++ style comments are not allowed in ISO C90");
	      cpp_error (pfile, CPP_DL_PEDWARN,
			 "(this will be reported only once per input file)");
	      buffer->warned_cplusplus_comments = 1;
	    }

	  /* A nonzero return from skip_line_comment is reported as a
	     multi-line comment when warn_comments is enabled.  */
	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
	    cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
	}
      else if (c == '=')
	{
	  buffer->cur++;
	  result->type = CPP_DIV_EQ;
	  break;
	}
      else
	{
	  result->type = CPP_DIV;
	  break;
	}

      /* Only the two comment branches reach this point; the operator
	 branches have already left the switch.  When comments are not
	 being saved the comment counts as white space and the next
	 token is lexed into the same slot.  */
      if (!pfile->state.save_comments)
	{
	  result->flags |= PREV_WHITE;
	  goto update_tokens_line;
	}

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* With state.angled_headers set, '<' is handed to lex_string
	 instead of being treated as an operator.  */
      if (pfile->state.angled_headers)
	{
	  lex_string (pfile, result, buffer->cur - 1);
	  break;
	}

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
	}
      /* "<?" and "<?=" are only recognised with the cplusplus option.  */
      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_MIN_EQ, CPP_MIN);
	}
      /* Digraphs "<:" and "<%" stand for '[' and '{'.  */
      else if (CPP_OPTION (pfile, digraphs))
	{
	  if (*buffer->cur == ':')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_SQUARE;
	    }
	  else if (*buffer->cur == '%')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_BRACE;
	    }
	}
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
	}
      /* ">?" and ">?=" are only recognised with the cplusplus option.  */
      else if (*buffer->cur == '?' && CPP_OPTION (pfile, cplusplus))
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_MAX_EQ, CPP_MAX);
	}
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
	{
	  /* "%:" is the digraph for '#'; "%:%:" is the one for "##".  */
	  if (*buffer->cur == ':')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_HASH;
	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
		buffer->cur += 2, result->type = CPP_PASTE;
	    }
	  /* "%>" is the digraph for '}'.  */
	  else if (*buffer->cur == '>')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_CLOSE_BRACE;
	    }
	}
      break;

    case '.':
      result->type = CPP_DOT;
      /* A digit directly after the '.' makes the token a number.  */
      if (ISDIGIT (*buffer->cur))
	{
	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	  result->type = CPP_NUMBER;
	  lex_number (pfile, &result->val.str, &nst);
	  warn_about_normalization (pfile, result, &nst);
	}
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
	buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
	buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
	{
	  buffer->cur++;
	  result->type = CPP_DEREF;
	  /* "->*" is only recognised with the cplusplus option.  */
	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	    buffer->cur++, result->type = CPP_DEREF_STAR;
	}
      else if (*buffer->cur == '-')
	buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
	buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
	buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_SCOPE;
      /* ":>" is the digraph for ']'.  */
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
	{
	  buffer->cur++;
	  result->flags |= DIGRAPH;
	  result->type = CPP_CLOSE_SQUARE;
	}
      break;

      /* NOTE(review): IF_NEXT_IS is defined outside this chunk; it
	 presumably selects the first type (consuming the character)
	 when the next character matches, else the second - confirm.  */
    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
	/* Step back onto the character and ask forms_identifier_p
	   whether an identifier starts here.  */
	const uchar *base = --buffer->cur;
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;

	if (forms_identifier_p (pfile, true, &nst))
	  {
	    result->type = CPP_NAME;
	    result->val.node = lex_identifier (pfile, base, true, &nst);
	    warn_about_normalization (pfile, result, &nst);
	    break;
	  }
	/* Not an identifier: step forward again and fall through so the
	   character becomes a CPP_OTHER token.  */
	buffer->cur++;
      }

      /* Anything else is a one-character CPP_OTHER literal.  */
    default:
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1278 
1279 /* An upper bound on the number of bytes needed to spell TOKEN.
1280    Does not include preceding whitespace.  */
1281 unsigned int
cpp_token_len(const cpp_token * token)1282 cpp_token_len (const cpp_token *token)
1283 {
1284   unsigned int len;
1285 
1286   switch (TOKEN_SPELL (token))
1287     {
1288     default:		len = 4;				break;
1289     case SPELL_LITERAL:	len = token->val.str.len;		break;
1290     case SPELL_IDENT:	len = NODE_LEN (token->val.node) * 10;	break;
1291     }
1292 
1293   return len;
1294 }
1295 
/* Decode the UTF-8 sequence that starts at NAME and store it in
   BUFFER as a \U escape: a backslash, 'U' and eight lower-case
   hexadecimal digits, ten bytes in all and with no terminating NUL.
   Return the number of bytes consumed from NAME, which is the count
   of leading one bits in the first byte.  Aborts if a continuation
   byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned char *out = buffer;
  unsigned lead = *src;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the lead byte, then six bits from each
     continuation byte.  */
  code_point = *src & (0x7F >> seq_len);
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      unsigned char trail = *++src;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
	abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  *out++ = '\\';
  *out++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *out++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
1329 
1330 
1331 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1332    already contain the enough space to hold the token's spelling.
1333    Returns a pointer to the character after the last character written.
1334    FORSTRING is true if this is to be the spelling after translation
1335    phase 1 (this is different for UCNs).
1336    FIXME: Would be nice if we didn't need the PFILE argument.  */
1337 unsigned char *
cpp_spell_token(cpp_reader * pfile,const cpp_token * token,unsigned char * buffer,bool forstring)1338 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1339 		 unsigned char *buffer, bool forstring)
1340 {
1341   switch (TOKEN_SPELL (token))
1342     {
1343     case SPELL_OPERATOR:
1344       {
1345 	const unsigned char *spelling;
1346 	unsigned char c;
1347 
1348 	if (token->flags & DIGRAPH)
1349 	  spelling
1350 	    = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1351 	else if (token->flags & NAMED_OP)
1352 	  goto spell_ident;
1353 	else
1354 	  spelling = TOKEN_NAME (token);
1355 
1356 	while ((c = *spelling++) != '\0')
1357 	  *buffer++ = c;
1358       }
1359       break;
1360 
1361     spell_ident:
1362     case SPELL_IDENT:
1363       if (forstring)
1364 	{
1365 	  memcpy (buffer, NODE_NAME (token->val.node),
1366 		  NODE_LEN (token->val.node));
1367 	  buffer += NODE_LEN (token->val.node);
1368 	}
1369       else
1370 	{
1371 	  size_t i;
1372 	  const unsigned char * name = NODE_NAME (token->val.node);
1373 
1374 	  for (i = 0; i < NODE_LEN (token->val.node); i++)
1375 	    if (name[i] & ~0x7F)
1376 	      {
1377 		i += utf8_to_ucn (buffer, name + i) - 1;
1378 		buffer += 10;
1379 	      }
1380 	    else
1381 	      *buffer++ = NODE_NAME (token->val.node)[i];
1382 	}
1383       break;
1384 
1385     case SPELL_LITERAL:
1386       memcpy (buffer, token->val.str.text, token->val.str.len);
1387       buffer += token->val.str.len;
1388       break;
1389 
1390     case SPELL_NONE:
1391       cpp_error (pfile, CPP_DL_ICE,
1392 		 "unspellable token %s", TOKEN_NAME (token));
1393       break;
1394     }
1395 
1396   return buffer;
1397 }
1398 
1399 /* Returns TOKEN spelt as a null-terminated string.  The string is
1400    freed when the reader is destroyed.  Useful for diagnostics.  */
1401 unsigned char *
cpp_token_as_text(cpp_reader * pfile,const cpp_token * token)1402 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1403 {
1404   unsigned int len = cpp_token_len (token) + 1;
1405   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1406 
1407   end = cpp_spell_token (pfile, token, start, false);
1408   end[0] = '\0';
1409 
1410   return start;
1411 }
1412 
1413 /* Used by C front ends, which really should move to using
1414    cpp_token_as_text.  */
1415 const char *
cpp_type2name(enum cpp_ttype type)1416 cpp_type2name (enum cpp_ttype type)
1417 {
1418   return (const char *) token_spellings[type].name;
1419 }
1420 
1421 /* Writes the spelling of token to FP, without any preceding space.
1422    Separated from cpp_spell_token for efficiency - to avoid stdio
1423    double-buffering.  */
1424 void
cpp_output_token(const cpp_token * token,FILE * fp)1425 cpp_output_token (const cpp_token *token, FILE *fp)
1426 {
1427   switch (TOKEN_SPELL (token))
1428     {
1429     case SPELL_OPERATOR:
1430       {
1431 	const unsigned char *spelling;
1432 	int c;
1433 
1434 	if (token->flags & DIGRAPH)
1435 	  spelling
1436 	    = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1437 	else if (token->flags & NAMED_OP)
1438 	  goto spell_ident;
1439 	else
1440 	  spelling = TOKEN_NAME (token);
1441 
1442 	c = *spelling;
1443 	do
1444 	  putc (c, fp);
1445 	while ((c = *++spelling) != '\0');
1446       }
1447       break;
1448 
1449     spell_ident:
1450     case SPELL_IDENT:
1451       {
1452 	size_t i;
1453 	const unsigned char * name = NODE_NAME (token->val.node);
1454 
1455 	for (i = 0; i < NODE_LEN (token->val.node); i++)
1456 	  if (name[i] & ~0x7F)
1457 	    {
1458 	      unsigned char buffer[10];
1459 	      i += utf8_to_ucn (buffer, name + i) - 1;
1460 	      fwrite (buffer, 1, 10, fp);
1461 	    }
1462 	  else
1463 	    fputc (NODE_NAME (token->val.node)[i], fp);
1464       }
1465       break;
1466 
1467     case SPELL_LITERAL:
1468       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1469       break;
1470 
1471     case SPELL_NONE:
1472       /* An error, most probably.  */
1473       break;
1474     }
1475 }
1476 
1477 /* Compare two tokens.  */
1478 int
_cpp_equiv_tokens(const cpp_token * a,const cpp_token * b)1479 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1480 {
1481   if (a->type == b->type && a->flags == b->flags)
1482     switch (TOKEN_SPELL (a))
1483       {
1484       default:			/* Keep compiler happy.  */
1485       case SPELL_OPERATOR:
1486 	return 1;
1487       case SPELL_NONE:
1488 	return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1489       case SPELL_IDENT:
1490 	return a->val.node == b->val.node;
1491       case SPELL_LITERAL:
1492 	return (a->val.str.len == b->val.str.len
1493 		&& !memcmp (a->val.str.text, b->val.str.text,
1494 			    a->val.str.len));
1495       }
1496 
1497   return 0;
1498 }
1499 
1500 /* Returns nonzero if a space should be inserted to avoid an
1501    accidental token paste for output.  For simplicity, it is
1502    conservative, and occasionally advises a space where one is not
1503    needed, e.g. "." and ".2".  */
1504 int
cpp_avoid_paste(cpp_reader * pfile,const cpp_token * token1,const cpp_token * token2)1505 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1506 		 const cpp_token *token2)
1507 {
1508   enum cpp_ttype a = token1->type, b = token2->type;
1509   cppchar_t c;
1510 
1511   if (token1->flags & NAMED_OP)
1512     a = CPP_NAME;
1513   if (token2->flags & NAMED_OP)
1514     b = CPP_NAME;
1515 
1516   c = EOF;
1517   if (token2->flags & DIGRAPH)
1518     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1519   else if (token_spellings[b].category == SPELL_OPERATOR)
1520     c = token_spellings[b].name[0];
1521 
1522   /* Quickly get everything that can paste with an '='.  */
1523   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1524     return 1;
1525 
1526   switch (a)
1527     {
1528     case CPP_GREATER:	return c == '>' || c == '?';
1529     case CPP_LESS:	return c == '<' || c == '?' || c == '%' || c == ':';
1530     case CPP_PLUS:	return c == '+';
1531     case CPP_MINUS:	return c == '-' || c == '>';
1532     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
1533     case CPP_MOD:	return c == ':' || c == '>';
1534     case CPP_AND:	return c == '&';
1535     case CPP_OR:	return c == '|';
1536     case CPP_COLON:	return c == ':' || c == '>';
1537     case CPP_DEREF:	return c == '*';
1538     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
1539     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
1540     case CPP_NAME:	return ((b == CPP_NUMBER
1541 				 && name_p (pfile, &token2->val.str))
1542 				|| b == CPP_NAME
1543 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
1544     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
1545 				|| c == '.' || c == '+' || c == '-');
1546 				      /* UCNs */
1547     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
1548 				 && b == CPP_NAME)
1549 				|| (CPP_OPTION (pfile, objc)
1550 				    && token1->val.str.text[0] == '@'
1551 				    && (b == CPP_NAME || b == CPP_STRING)));
1552     default:		break;
1553     }
1554 
1555   return 0;
1556 }
1557 
1558 /* Output all the remaining tokens on the current line, and a newline
1559    character, to FP.  Leading whitespace is removed.  If there are
1560    macros, special token padding is not performed.  */
1561 void
cpp_output_line(cpp_reader * pfile,FILE * fp)1562 cpp_output_line (cpp_reader *pfile, FILE *fp)
1563 {
1564   const cpp_token *token;
1565 
1566   token = cpp_get_token (pfile);
1567   while (token->type != CPP_EOF)
1568     {
1569       cpp_output_token (token, fp);
1570       token = cpp_get_token (pfile);
1571       if (token->flags & PREV_WHITE)
1572 	putc (' ', fp);
1573     }
1574 
1575   putc ('\n', fp);
1576 }
1577 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the distance from BUFF's
   cur to its limit.  Every macro argument is parenthesised in the
   expansion so that an expression argument expands as one operand.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1592 
1593 /* Create a new allocation buffer.  Place the control block at the end
1594    of the buffer, so that buffer overflows will cause immediate chaos.  */
1595 static _cpp_buff *
new_buff(size_t len)1596 new_buff (size_t len)
1597 {
1598   _cpp_buff *result;
1599   unsigned char *base;
1600 
1601   if (len < MIN_BUFF_SIZE)
1602     len = MIN_BUFF_SIZE;
1603   len = CPP_ALIGN (len);
1604 
1605   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1606   result = (_cpp_buff *) (base + len);
1607   result->base = base;
1608   result->cur = base;
1609   result->limit = base + len;
1610   result->next = NULL;
1611   return result;
1612 }
1613 
1614 /* Place a chain of unwanted allocation buffers on the free list.  */
1615 void
_cpp_release_buff(cpp_reader * pfile,_cpp_buff * buff)1616 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1617 {
1618   _cpp_buff *end = buff;
1619 
1620   while (end->next)
1621     end = end->next;
1622   end->next = pfile->free_buffs;
1623   pfile->free_buffs = buff;
1624 }
1625 
1626 /* Return a free buffer of size at least MIN_SIZE.  */
1627 _cpp_buff *
_cpp_get_buff(cpp_reader * pfile,size_t min_size)1628 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1629 {
1630   _cpp_buff *result, **p;
1631 
1632   for (p = &pfile->free_buffs;; p = &(*p)->next)
1633     {
1634       size_t size;
1635 
1636       if (*p == NULL)
1637 	return new_buff (min_size);
1638       result = *p;
1639       size = result->limit - result->base;
1640       /* Return a buffer that's big enough, but don't waste one that's
1641          way too big.  */
1642       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1643 	break;
1644     }
1645 
1646   *p = result->next;
1647   result->next = NULL;
1648   result->cur = result->base;
1649   return result;
1650 }
1651 
1652 /* Creates a new buffer with enough space to hold the uncommitted
1653    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1654    the excess bytes to the new buffer.  Chains the new buffer after
1655    BUFF, and returns the new buffer.  */
1656 _cpp_buff *
_cpp_append_extend_buff(cpp_reader * pfile,_cpp_buff * buff,size_t min_extra)1657 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1658 {
1659   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1660   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1661 
1662   buff->next = new_buff;
1663   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1664   return new_buff;
1665 }
1666 
1667 /* Creates a new buffer with enough space to hold the uncommitted
1668    remaining bytes of the buffer pointed to by BUFF, and at least
1669    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1670    Chains the new buffer before the buffer pointed to by BUFF, and
1671    updates the pointer to point to the new buffer.  */
1672 void
_cpp_extend_buff(cpp_reader * pfile,_cpp_buff ** pbuff,size_t min_extra)1673 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1674 {
1675   _cpp_buff *new_buff, *old_buff = *pbuff;
1676   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1677 
1678   new_buff = _cpp_get_buff (pfile, size);
1679   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1680   new_buff->next = old_buff;
1681   *pbuff = new_buff;
1682 }
1683 
1684 /* Free a chain of buffers starting at BUFF.  */
1685 void
_cpp_free_buff(_cpp_buff * buff)1686 _cpp_free_buff (_cpp_buff *buff)
1687 {
1688   _cpp_buff *next;
1689 
1690   for (; buff; buff = next)
1691     {
1692       next = buff->next;
1693       free (buff->base);
1694     }
1695 }
1696 
1697 /* Allocate permanent, unaligned storage of length LEN.  */
1698 unsigned char *
_cpp_unaligned_alloc(cpp_reader * pfile,size_t len)1699 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1700 {
1701   _cpp_buff *buff = pfile->u_buff;
1702   unsigned char *result = buff->cur;
1703 
1704   if (len > (size_t) (buff->limit - result))
1705     {
1706       buff = _cpp_get_buff (pfile, len);
1707       buff->next = pfile->u_buff;
1708       pfile->u_buff = buff;
1709       result = buff->cur;
1710     }
1711 
1712   buff->cur = result + len;
1713   return result;
1714 }
1715 
1716 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1717    That buffer is used for growing allocations when saving macro
1718    replacement lists in a #define, and when parsing an answer to an
1719    assertion in #assert, #unassert or #if (and therefore possibly
1720    whilst expanding macros).  It therefore must not be used by any
1721    code that they might call: specifically the lexer and the guts of
1722    the macro expander.
1723 
1724    All existing other uses clearly fit this restriction: storing
1725    registered pragmas during initialization.  */
1726 unsigned char *
_cpp_aligned_alloc(cpp_reader * pfile,size_t len)1727 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1728 {
1729   _cpp_buff *buff = pfile->a_buff;
1730   unsigned char *result = buff->cur;
1731 
1732   if (len > (size_t) (buff->limit - result))
1733     {
1734       buff = _cpp_get_buff (pfile, len);
1735       buff->next = pfile->a_buff;
1736       pfile->a_buff = buff;
1737       result = buff->cur;
1738     }
1739 
1740   buff->cur = result + len;
1741   return result;
1742 }
1743 
1744 /* Say which field of TOK is in use.  */
1745 
1746 enum cpp_token_fld_kind
cpp_token_val_index(cpp_token * tok)1747 cpp_token_val_index (cpp_token *tok)
1748 {
1749   switch (TOKEN_SPELL (tok))
1750     {
1751     case SPELL_IDENT:
1752       return CPP_TOKEN_FLD_NODE;
1753     case SPELL_LITERAL:
1754       return CPP_TOKEN_FLD_STR;
1755     case SPELL_NONE:
1756       if (tok->type == CPP_MACRO_ARG)
1757 	return CPP_TOKEN_FLD_ARG_NO;
1758       else if (tok->type == CPP_PADDING)
1759 	return CPP_TOKEN_FLD_SOURCE;
1760       else if (tok->type == CPP_PRAGMA)
1761 	return CPP_TOKEN_FLD_STR;
1762       /* else fall through */
1763     default:
1764       return CPP_TOKEN_FLD_NONE;
1765     }
1766 }
1767 
1768 /* Emits error for unterminated strings.  */
1769 void
cpp_unterminated(cpp_reader * pfile,int term)1770 cpp_unterminated (cpp_reader *pfile, int term)
1771 {
1772   cpp_error (pfile, CPP_DL_ERROR, "missing terminating %c character", term);
1773 
1774   if (term == '\"' && pfile->mls_line && pfile->mls_line != pfile->line_table->highest_line)
1775     {
1776       cpp_error_with_line (pfile, CPP_DL_ERROR, pfile->mls_line, pfile->mls_col,
1777 			   "possible start of unterminated string literal");
1778       pfile->mls_line = 0;
1779     }
1780 }
1781 
1782