1 /* CPP Library - lexical analysis.
2    Copyright (C) 2000-2016 Free Software Foundation, Inc.
3    Contributed by Per Bothner, 1994-95.
4    Based on CCCP program by Paul Rubin, June 1986
5    Adapted to ANSI C, Richard Stallman, Jan 1987
6    Broken out to separate file, Zack Weinberg, Mar 2000
7 
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26 
27 enum spell_type
28 {
29   SPELL_OPERATOR = 0,
30   SPELL_IDENT,
31   SPELL_LITERAL,
32   SPELL_NONE
33 };
34 
35 struct token_spelling
36 {
37   enum spell_type category;
38   const unsigned char *name;
39 };
40 
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43 
44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
45 #define TK(e, s) { SPELL_ ## s,    UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49 
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 			    unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
64 
65 static _cpp_buff *new_buff (size_t);
66 
67 
68 /* Utility routine:
69 
70    Compares, the token TOKEN to the NUL-terminated string STRING.
71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
72 int
cpp_ideq(const cpp_token * token,const char * string)73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75   if (token->type != CPP_NAME)
76     return 0;
77 
78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80 
81 /* Record a note TYPE at byte POS into the current cleaned logical
82    line.  */
83 static void
add_line_note(cpp_buffer * buffer,const uchar * pos,unsigned int type)84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86   if (buffer->notes_used == buffer->notes_cap)
87     {
88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90                                   buffer->notes_cap);
91     }
92 
93   buffer->notes[buffer->notes_used].pos = pos;
94   buffer->notes[buffer->notes_used].type = type;
95   buffer->notes_used++;
96 }
97 
98 
99 /* Fast path to find line special characters using optimized character
100    scanning algorithms.  Anything complicated falls back to the slow
101    path below.  Since this loop is very hot it's worth doing these kinds
102    of optimizations.
103 
104    One of the paths through the ifdefs should provide
105 
106      const uchar *search_line_fast (const uchar *s, const uchar *end);
107 
108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
109    the found character.
110 
111    Note that the last character of the buffer is *always* a newline,
112    as forced by _cpp_convert_input.  This fact can be used to avoid
113    explicitly looking for the end of the buffer.  */
114 
/* Configure only provides WORDS_BIGENDIAN as an #ifdef-style test.
   Give it a value of 0 when it is absent so that it can be used in
   ordinary C expressions below.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  Nothing
   in <stdint.h> gives us that.  On most hosts it is unsigned long,
   but MS chose an LLP64 model.  When building with GCC we can ask
   for the "real" word size directly.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only copes with words of 4 or 8 bytes.  The array
   size is negative, and compilation fails, for any other size.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 8 || sizeof (word_type) == 4) ? 1 : -1];
134 
135 /* Return X with the first N bytes forced to values that won't match one
136    of the interesting characters.  Note that NUL is not interesting.  */
137 
138 static inline word_type
acc_char_mask_misalign(word_type val,unsigned int n)139 acc_char_mask_misalign (word_type val, unsigned int n)
140 {
141   word_type mask = -1;
142   if (WORDS_BIGENDIAN)
143     mask >>= n * 8;
144   else
145     mask <<= n * 8;
146   return val & mask;
147 }
148 
149 /* Return X replicated to all byte positions within WORD_TYPE.  */
150 
151 static inline word_type
acc_char_replicate(uchar x)152 acc_char_replicate (uchar x)
153 {
154   word_type ret;
155 
156   ret = (x << 24) | (x << 16) | (x << 8) | x;
157   if (sizeof(word_type) == 8)
158     ret = (ret << 16 << 16) | ret;
159   return ret;
160 }
161 
162 /* Return non-zero if some byte of VAL is (probably) C.  */
163 
164 static inline word_type
acc_char_cmp(word_type val,word_type c)165 acc_char_cmp (word_type val, word_type c)
166 {
167 #if defined(__GNUC__) && defined(__alpha__)
168   /* We can get exact results using a compare-bytes instruction.
169      Get (val == c) via (0 >= (val ^ c)).  */
170   return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172   word_type magic = 0x7efefefeU;
173   if (sizeof(word_type) == 8)
174     magic = (magic << 16 << 16) | 0xfefefefeU;
175   magic |= 1;
176 
177   val ^= c;
178   return ((val + magic) ^ ~val) & ~magic;
179 #endif
180 }
181 
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183    the found character.  If this was a false positive, return -1.  */
184 
185 static inline int
acc_char_index(word_type cmp ATTRIBUTE_UNUSED,word_type val ATTRIBUTE_UNUSED)186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 		word_type val ATTRIBUTE_UNUSED)
188 {
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190   /* The cmpbge instruction sets *bits* of the result corresponding to
191      matches in the bytes with no false positives.  */
192   return __builtin_ctzl (cmp);
193 #else
194   unsigned int i;
195 
196   /* ??? It would be nice to force unrolling here,
197      and have all of these constants folded.  */
198   for (i = 0; i < sizeof(word_type); ++i)
199     {
200       uchar c;
201       if (WORDS_BIGENDIAN)
202 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203       else
204 	c = (val >> i * 8) & 0xff;
205 
206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 	return i;
208     }
209 
210   return -1;
211 #endif
212 }
213 
214 /* A version of the fast scanner using bit fiddling techniques.
215 
216    For 32-bit words, one would normally perform 16 comparisons and
217    16 branches.  With this algorithm one performs 24 arithmetic
218    operations and one branch.  Whether this is faster with a 32-bit
219    word size is going to be somewhat system dependent.
220 
221    For 64-bit words, we eliminate twice the number of comparisons
222    and branches without increasing the number of arithmetic operations.
223    It's almost certainly going to be a win with 64-bit word size.  */
224 
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226   ATTRIBUTE_UNUSED;
227 
228 static const uchar *
search_line_acc_char(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230 {
231   const word_type repl_nl = acc_char_replicate ('\n');
232   const word_type repl_cr = acc_char_replicate ('\r');
233   const word_type repl_bs = acc_char_replicate ('\\');
234   const word_type repl_qm = acc_char_replicate ('?');
235 
236   unsigned int misalign;
237   const word_type *p;
238   word_type val, t;
239 
240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242   val = *p;
243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244   if (misalign)
245     val = acc_char_mask_misalign (val, misalign);
246 
247   /* Main loop.  */
248   while (1)
249     {
250       t  = acc_char_cmp (val, repl_nl);
251       t |= acc_char_cmp (val, repl_cr);
252       t |= acc_char_cmp (val, repl_bs);
253       t |= acc_char_cmp (val, repl_qm);
254 
255       if (__builtin_expect (t != 0, 0))
256 	{
257 	  int i = acc_char_index (t, val);
258 	  if (i >= 0)
259 	    return (const uchar *)p + i;
260 	}
261 
262       val = *++p;
263     }
264 }
265 
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267    autoconfed:
268 
269    The Solaris 10+ assembler tags objects with the instruction set
270    extensions used, so SSE4.2 executables cannot run on machines that
271    don't support that extension.  */
272 
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274 
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row order: \n, \r, \\, ?.  */
#define REPL16(C) \
  { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
289 
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
291 
292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
293    which was packaged into SSE1; it is also present in the AMD MMX
294    extension.  Mark the function as using "sse" so that we emit a real
295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
296 
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
search_line_mmx(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
302 {
303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
305 
306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
310 
311   unsigned int misalign, found, mask;
312   const v8qi *p;
313   v8qi data, t, c;
314 
315   /* Align the source pointer.  While MMX doesn't generate unaligned data
316      faults, this allows us to safely scan to the end of the buffer without
317      reading beyond the end of the last page.  */
318   misalign = (uintptr_t)s & 7;
319   p = (const v8qi *)((uintptr_t)s & -8);
320   data = *p;
321 
322   /* Create a mask for the bytes that are valid within the first
323      16-byte block.  The Idea here is that the AND with the mask
324      within the loop is "free", since we need some AND or TEST
325      insn in order to set the flags for the branch anyway.  */
326   mask = -1u << misalign;
327 
328   /* Main loop processing 8 bytes at a time.  */
329   goto start;
330   do
331     {
332       data = *++p;
333       mask = -1;
334 
335     start:
336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343       found = __builtin_ia32_pmovmskb (t);
344       found &= mask;
345     }
346   while (!found);
347 
348   __builtin_ia32_emms ();
349 
350   /* FOUND contains 1 in bits for which we matched a relevant
351      character.  Conversion to the byte index is trivial.  */
352   found = __builtin_ctz(found);
353   return (const uchar *)p + found;
354 }
355 
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
357 
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
search_line_sse2(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
363 {
364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
365 
366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
370 
371   unsigned int misalign, found, mask;
372   const v16qi *p;
373   v16qi data, t;
374 
375   /* Align the source pointer.  */
376   misalign = (uintptr_t)s & 15;
377   p = (const v16qi *)((uintptr_t)s & -16);
378   data = *p;
379 
380   /* Create a mask for the bytes that are valid within the first
381      16-byte block.  The Idea here is that the AND with the mask
382      within the loop is "free", since we need some AND or TEST
383      insn in order to set the flags for the branch anyway.  */
384   mask = -1u << misalign;
385 
386   /* Main loop processing 16 bytes at a time.  */
387   goto start;
388   do
389     {
390       data = *++p;
391       mask = -1;
392 
393     start:
394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
398       found = __builtin_ia32_pmovmskb128 (t);
399       found &= mask;
400     }
401   while (!found);
402 
403   /* FOUND contains 1 in bits for which we matched a relevant
404      character.  Conversion to the byte index is trivial.  */
405   found = __builtin_ctz(found);
406   return (const uchar *)p + found;
407 }
408 
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S is
   safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t)s;
  uintptr_t index;

  /* Deal with an unaligned start separately.  */
  if (addr & 15)
    {
      v16qi head;

      /* There are less than 16 bytes left in the buffer, and less
         than 16 bytes left on the page.  Reading 16 bytes at this
         point might generate a spurious page fault.  Defer to the
         SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
        return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      if (__builtin_expect (index < 16, 0))
        return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;;)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The asm loop adds 16 before every compare, so start one block
     early.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0:	add $16, %1\n"
        "	%vpcmpestri\t$0, (%1), %2\n"
        "	jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
489 
490 /* Check the CPU capabilities.  */
491 
492 #include "../gcc/config/i386/cpuid.h"
493 
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
496 
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
init_vectorized_lexer(void)499 init_vectorized_lexer (void)
500 {
501   unsigned dummy, ecx = 0, edx = 0;
502   search_line_fast_type impl = search_line_acc_char;
503   int minimum = 0;
504 
505 #if defined(__SSE4_2__)
506   minimum = 3;
507 #elif defined(__SSE2__)
508   minimum = 2;
509 #elif defined(__SSE__)
510   minimum = 1;
511 #endif
512 
513   if (minimum == 3)
514     impl = search_line_sse42;
515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
516     {
517       if (minimum == 3 || (ecx & bit_SSE4_2))
518         impl = search_line_sse42;
519       else if (minimum == 2 || (edx & bit_SSE2))
520 	impl = search_line_sse2;
521       else if (minimum == 1 || (edx & bit_SSE))
522 	impl = search_line_mmx;
523     }
524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
525     {
526       if (minimum == 1
527 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528 	impl = search_line_mmx;
529     }
530 
531   search_line_fast = impl;
532 }
533 
534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
535 
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537    and VSX unaligned loads (when VSX is available).  This is otherwise
538    the same as the pre-GCC 5 version.  */
539 
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
543 {
544   typedef __attribute__((altivec(vector))) unsigned char vc;
545 
546   const vc repl_nl = {
547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
549   };
550   const vc repl_cr = {
551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
553   };
554   const vc repl_bs = {
555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
557   };
558   const vc repl_qm = {
559     '?', '?', '?', '?', '?', '?', '?', '?',
560     '?', '?', '?', '?', '?', '?', '?', '?',
561   };
562   const vc zero = { 0 };
563 
564   vc data, t;
565 
566   /* Main loop processing 16 bytes at a time.  */
567   do
568     {
569       vc m_nl, m_cr, m_bs, m_qm;
570 
571       data = *((const vc *)s);
572       s += 16;
573 
574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578       t = (m_nl | m_cr) | (m_bs | m_qm);
579 
580       /* T now contains 0xff in bytes for which we matched one of the relevant
581 	 characters.  We want to exit the loop if any byte in T is non-zero.
582 	 Below is the expansion of vec_any_ne(t, zero).  */
583     }
584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
585 
586   /* Restore s to to point to the 16 bytes we just processed.  */
587   s -= 16;
588 
589   {
590 #define N  (sizeof(vc) / sizeof(long))
591 
592     union {
593       vc v;
594       /* Statically assert that N is 2 or 4.  */
595       unsigned long l[(N == 2 || N == 4) ? N : -1];
596     } u;
597     unsigned long l, i = 0;
598 
599     u.v = t;
600 
601     /* Find the first word of T that is non-zero.  */
602     switch (N)
603       {
604       case 4:
605 	l = u.l[i++];
606 	if (l != 0)
607 	  break;
608 	s += sizeof(unsigned long);
609 	l = u.l[i++];
610 	if (l != 0)
611 	  break;
612 	s += sizeof(unsigned long);
613       case 2:
614 	l = u.l[i++];
615 	if (l != 0)
616 	  break;
617 	s += sizeof(unsigned long);
618 	l = u.l[i];
619       }
620 
621     /* L now contains 0xff in bytes for which we matched one of the
622        relevant characters.  We can find the byte index by finding
623        its bit index and dividing by 8.  */
624 #ifdef __BIG_ENDIAN__
625     l = __builtin_clzl(l) >> 3;
626 #else
627     l = __builtin_ctzl(l) >> 3;
628 #endif
629     return s + l;
630 
631 #undef N
632   }
633 }
634 
635 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
636 
637 /* A vection of the fast scanner using AltiVec vectorized byte compares.
638    This cannot be used for little endian because vec_lvsl/lvsr are
639    deprecated for little endian and the code won't work properly.  */
640 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
641    so we can't compile this function without -maltivec on the command line
642    (or implied by some other switch).  */
643 
644 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)645 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
646 {
647   typedef __attribute__((altivec(vector))) unsigned char vc;
648 
649   const vc repl_nl = {
650     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
652   };
653   const vc repl_cr = {
654     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
656   };
657   const vc repl_bs = {
658     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
660   };
661   const vc repl_qm = {
662     '?', '?', '?', '?', '?', '?', '?', '?',
663     '?', '?', '?', '?', '?', '?', '?', '?',
664   };
665   const vc ones = {
666     -1, -1, -1, -1, -1, -1, -1, -1,
667     -1, -1, -1, -1, -1, -1, -1, -1,
668   };
669   const vc zero = { 0 };
670 
671   vc data, mask, t;
672 
673   /* Altivec loads automatically mask addresses with -16.  This lets us
674      issue the first load as early as possible.  */
675   data = __builtin_vec_ld(0, (const vc *)s);
676 
677   /* Discard bytes before the beginning of the buffer.  Do this by
678      beginning with all ones and shifting in zeros according to the
679      mis-alignment.  The LVSR instruction pulls the exact shift we
680      want from the address.  */
681   mask = __builtin_vec_lvsr(0, s);
682   mask = __builtin_vec_perm(zero, ones, mask);
683   data &= mask;
684 
685   /* While altivec loads mask addresses, we still need to align S so
686      that the offset we compute at the end is correct.  */
687   s = (const uchar *)((uintptr_t)s & -16);
688 
689   /* Main loop processing 16 bytes at a time.  */
690   goto start;
691   do
692     {
693       vc m_nl, m_cr, m_bs, m_qm;
694 
695       s += 16;
696       data = __builtin_vec_ld(0, (const vc *)s);
697 
698     start:
699       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
700       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
701       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
702       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
703       t = (m_nl | m_cr) | (m_bs | m_qm);
704 
705       /* T now contains 0xff in bytes for which we matched one of the relevant
706 	 characters.  We want to exit the loop if any byte in T is non-zero.
707 	 Below is the expansion of vec_any_ne(t, zero).  */
708     }
709   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
710 
711   {
712 #define N  (sizeof(vc) / sizeof(long))
713 
714     union {
715       vc v;
716       /* Statically assert that N is 2 or 4.  */
717       unsigned long l[(N == 2 || N == 4) ? N : -1];
718     } u;
719     unsigned long l, i = 0;
720 
721     u.v = t;
722 
723     /* Find the first word of T that is non-zero.  */
724     switch (N)
725       {
726       case 4:
727 	l = u.l[i++];
728 	if (l != 0)
729 	  break;
730 	s += sizeof(unsigned long);
731 	l = u.l[i++];
732 	if (l != 0)
733 	  break;
734 	s += sizeof(unsigned long);
735       case 2:
736 	l = u.l[i++];
737 	if (l != 0)
738 	  break;
739 	s += sizeof(unsigned long);
740 	l = u.l[i];
741       }
742 
743     /* L now contains 0xff in bytes for which we matched one of the
744        relevant characters.  We can find the byte index by finding
745        its bit index and dividing by 8.  */
746     l = __builtin_clzl(l) >> 3;
747     return s + l;
748 
749 #undef N
750   }
751 }
752 
753 #elif defined (__ARM_NEON)
754 #include "arm_neon.h"
755 
756 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)757 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
758 {
759   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
760   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
761   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
762   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
763   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
764 
765   unsigned int misalign, found, mask;
766   const uint8_t *p;
767   uint8x16_t data;
768 
769   /* Align the source pointer.  */
770   misalign = (uintptr_t)s & 15;
771   p = (const uint8_t *)((uintptr_t)s & -16);
772   data = vld1q_u8 (p);
773 
774   /* Create a mask for the bytes that are valid within the first
775      16-byte block.  The Idea here is that the AND with the mask
776      within the loop is "free", since we need some AND or TEST
777      insn in order to set the flags for the branch anyway.  */
778   mask = (-1u << misalign) & 0xffff;
779 
780   /* Main loop, processing 16 bytes at a time.  */
781   goto start;
782 
783   do
784     {
785       uint8x8_t l;
786       uint16x4_t m;
787       uint32x2_t n;
788       uint8x16_t t, u, v, w;
789 
790       p += 16;
791       data = vld1q_u8 (p);
792       mask = 0xffff;
793 
794     start:
795       t = vceqq_u8 (data, repl_nl);
796       u = vceqq_u8 (data, repl_cr);
797       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
798       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
799       t = vandq_u8 (vorrq_u8 (v, w), xmask);
800       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
801       m = vpaddl_u8 (l);
802       n = vpaddl_u16 (m);
803 
804       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
805 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
806       found &= mask;
807     }
808   while (!found);
809 
810   /* FOUND contains 1 in bits for which we matched a relevant
811      character.  Conversion to the byte index is trivial.  */
812   found = __builtin_ctz (found);
813   return (const uchar *)p + found;
814 }
815 
816 #else
817 
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */
820 
821 #define search_line_fast  search_line_acc_char
822 
823 #endif
824 
/* One-time lexer initialization.  When this configuration provides
   init_vectorized_lexer, call it; otherwise there is nothing to
   do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
834 
835 /* Returns with a logical line that contains no escaped newlines or
836    trigraphs.  This is a time-critical inner loop.  */
837 void
_cpp_clean_line(cpp_reader * pfile)838 _cpp_clean_line (cpp_reader *pfile)
839 {
840   cpp_buffer *buffer;
841   const uchar *s;
842   uchar c, *d, *p;
843 
844   buffer = pfile->buffer;
845   buffer->cur_note = buffer->notes_used = 0;
846   buffer->cur = buffer->line_base = buffer->next_line;
847   buffer->need_line = false;
848   s = buffer->next_line;
849 
850   if (!buffer->from_stage3)
851     {
852       const uchar *pbackslash = NULL;
853 
854       /* Fast path.  This is the common case of an un-escaped line with
855 	 no trigraphs.  The primary win here is by not writing any
856 	 data back to memory until we have to.  */
857       while (1)
858 	{
859 	  /* Perform an optimized search for \n, \r, \\, ?.  */
860 	  s = search_line_fast (s, buffer->rlimit);
861 
862 	  c = *s;
863 	  if (c == '\\')
864 	    {
865 	      /* Record the location of the backslash and continue.  */
866 	      pbackslash = s++;
867 	    }
868 	  else if (__builtin_expect (c == '?', 0))
869 	    {
870 	      if (__builtin_expect (s[1] == '?', false)
871 		   && _cpp_trigraph_map[s[2]])
872 		{
873 		  /* Have a trigraph.  We may or may not have to convert
874 		     it.  Add a line note regardless, for -Wtrigraphs.  */
875 		  add_line_note (buffer, s, s[2]);
876 		  if (CPP_OPTION (pfile, trigraphs))
877 		    {
878 		      /* We do, and that means we have to switch to the
879 		         slow path.  */
880 		      d = (uchar *) s;
881 		      *d = _cpp_trigraph_map[s[2]];
882 		      s += 2;
883 		      goto slow_path;
884 		    }
885 		}
886 	      /* Not a trigraph.  Continue on fast-path.  */
887 	      s++;
888 	    }
889 	  else
890 	    break;
891 	}
892 
893       /* This must be \r or \n.  We're either done, or we'll be forced
894 	 to write back to the buffer and continue on the slow path.  */
895       d = (uchar *) s;
896 
897       if (__builtin_expect (s == buffer->rlimit, false))
898 	goto done;
899 
900       /* DOS line ending? */
901       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
902 	{
903 	  s++;
904 	  if (s == buffer->rlimit)
905 	    goto done;
906 	}
907 
908       if (__builtin_expect (pbackslash == NULL, true))
909 	goto done;
910 
911       /* Check for escaped newline.  */
912       p = d;
913       while (is_nvspace (p[-1]))
914 	p--;
915       if (p - 1 != pbackslash)
916 	goto done;
917 
918       /* Have an escaped newline; process it and proceed to
919 	 the slow path.  */
920       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
921       d = p - 2;
922       buffer->next_line = p - 1;
923 
924     slow_path:
925       while (1)
926 	{
927 	  c = *++s;
928 	  *++d = c;
929 
930 	  if (c == '\n' || c == '\r')
931 	    {
932 	      /* Handle DOS line endings.  */
933 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
934 		s++;
935 	      if (s == buffer->rlimit)
936 		break;
937 
938 	      /* Escaped?  */
939 	      p = d;
940 	      while (p != buffer->next_line && is_nvspace (p[-1]))
941 		p--;
942 	      if (p == buffer->next_line || p[-1] != '\\')
943 		break;
944 
945 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
946 	      d = p - 2;
947 	      buffer->next_line = p - 1;
948 	    }
949 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
950 	    {
951 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
952 	      add_line_note (buffer, d, s[2]);
953 	      if (CPP_OPTION (pfile, trigraphs))
954 		{
955 		  *d = _cpp_trigraph_map[s[2]];
956 		  s += 2;
957 		}
958 	    }
959 	}
960     }
961   else
962     {
963       while (*s != '\n' && *s != '\r')
964 	s++;
965       d = (uchar *) s;
966 
967       /* Handle DOS line endings.  */
968       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
969 	s++;
970     }
971 
972  done:
973   *d = '\n';
974   /* A sentinel note that should never be processed.  */
975   add_line_note (buffer, d + 1, '\n');
976   buffer->next_line = s + 1;
977 }
978 
979 /* Return true if the trigraph indicated by NOTE should be warned
980    about in a comment.  */
981 static bool
warn_in_comment(cpp_reader * pfile,_cpp_line_note * note)982 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
983 {
984   const uchar *p;
985 
986   /* Within comments we don't warn about trigraphs, unless the
987      trigraph forms an escaped newline, as that may change
988      behavior.  */
989   if (note->type != '/')
990     return false;
991 
992   /* If -trigraphs, then this was an escaped newline iff the next note
993      is coincident.  */
994   if (CPP_OPTION (pfile, trigraphs))
995     return note[1].pos == note->pos;
996 
997   /* Otherwise, see if this forms an escaped newline.  */
998   p = note->pos + 3;
999   while (is_nvspace (*p))
1000     p++;
1001 
1002   /* There might have been escaped newlines between the trigraph and the
1003      newline we found.  Hence the position test.  */
1004   return (*p == '\n' && p < note[1].pos);
1005 }
1006 
1007 /* Process the notes created by add_line_note as far as the current
1008    location.  */
1009 void
_cpp_process_line_notes(cpp_reader * pfile,int in_comment)1010 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1011 {
1012   cpp_buffer *buffer = pfile->buffer;
1013 
1014   for (;;)
1015     {
1016       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1017       unsigned int col;
1018 
1019       if (note->pos > buffer->cur)
1020 	break;
1021 
1022       buffer->cur_note++;
1023       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1024 
1025       if (note->type == '\\' || note->type == ' ')
1026 	{
1027 	  if (note->type == ' ' && !in_comment)
1028 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1029 				 "backslash and newline separated by space");
1030 
1031 	  if (buffer->next_line > buffer->rlimit)
1032 	    {
1033 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1034 				   "backslash-newline at end of file");
1035 	      /* Prevent "no newline at end of file" warning.  */
1036 	      buffer->next_line = buffer->rlimit;
1037 	    }
1038 
1039 	  buffer->line_base = note->pos;
1040 	  CPP_INCREMENT_LINE (pfile, 0);
1041 	}
1042       else if (_cpp_trigraph_map[note->type])
1043 	{
1044 	  if (CPP_OPTION (pfile, warn_trigraphs)
1045 	      && (!in_comment || warn_in_comment (pfile, note)))
1046 	    {
1047 	      if (CPP_OPTION (pfile, trigraphs))
1048 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1049                                        pfile->line_table->highest_line, col,
1050 				       "trigraph ??%c converted to %c",
1051 				       note->type,
1052 				       (int) _cpp_trigraph_map[note->type]);
1053 	      else
1054 		{
1055 		  cpp_warning_with_line
1056 		    (pfile, CPP_W_TRIGRAPHS,
1057                      pfile->line_table->highest_line, col,
1058 		     "trigraph ??%c ignored, use -trigraphs to enable",
1059 		     note->type);
1060 		}
1061 	    }
1062 	}
1063       else if (note->type == 0)
1064 	/* Already processed in lex_raw_string.  */;
1065       else
1066 	abort ();
1067     }
1068 }
1069 
1070 /* Skip a C-style block comment.  We find the end of the comment by
1071    seeing if an asterisk is before every '/' we encounter.  Returns
1072    nonzero if comment terminated by EOF, zero otherwise.
1073 
1074    Buffer->cur points to the initial asterisk of the comment.  */
1075 bool
_cpp_skip_block_comment(cpp_reader * pfile)1076 _cpp_skip_block_comment (cpp_reader *pfile)
1077 {
1078   cpp_buffer *buffer = pfile->buffer;
1079   const uchar *cur = buffer->cur;
1080   uchar c;
1081 
1082   cur++;
1083   if (*cur == '/')
1084     cur++;
1085 
1086   for (;;)
1087     {
1088       /* People like decorating comments with '*', so check for '/'
1089 	 instead for efficiency.  */
1090       c = *cur++;
1091 
1092       if (c == '/')
1093 	{
1094 	  if (cur[-2] == '*')
1095 	    break;
1096 
1097 	  /* Warn about potential nested comments, but not if the '/'
1098 	     comes immediately before the true comment delimiter.
1099 	     Don't bother to get it right across escaped newlines.  */
1100 	  if (CPP_OPTION (pfile, warn_comments)
1101 	      && cur[0] == '*' && cur[1] != '/')
1102 	    {
1103 	      buffer->cur = cur;
1104 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1105 				     pfile->line_table->highest_line,
1106 				     CPP_BUF_COL (buffer),
1107 				     "\"/*\" within comment");
1108 	    }
1109 	}
1110       else if (c == '\n')
1111 	{
1112 	  unsigned int cols;
1113 	  buffer->cur = cur - 1;
1114 	  _cpp_process_line_notes (pfile, true);
1115 	  if (buffer->next_line >= buffer->rlimit)
1116 	    return true;
1117 	  _cpp_clean_line (pfile);
1118 
1119 	  cols = buffer->next_line - buffer->line_base;
1120 	  CPP_INCREMENT_LINE (pfile, cols);
1121 
1122 	  cur = buffer->cur;
1123 	}
1124     }
1125 
1126   buffer->cur = cur;
1127   _cpp_process_line_notes (pfile, true);
1128   return false;
1129 }
1130 
1131 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1132    terminating newline.  Handles escaped newlines.  Returns nonzero
1133    if a multiline comment.  */
1134 static int
skip_line_comment(cpp_reader * pfile)1135 skip_line_comment (cpp_reader *pfile)
1136 {
1137   cpp_buffer *buffer = pfile->buffer;
1138   source_location orig_line = pfile->line_table->highest_line;
1139 
1140   while (*buffer->cur != '\n')
1141     buffer->cur++;
1142 
1143   _cpp_process_line_notes (pfile, true);
1144   return orig_line != pfile->line_table->highest_line;
1145 }
1146 
1147 /* Skips whitespace, saving the next non-whitespace character.  */
1148 static void
skip_whitespace(cpp_reader * pfile,cppchar_t c)1149 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1150 {
1151   cpp_buffer *buffer = pfile->buffer;
1152   bool saw_NUL = false;
1153 
1154   do
1155     {
1156       /* Horizontal space always OK.  */
1157       if (c == ' ' || c == '\t')
1158 	;
1159       /* Just \f \v or \0 left.  */
1160       else if (c == '\0')
1161 	saw_NUL = true;
1162       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1163 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1164 			     CPP_BUF_COL (buffer),
1165 			     "%s in preprocessing directive",
1166 			     c == '\f' ? "form feed" : "vertical tab");
1167 
1168       c = *buffer->cur++;
1169     }
1170   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1171   while (is_nvspace (c));
1172 
1173   if (saw_NUL)
1174     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1175 
1176   buffer->cur--;
1177 }
1178 
1179 /* See if the characters of a number token are valid in a name (no
1180    '.', '+' or '-').  */
1181 static int
name_p(cpp_reader * pfile,const cpp_string * string)1182 name_p (cpp_reader *pfile, const cpp_string *string)
1183 {
1184   unsigned int i;
1185 
1186   for (i = 0; i < string->len; i++)
1187     if (!is_idchar (string->text[i]))
1188       return 0;
1189 
1190   return 1;
1191 }
1192 
1193 /* After parsing an identifier or other sequence, produce a warning about
1194    sequences not in NFC/NFKC.  */
1195 static void
warn_about_normalization(cpp_reader * pfile,const cpp_token * token,const struct normalize_state * s)1196 warn_about_normalization (cpp_reader *pfile,
1197 			  const cpp_token *token,
1198 			  const struct normalize_state *s)
1199 {
1200   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1201       && !pfile->state.skipping)
1202     {
1203       /* Make sure that the token is printed using UCNs, even
1204 	 if we'd otherwise happily print UTF-8.  */
1205       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1206       size_t sz;
1207 
1208       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1209       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1210 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1211 			       "`%.*s' is not in NFKC", (int) sz, buf);
1212       else
1213 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1214 			       "`%.*s' is not in NFC", (int) sz, buf);
1215       free (buf);
1216     }
1217 }
1218 
1219 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1220    an identifier.  FIRST is TRUE if this starts an identifier.  */
1221 static bool
forms_identifier_p(cpp_reader * pfile,int first,struct normalize_state * state)1222 forms_identifier_p (cpp_reader *pfile, int first,
1223 		    struct normalize_state *state)
1224 {
1225   cpp_buffer *buffer = pfile->buffer;
1226 
1227   if (*buffer->cur == '$')
1228     {
1229       if (!CPP_OPTION (pfile, dollars_in_ident))
1230 	return false;
1231 
1232       buffer->cur++;
1233       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1234 	{
1235 	  CPP_OPTION (pfile, warn_dollars) = 0;
1236 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1237 	}
1238 
1239       return true;
1240     }
1241 
1242   /* Is this a syntactically valid UCN?  */
1243   if (CPP_OPTION (pfile, extended_identifiers)
1244       && *buffer->cur == '\\'
1245       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1246     {
1247       cppchar_t s;
1248       buffer->cur += 2;
1249       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1250 			  state, &s))
1251 	return true;
1252       buffer->cur -= 2;
1253     }
1254 
1255   return false;
1256 }
1257 
1258 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1259 static cpp_hashnode *
lex_identifier_intern(cpp_reader * pfile,const uchar * base)1260 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1261 {
1262   cpp_hashnode *result;
1263   const uchar *cur;
1264   unsigned int len;
1265   unsigned int hash = HT_HASHSTEP (0, *base);
1266 
1267   cur = base + 1;
1268   while (ISIDNUM (*cur))
1269     {
1270       hash = HT_HASHSTEP (hash, *cur);
1271       cur++;
1272     }
1273   len = cur - base;
1274   hash = HT_HASHFINISH (hash, len);
1275   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1276 					      base, len, hash, HT_ALLOC));
1277 
1278   /* Rarely, identifiers require diagnostics when lexed.  */
1279   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1280 			&& !pfile->state.skipping, 0))
1281     {
1282       /* It is allowed to poison the same identifier twice.  */
1283       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1284 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1285 		   NODE_NAME (result));
1286 
1287       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1288 	 replacement list of a variadic macro.  */
1289       if (result == pfile->spec_nodes.n__VA_ARGS__
1290 	  && !pfile->state.va_args_ok)
1291 	{
1292 	  if (CPP_OPTION (pfile, cplusplus))
1293 	    cpp_error (pfile, CPP_DL_PEDWARN,
1294 		       "__VA_ARGS__ can only appear in the expansion"
1295 		       " of a C++11 variadic macro");
1296 	  else
1297 	    cpp_error (pfile, CPP_DL_PEDWARN,
1298 		       "__VA_ARGS__ can only appear in the expansion"
1299 		       " of a C99 variadic macro");
1300 	}
1301 
1302       /* For -Wc++-compat, warn about use of C++ named operators.  */
1303       if (result->flags & NODE_WARN_OPERATOR)
1304 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1305 		     "identifier \"%s\" is a special operator name in C++",
1306 		     NODE_NAME (result));
1307     }
1308 
1309   return result;
1310 }
1311 
1312 /* Get the cpp_hashnode of an identifier specified by NAME in
1313    the current cpp_reader object.  If none is found, NULL is returned.  */
1314 cpp_hashnode *
_cpp_lex_identifier(cpp_reader * pfile,const char * name)1315 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1316 {
1317   cpp_hashnode *result;
1318   result = lex_identifier_intern (pfile, (uchar *) name);
1319   return result;
1320 }
1321 
1322 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1323 static cpp_hashnode *
lex_identifier(cpp_reader * pfile,const uchar * base,bool starts_ucn,struct normalize_state * nst,cpp_hashnode ** spelling)1324 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1325 		struct normalize_state *nst, cpp_hashnode **spelling)
1326 {
1327   cpp_hashnode *result;
1328   const uchar *cur;
1329   unsigned int len;
1330   unsigned int hash = HT_HASHSTEP (0, *base);
1331 
1332   cur = pfile->buffer->cur;
1333   if (! starts_ucn)
1334     {
1335       while (ISIDNUM (*cur))
1336 	{
1337 	  hash = HT_HASHSTEP (hash, *cur);
1338 	  cur++;
1339 	}
1340       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1341     }
1342   pfile->buffer->cur = cur;
1343   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1344     {
1345       /* Slower version for identifiers containing UCNs (or $).  */
1346       do {
1347 	while (ISIDNUM (*pfile->buffer->cur))
1348 	  {
1349 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1350 	    pfile->buffer->cur++;
1351 	  }
1352       } while (forms_identifier_p (pfile, false, nst));
1353       result = _cpp_interpret_identifier (pfile, base,
1354 					  pfile->buffer->cur - base);
1355       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1356     }
1357   else
1358     {
1359       len = cur - base;
1360       hash = HT_HASHFINISH (hash, len);
1361 
1362       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1363 						  base, len, hash, HT_ALLOC));
1364       *spelling = result;
1365     }
1366 
1367   /* Rarely, identifiers require diagnostics when lexed.  */
1368   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1369 			&& !pfile->state.skipping, 0))
1370     {
1371       /* It is allowed to poison the same identifier twice.  */
1372       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1373 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1374 		   NODE_NAME (result));
1375 
1376       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1377 	 replacement list of a variadic macro.  */
1378       if (result == pfile->spec_nodes.n__VA_ARGS__
1379 	  && !pfile->state.va_args_ok)
1380 	{
1381 	  if (CPP_OPTION (pfile, cplusplus))
1382 	    cpp_error (pfile, CPP_DL_PEDWARN,
1383 		       "__VA_ARGS__ can only appear in the expansion"
1384 		       " of a C++11 variadic macro");
1385 	  else
1386 	    cpp_error (pfile, CPP_DL_PEDWARN,
1387 		       "__VA_ARGS__ can only appear in the expansion"
1388 		       " of a C99 variadic macro");
1389 	}
1390 
1391       /* For -Wc++-compat, warn about use of C++ named operators.  */
1392       if (result->flags & NODE_WARN_OPERATOR)
1393 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1394 		     "identifier \"%s\" is a special operator name in C++",
1395 		     NODE_NAME (result));
1396     }
1397 
1398   return result;
1399 }
1400 
1401 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1402 static void
lex_number(cpp_reader * pfile,cpp_string * number,struct normalize_state * nst)1403 lex_number (cpp_reader *pfile, cpp_string *number,
1404 	    struct normalize_state *nst)
1405 {
1406   const uchar *cur;
1407   const uchar *base;
1408   uchar *dest;
1409 
1410   base = pfile->buffer->cur - 1;
1411   do
1412     {
1413       cur = pfile->buffer->cur;
1414 
1415       /* N.B. ISIDNUM does not include $.  */
1416       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1417 	     || VALID_SIGN (*cur, cur[-1]))
1418 	{
1419 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1420 	  cur++;
1421 	}
1422       /* A number can't end with a digit separator.  */
1423       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1424 	--cur;
1425 
1426       pfile->buffer->cur = cur;
1427     }
1428   while (forms_identifier_p (pfile, false, nst));
1429 
1430   number->len = cur - base;
1431   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1432   memcpy (dest, base, number->len);
1433   dest[number->len] = '\0';
1434   number->text = dest;
1435 }
1436 
1437 /* Create a token of type TYPE with a literal spelling.  */
1438 static void
create_literal(cpp_reader * pfile,cpp_token * token,const uchar * base,unsigned int len,enum cpp_ttype type)1439 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1440 		unsigned int len, enum cpp_ttype type)
1441 {
1442   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1443 
1444   memcpy (dest, base, len);
1445   dest[len] = '\0';
1446   token->type = type;
1447   token->val.str.len = len;
1448   token->val.str.text = dest;
1449 }
1450 
1451 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1452    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1453 
1454 static void
bufring_append(cpp_reader * pfile,const uchar * base,size_t len,_cpp_buff ** first_buff_p,_cpp_buff ** last_buff_p)1455 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1456 		_cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1457 {
1458   _cpp_buff *first_buff = *first_buff_p;
1459   _cpp_buff *last_buff = *last_buff_p;
1460 
1461   if (first_buff == NULL)
1462     first_buff = last_buff = _cpp_get_buff (pfile, len);
1463   else if (len > BUFF_ROOM (last_buff))
1464     {
1465       size_t room = BUFF_ROOM (last_buff);
1466       memcpy (BUFF_FRONT (last_buff), base, room);
1467       BUFF_FRONT (last_buff) += room;
1468       base += room;
1469       len -= room;
1470       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1471     }
1472 
1473   memcpy (BUFF_FRONT (last_buff), base, len);
1474   BUFF_FRONT (last_buff) += len;
1475 
1476   *first_buff_p = first_buff;
1477   *last_buff_p = last_buff;
1478 }
1479 
1480 
1481 /* Returns true if a macro has been defined.
1482    This might not work if compile with -save-temps,
1483    or preprocess separately from compilation.  */
1484 
1485 static bool
is_macro(cpp_reader * pfile,const uchar * base)1486 is_macro(cpp_reader *pfile, const uchar *base)
1487 {
1488   const uchar *cur = base;
1489   if (! ISIDST (*cur))
1490     return false;
1491   unsigned int hash = HT_HASHSTEP (0, *cur);
1492   ++cur;
1493   while (ISIDNUM (*cur))
1494     {
1495       hash = HT_HASHSTEP (hash, *cur);
1496       ++cur;
1497     }
1498   hash = HT_HASHFINISH (hash, cur - base);
1499 
1500   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1501 					base, cur - base, hash, HT_NO_INSERT));
1502 
1503   return !result ? false : (result->type == NT_MACRO);
1504 }
1505 
1506 
/* Lexes a raw string.  As called from lex_string, BASE points at the
   first character of the literal's spelling (a leading 'L', 'u', 'U'
   or 'u8' if present, else the 'R') and CUR at the character just
   after the 'R'.  The stored string contains the spelling, including
   double quotes, delimiter string, '(' and ')', any leading
   'L', 'u', 'U' or 'u8' and 'R' modifier.

   Nothing is returned; the result is written to TOKEN:
     - normally its type is the type of the literal, possibly turned
       into a user-defined literal type when the user_literals option
       is set and an identifier suffix follows;
     - its type is CPP_OTHER if the string was not properly terminated
       inside a directive or macro arguments at end of input, or if
       the delimiter is malformed (in the latter case buffer->cur is
       reset so that only the text before the opening quote is
       consumed);
     - its type is CPP_EOF if no fresh line could be fetched while the
       string was still open.

   Trigraph conversions and line splices recorded as line notes are
   undone between the quotes, so the stored spelling shows the source
   as written.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.  */

static void
lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
		const uchar *cur)
{
  /* The delimiter (at most 16 characters) followed by the '"' that
     must come after the closing ')'.  */
  uchar raw_prefix[17];
  /* Characters seen during the prefix and suffix phases.  Sized so
     that a two-byte BUF_APPEND copy starting at index 16 still fits.  */
  uchar temp_buffer[18];
  const uchar *orig_base;
  unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
  enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
  raw_str_phase phase = RAW_STR_PREFIX;
  enum cpp_ttype type;
  /* Number of characters handed to bufring_append so far.  */
  size_t total_len = 0;
  /* Index into temp_buffer during phases other than RAW_STR,
     during RAW_STR phase 17 to tell BUF_APPEND that nothing should
     be appended to temp_buffer.  */
  size_t temp_buffer_len = 0;
  _cpp_buff *first_buff = NULL, *last_buff = NULL;
  size_t raw_prefix_start;
  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];

  /* The string type follows from the encoding prefix at BASE.  */
  type = (*base == 'L' ? CPP_WSTRING :
	  *base == 'U' ? CPP_STRING32 :
	  *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
	  : CPP_STRING);

  /* Append LEN characters at STR to the buffer sequence and count them
     in total_len.  Short pieces (at most two characters) that are not
     taken directly from BASE are also mirrored into temp_buffer while
     temp_buffer_len is below 17, so that restored characters take part
     in delimiter matching.
     NOTE(review): the definition ends in "while (0);", so every use
     followed by ';' expands to an additional empty statement, and the
     macro is not #undef'd within this function.  */
#define BUF_APPEND(STR,LEN)					\
      do {							\
	bufring_append (pfile, (const uchar *)(STR), (LEN),	\
			&first_buff, &last_buff);		\
	total_len += (LEN);					\
	if (__builtin_expect (temp_buffer_len < 17, 0)		\
	    && (const uchar *)(STR) != base			\
	    && (LEN) <= 2)					\
	  {							\
	    memcpy (temp_buffer + temp_buffer_len,		\
		    (const uchar *)(STR), (LEN));		\
	    temp_buffer_len += (LEN);				\
	  }							\
      } while (0);

  orig_base = base;
  /* Step over the character at CUR, presumably the opening double
     quote (TODO confirm against callers); the delimiter starts right
     after it.  */
  ++cur;
  raw_prefix_start = cur - base;
  for (;;)
    {
      cppchar_t c;

      /* If we previously performed any trigraph or line splicing
	 transformations, undo them in between the opening and closing
	 double quote.  */
      while (note->pos < cur)
	++note;
      for (; note->pos == cur; ++note)
	{
	  switch (note->type)
	    {
	    case '\\':
	    case ' ':
	      /* Restore backslash followed by newline.  */
	      BUF_APPEND (base, cur - base);
	      base = cur;
	      BUF_APPEND ("\\", 1);
	    after_backslash:
	      if (note->type == ' ')
		{
		  /* GNU backslash whitespace newline extension.  FIXME
		     could be any sequence of non-vertical space.  When we
		     can properly restore any such sequence, we should mark
		     this note as handled so _cpp_process_line_notes
		     doesn't warn.  */
		  BUF_APPEND (" ", 1);
		}

	      BUF_APPEND ("\n", 1);
	      break;

	    case 0:
	      /* Already handled.  */
	      break;

	    default:
	      if (_cpp_trigraph_map[note->type])
		{
		  /* Don't warn about this trigraph in
		     _cpp_process_line_notes, since trigraphs show up as
		     trigraphs in raw strings.
		     NOTE(review): this local TYPE (the trigraph's third
		     character) shadows the function-level TYPE.  */
		  uchar type = note->type;
		  note->type = 0;

		  if (!CPP_OPTION (pfile, trigraphs))
		    /* If we didn't convert the trigraph in the first
		       place, don't do anything now either.  */
		    break;

		  BUF_APPEND (base, cur - base);
		  base = cur;
		  BUF_APPEND ("??", 2);

		  /* ??/ followed by newline gets two line notes, one for
		     the trigraph and one for the backslash/newline.  */
		  if (type == '/' && note[1].pos == cur)
		    {
		      if (note[1].type != '\\'
			  && note[1].type != ' ')
			abort ();
		      BUF_APPEND ("/", 1);
		      ++note;
		      goto after_backslash;
		    }
		  else
		    {
		      /* Skip the replacement character.  */
		      base = ++cur;
		      BUF_APPEND (&type, 1);
		      c = type;
		      goto check_c;
		    }
		}
	      else
		abort ();
	      break;
	    }
	}
      c = *cur++;
      if (__builtin_expect (temp_buffer_len < 17, 0))
	temp_buffer[temp_buffer_len++] = c;

     check_c:
      if (phase == RAW_STR_PREFIX)
	{
	  /* Move pending characters from temp_buffer into raw_prefix
	     until the '(' that ends the delimiter, or an error.  */
	  while (raw_prefix_len < temp_buffer_len)
	    {
	      raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
	      switch (raw_prefix[raw_prefix_len])
		{
		case ' ': case '(': case ')': case '\\': case '\t':
		case '\v': case '\f': case '\n': default:
		  break;
		/* Basic source charset except the above chars.  */
		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
		case 'y': case 'z':
		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
		case 'Y': case 'Z':
		case '0': case '1': case '2': case '3': case '4': case '5':
		case '6': case '7': case '8': case '9':
		case '_': case '{': case '}': case '#': case '[': case ']':
		case '<': case '>': case '%': case ':': case ';': case '.':
		case '?': case '*': case '+': case '-': case '/': case '^':
		case '&': case '|': case '~': case '!': case '=': case ',':
		case '"': case '\'':
		  /* A valid delimiter character; accept it unless the
		     delimiter already has 16 characters.  */
		  if (raw_prefix_len < 16)
		    {
		      raw_prefix_len++;
		      continue;
		    }
		  break;
		}

	      /* Reached for '(', for an invalid character, or for a
		 seventeenth delimiter character.  */
	      if (raw_prefix[raw_prefix_len] != '(')
		{
		  int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
		  if (raw_prefix_len == 16)
		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
					 col, "raw string delimiter longer "
					      "than 16 characters");
		  else if (raw_prefix[raw_prefix_len] == '\n')
		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
					 col, "invalid new-line in raw "
					      "string delimiter");
		  else
		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
					 col, "invalid character '%c' in "
					      "raw string delimiter",
					 (int) raw_prefix[raw_prefix_len]);
		  /* Give up: the token becomes a CPP_OTHER holding only
		     the text before the opening quote, and lexing
		     resumes at that quote.  */
		  pfile->buffer->cur = orig_base + raw_prefix_start - 1;
		  create_literal (pfile, token, orig_base,
				  raw_prefix_start - 1, CPP_OTHER);
		  if (first_buff)
		    _cpp_release_buff (pfile, first_buff);
		  return;
		}
	      /* Store '"' after the delimiter so that the suffix phase
		 can match the delimiter and the closing quote in one
		 comparison.  */
	      raw_prefix[raw_prefix_len] = '"';
	      phase = RAW_STR;
	      /* Nothing should be appended to temp_buffer during
		 RAW_STR phase.  */
	      temp_buffer_len = 17;
	      break;
	    }
	  continue;
	}
      else if (phase == RAW_STR_SUFFIX)
	{
	  /* Compare what followed the ')' with raw_prefix.  More than
	     raw_prefix_len matching characters means the closing '"'
	     matched too, which ends the string.  */
	  while (raw_suffix_len <= raw_prefix_len
		 && raw_suffix_len < temp_buffer_len
		 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
	    raw_suffix_len++;
	  if (raw_suffix_len > raw_prefix_len)
	    break;
	  /* Everything seen so far matches; more input is needed.  */
	  if (raw_suffix_len == temp_buffer_len)
	    continue;
	  /* Mismatch: this was ordinary string content after all.  */
	  phase = RAW_STR;
	  /* Nothing should be appended to temp_buffer during
	     RAW_STR phase.  */
	  temp_buffer_len = 17;
	}
      if (c == ')')
	{
	  /* A ')' may start the closing sequence; begin matching.  */
	  phase = RAW_STR_SUFFIX;
	  raw_suffix_len = 0;
	  temp_buffer_len = 0;
	}
      else if (c == '\n')
	{
	  if (pfile->state.in_directive
	      || (pfile->state.parsing_args
		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
	    {
	      cur--;
	      type = CPP_OTHER;
	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
				   "unterminated raw string");
	      break;
	    }

	  /* The string continues on the next line: save what has been
	     scanned on this one, then fetch a fresh line.  */
	  BUF_APPEND (base, cur - base);

	  if (pfile->buffer->cur < pfile->buffer->rlimit)
	    CPP_INCREMENT_LINE (pfile, 0);
	  pfile->buffer->need_line = true;

	  pfile->buffer->cur = cur-1;
	  _cpp_process_line_notes (pfile, false);
	  if (!_cpp_get_fresh_line (pfile))
	    {
	      source_location src_loc = token->src_loc;
	      token->type = CPP_EOF;
	      /* Tell the compiler the line number of the EOF token.  */
	      token->src_loc = pfile->line_table->highest_line;
	      token->flags = BOL;
	      if (first_buff != NULL)
		_cpp_release_buff (pfile, first_buff);
	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
				   "unterminated raw string");
	      return;
	    }

	  cur = base = pfile->buffer->cur;
	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
	}
    }

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
	 a string literal it could be parsed as a C++11 user-defined string
	 literal thus breaking the program.
	 Try to identify macros with is_macro. A warning is issued. */
      if (is_macro (pfile, cur))
	{
	  /* Raise a warning, but do not consume subsequent tokens.  */
	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
				   token->src_loc, 0,
				   "invalid suffix on literal; C++11 requires "
				   "a space between literal and string macro");
	}
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*cur))
	{
	  type = cpp_userdef_string_add_type (type);
	  ++cur;

	  while (ISIDNUM (*cur))
	    ++cur;
	}
    }

  pfile->buffer->cur = cur;
  if (first_buff == NULL)
    /* Nothing was buffered, so the spelling is contiguous in the
       input from BASE to CUR.  */
    create_literal (pfile, token, base, cur - base, type);
  else
    {
      /* Join the buffered pieces and the unbuffered tail from BASE to
	 CUR into one NUL-terminated allocation.  */
      uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);

      token->type = type;
      token->val.str.len = total_len + (cur - base);
      token->val.str.text = dest;
      last_buff = first_buff;
      while (last_buff != NULL)
	{
	  memcpy (dest, last_buff->base,
		  BUFF_FRONT (last_buff) - last_buff->base);
	  dest += BUFF_FRONT (last_buff) - last_buff->base;
	  last_buff = last_buff->next;
	}
      _cpp_release_buff (pfile, first_buff);
      memcpy (dest, base, cur - base);
      dest[cur - base] = '\0';
    }
}
1821 
1822 /* Lexes a string, character constant, or angle-bracketed header file
1823    name.  The stored string contains the spelling, including opening
1824    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1825    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1826    if it was not properly terminated, or CPP_LESS for an unterminated
1827    header name which must be relexed as normal tokens.
1828 
1829    The spelling is NUL-terminated, but it is not guaranteed that this
1830    is the first NUL since embedded NULs are preserved.  */
1831 static void
lex_string(cpp_reader * pfile,cpp_token * token,const uchar * base)1832 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1833 {
1834   bool saw_NUL = false;
1835   const uchar *cur;
1836   cppchar_t terminator;
1837   enum cpp_ttype type;
1838 
1839   cur = base;
1840   terminator = *cur++;
1841   if (terminator == 'L' || terminator == 'U')
1842     terminator = *cur++;
1843   else if (terminator == 'u')
1844     {
1845       terminator = *cur++;
1846       if (terminator == '8')
1847 	terminator = *cur++;
1848     }
1849   if (terminator == 'R')
1850     {
1851       lex_raw_string (pfile, token, base, cur);
1852       return;
1853     }
1854   if (terminator == '"')
1855     type = (*base == 'L' ? CPP_WSTRING :
1856 	    *base == 'U' ? CPP_STRING32 :
1857 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1858 			 : CPP_STRING);
1859   else if (terminator == '\'')
1860     type = (*base == 'L' ? CPP_WCHAR :
1861 	    *base == 'U' ? CPP_CHAR32 :
1862 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
1863 			 : CPP_CHAR);
1864   else
1865     terminator = '>', type = CPP_HEADER_NAME;
1866 
1867   for (;;)
1868     {
1869       cppchar_t c = *cur++;
1870 
1871       /* In #include-style directives, terminators are not escapable.  */
1872       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1873 	cur++;
1874       else if (c == terminator)
1875 	break;
1876       else if (c == '\n')
1877 	{
1878 	  cur--;
1879 	  /* Unmatched quotes always yield undefined behavior, but
1880 	     greedy lexing means that what appears to be an unterminated
1881 	     header name may actually be a legitimate sequence of tokens.  */
1882 	  if (terminator == '>')
1883 	    {
1884 	      token->type = CPP_LESS;
1885 	      return;
1886 	    }
1887 	  type = CPP_OTHER;
1888 	  break;
1889 	}
1890       else if (c == '\0')
1891 	saw_NUL = true;
1892     }
1893 
1894   if (saw_NUL && !pfile->state.skipping)
1895     cpp_error (pfile, CPP_DL_WARNING,
1896 	       "null character(s) preserved in literal");
1897 
1898   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1899     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1900 	       (int) terminator);
1901 
1902   if (CPP_OPTION (pfile, user_literals))
1903     {
1904       /* If a string format macro, say from inttypes.h, is placed touching
1905 	 a string literal it could be parsed as a C++11 user-defined string
1906 	 literal thus breaking the program.
1907 	 Try to identify macros with is_macro. A warning is issued. */
1908       if (is_macro (pfile, cur))
1909 	{
1910 	  /* Raise a warning, but do not consume subsequent tokens.  */
1911 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1912 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1913 				   token->src_loc, 0,
1914 				   "invalid suffix on literal; C++11 requires "
1915 				   "a space between literal and string macro");
1916 	}
1917       /* Grab user defined literal suffix.  */
1918       else if (ISIDST (*cur))
1919 	{
1920 	  type = cpp_userdef_char_add_type (type);
1921 	  type = cpp_userdef_string_add_type (type);
1922           ++cur;
1923 
1924 	  while (ISIDNUM (*cur))
1925 	    ++cur;
1926 	}
1927     }
1928   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
1929 	   && is_macro (pfile, cur)
1930 	   && !pfile->state.skipping)
1931     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
1932 			   token->src_loc, 0, "C++11 requires a space "
1933 			   "between string literal and macro");
1934 
1935   pfile->buffer->cur = cur;
1936   create_literal (pfile, token, base, cur - base, type);
1937 }
1938 
1939 /* Return the comment table. The client may not make any assumption
1940    about the ordering of the table.  */
1941 cpp_comment_table *
cpp_get_comments(cpp_reader * pfile)1942 cpp_get_comments (cpp_reader *pfile)
1943 {
1944   return &pfile->comments;
1945 }
1946 
1947 /* Append a comment to the end of the comment table. */
1948 static void
store_comment(cpp_reader * pfile,cpp_token * token)1949 store_comment (cpp_reader *pfile, cpp_token *token)
1950 {
1951   int len;
1952 
1953   if (pfile->comments.allocated == 0)
1954     {
1955       pfile->comments.allocated = 256;
1956       pfile->comments.entries = (cpp_comment *) xmalloc
1957 	(pfile->comments.allocated * sizeof (cpp_comment));
1958     }
1959 
1960   if (pfile->comments.count == pfile->comments.allocated)
1961     {
1962       pfile->comments.allocated *= 2;
1963       pfile->comments.entries = (cpp_comment *) xrealloc
1964 	(pfile->comments.entries,
1965 	 pfile->comments.allocated * sizeof (cpp_comment));
1966     }
1967 
1968   len = token->val.str.len;
1969 
1970   /* Copy comment. Note, token may not be NULL terminated. */
1971   pfile->comments.entries[pfile->comments.count].comment =
1972     (char *) xmalloc (sizeof (char) * (len + 1));
1973   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1974 	  token->val.str.text, len);
1975   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1976 
1977   /* Set source location. */
1978   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1979 
1980   /* Increment the count of entries in the comment table. */
1981   pfile->comments.count++;
1982 }
1983 
1984 /* The stored comment includes the comment start and any terminator.  */
1985 static void
save_comment(cpp_reader * pfile,cpp_token * token,const unsigned char * from,cppchar_t type)1986 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1987 	      cppchar_t type)
1988 {
1989   unsigned char *buffer;
1990   unsigned int len, clen, i;
1991 
1992   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1993 
1994   /* C++ comments probably (not definitely) have moved past a new
1995      line, which we don't want to save in the comment.  */
1996   if (is_vspace (pfile->buffer->cur[-1]))
1997     len--;
1998 
1999   /* If we are currently in a directive or in argument parsing, then
2000      we need to store all C++ comments as C comments internally, and
2001      so we need to allocate a little extra space in that case.
2002 
2003      Note that the only time we encounter a directive here is
2004      when we are saving comments in a "#define".  */
2005   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2006 	  && type == '/') ? len + 2 : len;
2007 
2008   buffer = _cpp_unaligned_alloc (pfile, clen);
2009 
2010   token->type = CPP_COMMENT;
2011   token->val.str.len = clen;
2012   token->val.str.text = buffer;
2013 
2014   buffer[0] = '/';
2015   memcpy (buffer + 1, from, len - 1);
2016 
2017   /* Finish conversion to a C comment, if necessary.  */
2018   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2019     {
2020       buffer[1] = '*';
2021       buffer[clen - 2] = '*';
2022       buffer[clen - 1] = '/';
2023       /* As there can be in a C++ comments illegal sequences for C comments
2024          we need to filter them out.  */
2025       for (i = 2; i < (clen - 2); i++)
2026         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2027           buffer[i] = '|';
2028     }
2029 
2030   /* Finally store this comment for use by clients of libcpp. */
2031   store_comment (pfile, token);
2032 }
2033 
2034 /* Allocate COUNT tokens for RUN.  */
2035 void
_cpp_init_tokenrun(tokenrun * run,unsigned int count)2036 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2037 {
2038   run->base = XNEWVEC (cpp_token, count);
2039   run->limit = run->base + count;
2040   run->next = NULL;
2041 }
2042 
2043 /* Returns the next tokenrun, or creates one if there is none.  */
2044 static tokenrun *
next_tokenrun(tokenrun * run)2045 next_tokenrun (tokenrun *run)
2046 {
2047   if (run->next == NULL)
2048     {
2049       run->next = XNEW (tokenrun);
2050       run->next->prev = run;
2051       _cpp_init_tokenrun (run->next, 250);
2052     }
2053 
2054   return run->next;
2055 }
2056 
2057 /* Return the number of not yet processed token in a given
2058    context.  */
2059 int
_cpp_remaining_tokens_num_in_context(cpp_context * context)2060 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2061 {
2062   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2063     return (LAST (context).token - FIRST (context).token);
2064   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2065 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2066     return (LAST (context).ptoken - FIRST (context).ptoken);
2067   else
2068       abort ();
2069 }
2070 
2071 /* Returns the token present at index INDEX in a given context.  If
2072    INDEX is zero, the next token to be processed is returned.  */
2073 static const cpp_token*
_cpp_token_from_context_at(cpp_context * context,int index)2074 _cpp_token_from_context_at (cpp_context *context, int index)
2075 {
2076   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2077     return &(FIRST (context).token[index]);
2078   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2079 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2080     return FIRST (context).ptoken[index];
2081  else
2082    abort ();
2083 }
2084 
2085 /* Look ahead in the input stream.  */
2086 const cpp_token *
cpp_peek_token(cpp_reader * pfile,int index)2087 cpp_peek_token (cpp_reader *pfile, int index)
2088 {
2089   cpp_context *context = pfile->context;
2090   const cpp_token *peektok;
2091   int count;
2092 
2093   /* First, scan through any pending cpp_context objects.  */
2094   while (context->prev)
2095     {
2096       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2097 
2098       if (index < (int) sz)
2099         return _cpp_token_from_context_at (context, index);
2100       index -= (int) sz;
2101       context = context->prev;
2102     }
2103 
2104   /* We will have to read some new tokens after all (and do so
2105      without invalidating preceding tokens).  */
2106   count = index;
2107   pfile->keep_tokens++;
2108 
2109   /* For peeked tokens temporarily disable line_change reporting,
2110      until the tokens are parsed for real.  */
2111   void (*line_change) (cpp_reader *, const cpp_token *, int)
2112     = pfile->cb.line_change;
2113   pfile->cb.line_change = NULL;
2114 
2115   do
2116     {
2117       peektok = _cpp_lex_token (pfile);
2118       if (peektok->type == CPP_EOF)
2119 	{
2120 	  index--;
2121 	  break;
2122 	}
2123     }
2124   while (index--);
2125 
2126   _cpp_backup_tokens_direct (pfile, count - index);
2127   pfile->keep_tokens--;
2128   pfile->cb.line_change = line_change;
2129 
2130   return peektok;
2131 }
2132 
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The token is taken from the slot at pfile->cur_token, which is then
   advanced.  Any lookahead tokens stored from that slot onwards are
   first moved up by one position, spilling into the next tokenrun
   where necessary.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* SZ is the number of slots from cur_token to the end of the
     current run; LA is the number of pending lookahead tokens.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The token just before cur_token supplies the source location.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      /* The shifted lookaheads no longer fit in the current run.  */
      if (sz <= la)
        {
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* LA - SZ lookaheads already sit at the start of the next
             run; move them up by one to open a slot at its base.  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* The last token of the current run moves to the start of
             the next run.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Shift the lookaheads remaining in the current run up by one,
         never writing beyond the end of the run.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* No slot left in the current run: continue in the next one.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
2174 
2175 /* Lex a token into RESULT (external interface).  Takes care of issues
2176    like directive handling, token lookahead, multiple include
2177    optimization and skipping.  */
2178 const cpp_token *
_cpp_lex_token(cpp_reader * pfile)2179 _cpp_lex_token (cpp_reader *pfile)
2180 {
2181   cpp_token *result;
2182 
2183   for (;;)
2184     {
2185       if (pfile->cur_token == pfile->cur_run->limit)
2186 	{
2187 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
2188 	  pfile->cur_token = pfile->cur_run->base;
2189 	}
2190       /* We assume that the current token is somewhere in the current
2191 	 run.  */
2192       if (pfile->cur_token < pfile->cur_run->base
2193 	  || pfile->cur_token >= pfile->cur_run->limit)
2194 	abort ();
2195 
2196       if (pfile->lookaheads)
2197 	{
2198 	  pfile->lookaheads--;
2199 	  result = pfile->cur_token++;
2200 	}
2201       else
2202 	result = _cpp_lex_direct (pfile);
2203 
2204       if (result->flags & BOL)
2205 	{
2206 	  /* Is this a directive.  If _cpp_handle_directive returns
2207 	     false, it is an assembler #.  */
2208 	  if (result->type == CPP_HASH
2209 	      /* 6.10.3 p 11: Directives in a list of macro arguments
2210 		 gives undefined behavior.  This implementation
2211 		 handles the directive as normal.  */
2212 	      && pfile->state.parsing_args != 1)
2213 	    {
2214 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2215 		{
2216 		  if (pfile->directive_result.type == CPP_PADDING)
2217 		    continue;
2218 		  result = &pfile->directive_result;
2219 		}
2220 	    }
2221 	  else if (pfile->state.in_deferred_pragma)
2222 	    result = &pfile->directive_result;
2223 
2224 	  if (pfile->cb.line_change && !pfile->state.skipping)
2225 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2226 	}
2227 
2228       /* We don't skip tokens in directives.  */
2229       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2230 	break;
2231 
2232       /* Outside a directive, invalidate controlling macros.  At file
2233 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2234 	 get here and MI optimization works.  */
2235       pfile->mi_valid = false;
2236 
2237       if (!pfile->state.skipping || result->type == CPP_EOF)
2238 	break;
2239     }
2240 
2241   return result;
2242 }
2243 
2244 /* Returns true if a fresh line has been loaded.  */
2245 bool
_cpp_get_fresh_line(cpp_reader * pfile)2246 _cpp_get_fresh_line (cpp_reader *pfile)
2247 {
2248   int return_at_eof;
2249 
2250   /* We can't get a new line until we leave the current directive.  */
2251   if (pfile->state.in_directive)
2252     return false;
2253 
2254   for (;;)
2255     {
2256       cpp_buffer *buffer = pfile->buffer;
2257 
2258       if (!buffer->need_line)
2259 	return true;
2260 
2261       if (buffer->next_line < buffer->rlimit)
2262 	{
2263 	  _cpp_clean_line (pfile);
2264 	  return true;
2265 	}
2266 
2267       /* First, get out of parsing arguments state.  */
2268       if (pfile->state.parsing_args)
2269 	return false;
2270 
2271       /* End of buffer.  Non-empty files should end in a newline.  */
2272       if (buffer->buf != buffer->rlimit
2273 	  && buffer->next_line > buffer->rlimit
2274 	  && !buffer->from_stage3)
2275 	{
2276 	  /* Clip to buffer size.  */
2277 	  buffer->next_line = buffer->rlimit;
2278 	}
2279 
2280       return_at_eof = buffer->return_at_eof;
2281       _cpp_pop_buffer (pfile);
2282       if (pfile->buffer == NULL || return_at_eof)
2283 	return false;
2284     }
2285 }
2286 
/* If the next character of BUFFER is CHAR, consume it and give RESULT
   the type THEN_TYPE; otherwise give RESULT the type ELSE_TYPE.
   Expects variables named `buffer' and `result' to be in scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      if (*buffer->cur == (CHAR))			\
	{						\
	  buffer->cur++;				\
	  result->type = (THEN_TYPE);			\
	}						\
      else						\
	result->type = (ELSE_TYPE);			\
    }							\
  while (0)
2295 
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot.  RESULT may be redirected below to the
     first slot of the base run when a fresh line is started.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered on the first pass and again after every newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The line of a deferred pragma has ended: emit CPP_PRAGMA_EOL
	 and leave the deferred-pragma state.  */
      if (pfile->state.in_deferred_pragma)
	{
	  result->type = CPP_PRAGMA_EOL;
	  pfile->state.in_deferred_pragma = false;
	  if (!pfile->state.pragma_allow_expansion)
	    pfile->state.prevent_expansion--;
	  return result;
	}
      /* No further line can be supplied: report CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
	{
	  result->type = CPP_EOF;
	  if (!pfile->state.in_directive)
	    {
	      /* Tell the compiler the line number of the EOF token.  */
	      result->src_loc = pfile->line_table->highest_line;
	      result->flags = BOL;
	    }
	  return result;
	}
      /* Unless earlier tokens must be kept, restart at the first slot
	 of the base run.  */
      if (!pfile->keep_tokens)
	{
	  pfile->cur_run = &pfile->base_run;
	  result = pfile->base_run.base;
	  pfile->cur_token = result + 1;
	}
      /* The token about to be lexed is the first one on its line.  */
      result->flags = BOL;
      /* NOTE(review): parsing_args == 2 presumably means that macro
	 arguments are being collected; confirm against the definition
	 of the state.  The line break then counts as whitespace.  */
      if (pfile->state.parsing_args == 2)
	result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have changed the current buffer.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after whitespace has been skipped.  */
 skipped_white:
  /* Process pending line notes once the read position has reached the
     next one, unless a buffer is overlaid.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  /* Record where the token starts: a forced location if one is set,
     otherwise a location derived from the current buffer column.  */
  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
					  CPP_BUF_COLUMN (buffer, buffer->cur));

  /* Dispatch on the first character of the token.  */
  switch (c)
    {
      /* Horizontal whitespace (and NUL): note it on the token, skip
	 it and look at the next character.  */
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

      /* End of line: a new line is needed before lexing can go on.  */
    case '\n':
      if (buffer->cur < buffer->rlimit)
	CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

      /* A digit starts a preprocessing number.  */
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->type = CPP_NUMBER;
	lex_number (pfile, &result->val.str, &nst);
	warn_about_normalization (pfile, result, &nst);
	break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
	 wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
	{
	  /* Accepted continuations: a quote directly after the prefix
	     (no character constant after 'R'), 'R' plus '"' for a
	     prefixed raw string, or '8' after 'u' followed by one of
	     those forms.  */
	  if ((*buffer->cur == '\'' && c != 'R')
	      || *buffer->cur == '"'
	      || (*buffer->cur == 'R'
		  && c != 'R'
		  && buffer->cur[1] == '"'
		  && CPP_OPTION (pfile, rliterals))
	      || (*buffer->cur == '8'
		  && c == 'u'
		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
				&& CPP_OPTION (pfile, utf8_char_literals)))
		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
			  && CPP_OPTION (pfile, rliterals)))))
	    {
	      lex_string (pfile, result, buffer->cur - 1);
	      break;
	    }
	}
      /* Fall through.  */

      /* Every other letter, and '_', starts an identifier.  'L', 'u',
	 'U' and 'R' are absent from this list because they have their
	 own labels above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
						&nst,
						&result->val.node.spelling);
	warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
	{
	  result->flags |= NAMED_OP;
	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
	}
      break;

      /* An unprefixed character constant or string literal.  */
    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
	{
	  if (_cpp_skip_block_comment (pfile))
	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
	}
      else if (c == '/' && ! CPP_OPTION (pfile, traditional))
	{
	  /* Don't warn for system headers.  */
	  if (cpp_in_system_header (pfile))
	    ;
	  /* Warn about comments if pedantically GNUC89, and not
	     in system headers.  */
	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
		   && CPP_PEDANTIC (pfile)
		   && ! buffer->warned_cplusplus_comments)
	    {
	      cpp_error (pfile, CPP_DL_PEDWARN,
			 "C++ style comments are not allowed in ISO C90");
	      cpp_error (pfile, CPP_DL_PEDWARN,
			 "(this will be reported only once per input file)");
	      buffer->warned_cplusplus_comments = 1;
	    }
	  /* Or if specifically desired via -Wc90-c99-compat.  */
	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
		   && ! CPP_OPTION (pfile, cplusplus)
		   && ! buffer->warned_cplusplus_comments)
	    {
	      cpp_error (pfile, CPP_DL_WARNING,
			 "C++ style comments are incompatible with C90");
	      cpp_error (pfile, CPP_DL_WARNING,
			 "(this will be reported only once per input file)");
	      buffer->warned_cplusplus_comments = 1;
	    }
	  /* In C89/C94, C++ style comments are forbidden.  */
	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
	    {
	      /* But don't be confused about valid code such as
	         - // immediately followed by *,
		 - // in a preprocessing directive,
		 - // in an #if 0 block.  */
	      if (buffer->cur[1] == '*'
		  || pfile->state.in_directive
		  || pfile->state.skipping)
		{
		  result->type = CPP_DIV;
		  break;
		}
	      else if (! buffer->warned_cplusplus_comments)
		{
		  cpp_error (pfile, CPP_DL_ERROR,
			     "C++ style comments are not allowed in ISO C90");
		  cpp_error (pfile, CPP_DL_ERROR,
			     "(this will be reported only once per input "
			     "file)");
		  buffer->warned_cplusplus_comments = 1;
		}
	    }
	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
	}
      else if (c == '=')
	{
	  buffer->cur++;
	  result->type = CPP_DIV_EQ;
	  break;
	}
      else
	{
	  result->type = CPP_DIV;
	  break;
	}

      /* A comment has been skipped.  Unless comments are being saved,
	 it just counts as whitespace before the next token.  */
      if (!pfile->state.save_comments)
	{
	  result->flags |= PREV_WHITE;
	  goto update_tokens_line;
	}

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* Where header names are expected, try to lex one first.  A
	 CPP_LESS result means the attempt failed and the text is
	 lexed as normal tokens below.  */
      if (pfile->state.angled_headers)
	{
	  lex_string (pfile, result, buffer->cur - 1);
	  if (result->type != CPP_LESS)
	    break;
	}

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
	}
      else if (CPP_OPTION (pfile, digraphs))
	{
	  if (*buffer->cur == ':')
	    {
	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
		 three characters are <:: and the subsequent character
		 is neither : nor >, the < is treated as a preprocessor
		 token by itself".  */
	      if (CPP_OPTION (pfile, cplusplus)
		  && CPP_OPTION (pfile, lang) != CLK_CXX98
		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
		  && buffer->cur[1] == ':'
		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
		break;

	      /* The digraph "<:" stands for '['.  */
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_SQUARE;
	    }
	  else if (*buffer->cur == '%')
	    {
	      /* The digraph "<%" stands for '{'.  */
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_BRACE;
	    }
	}
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
	}
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
	{
	  if (*buffer->cur == ':')
	    {
	      /* The digraph "%:" stands for '#', and "%:%:" for the
		 paste operator.  */
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_HASH;
	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
	    }
	  else if (*buffer->cur == '>')
	    {
	      /* The digraph "%>" stands for '}'.  */
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_CLOSE_BRACE;
	    }
	}
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a preprocessing number.  */
      if (ISDIGIT (*buffer->cur))
	{
	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	  result->type = CPP_NUMBER;
	  lex_number (pfile, &result->val.str, &nst);
	  warn_about_normalization (pfile, result, &nst);
	}
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
	buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
	buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
	{
	  buffer->cur++;
	  result->type = CPP_DEREF;
	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	    buffer->cur++, result->type = CPP_DEREF_STAR;
	}
      else if (*buffer->cur == '-')
	buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
	buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
	buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
	{
	  /* The digraph ":>" stands for ']'.  */
	  buffer->cur++;
	  result->flags |= DIGRAPH;
	  result->type = CPP_CLOSE_SQUARE;
	}
      break;

      /* Operators that are either a single character or that
	 character followed by one specific second character.  */
    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

      /* Single-character punctuators.  */
    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

      /* '$' and '\\' start an identifier only if forms_identifier_p
	 says so; otherwise they are treated like any other stray
	 character.  */
    case '$':
    case '\\':
      {
	/* Step back so that the character is examined again.  */
	const uchar *base = --buffer->cur;
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;

	if (forms_identifier_p (pfile, true, &nst))
	  {
	    result->type = CPP_NAME;
	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
						    &result->val.node.spelling);
	    warn_about_normalization (pfile, result, &nst);
	    break;
	  }
	/* Not an identifier: consume the character again.  */
	buffer->cur++;
      }
      /* Fall through.  */

      /* Any other character becomes a one-character CPP_OTHER token.  */
    default:
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  /* Attach a source range to the token.  It starts at the location
     recorded above and finishes at the location computed from the
     current read position, except for reserved locations, where it
     finishes where it starts.  */
  source_range tok_range;
  tok_range.m_start = result->src_loc;
  if (result->src_loc >= RESERVED_LOCATION_COUNT)
    tok_range.m_finish
      = linemap_position_for_column (pfile->line_table,
				     CPP_BUF_COLUMN (buffer, buffer->cur));
  else
    tok_range.m_finish = tok_range.m_start;

  result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
					   result->src_loc,
					   tok_range, NULL);

  return result;
}
2741 
2742 /* An upper bound on the number of bytes needed to spell TOKEN.
2743    Does not include preceding whitespace.  */
2744 unsigned int
cpp_token_len(const cpp_token * token)2745 cpp_token_len (const cpp_token *token)
2746 {
2747   unsigned int len;
2748 
2749   switch (TOKEN_SPELL (token))
2750     {
2751     default:		len = 6;				break;
2752     case SPELL_LITERAL:	len = token->val.str.len;		break;
2753     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
2754     }
2755 
2756   return len;
2757 }
2758 
/* Decode the UTF-8 sequence starting at NAME and write the matching
   \U escape (a backslash, 'U' and eight lowercase hexadecimal digits,
   ten bytes in all, without a terminating NUL) to BUFFER.  Returns
   the number of bytes of NAME that made up the sequence.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code;
  int seq_len = 0;
  int k;
  int shift;

  /* The number of leading one bits in the first byte gives the
     length of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload of the first byte is what follows the length bits.  */
  code = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes its low six bits.  */
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      code = (code << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & 0xC0) != 0x80)
	abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code >> shift) & 0xF];

  return seq_len;
}
2792 
2793 /* Given a token TYPE corresponding to a digraph, return a pointer to
2794    the spelling of the digraph.  */
2795 static const unsigned char *
cpp_digraph2name(enum cpp_ttype type)2796 cpp_digraph2name (enum cpp_ttype type)
2797 {
2798   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2799 }
2800 
2801 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
2802    The buffer must already contain the enough space to hold the
2803    token's spelling.  Returns a pointer to the character after the
2804    last character written.  */
2805 unsigned char *
_cpp_spell_ident_ucns(unsigned char * buffer,cpp_hashnode * ident)2806 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
2807 {
2808   size_t i;
2809   const unsigned char *name = NODE_NAME (ident);
2810 
2811   for (i = 0; i < NODE_LEN (ident); i++)
2812     if (name[i] & ~0x7F)
2813       {
2814 	i += utf8_to_ucn (buffer, name + i) - 1;
2815 	buffer += 10;
2816       }
2817     else
2818       *buffer++ = name[i];
2819 
2820   return buffer;
2821 }
2822 
2823 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2824    already contain the enough space to hold the token's spelling.
2825    Returns a pointer to the character after the last character written.
2826    FORSTRING is true if this is to be the spelling after translation
2827    phase 1 (with the original spelling of extended identifiers), false
2828    if extended identifiers should always be written using UCNs (there is
2829    no option for always writing them in the internal UTF-8 form).
2830    FIXME: Would be nice if we didn't need the PFILE argument.  */
2831 unsigned char *
cpp_spell_token(cpp_reader * pfile,const cpp_token * token,unsigned char * buffer,bool forstring)2832 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2833 		 unsigned char *buffer, bool forstring)
2834 {
2835   switch (TOKEN_SPELL (token))
2836     {
2837     case SPELL_OPERATOR:
2838       {
2839 	const unsigned char *spelling;
2840 	unsigned char c;
2841 
2842 	if (token->flags & DIGRAPH)
2843 	  spelling = cpp_digraph2name (token->type);
2844 	else if (token->flags & NAMED_OP)
2845 	  goto spell_ident;
2846 	else
2847 	  spelling = TOKEN_NAME (token);
2848 
2849 	while ((c = *spelling++) != '\0')
2850 	  *buffer++ = c;
2851       }
2852       break;
2853 
2854     spell_ident:
2855     case SPELL_IDENT:
2856       if (forstring)
2857 	{
2858 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
2859 		  NODE_LEN (token->val.node.spelling));
2860 	  buffer += NODE_LEN (token->val.node.spelling);
2861 	}
2862       else
2863 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
2864       break;
2865 
2866     case SPELL_LITERAL:
2867       memcpy (buffer, token->val.str.text, token->val.str.len);
2868       buffer += token->val.str.len;
2869       break;
2870 
2871     case SPELL_NONE:
2872       cpp_error (pfile, CPP_DL_ICE,
2873 		 "unspellable token %s", TOKEN_NAME (token));
2874       break;
2875     }
2876 
2877   return buffer;
2878 }
2879 
2880 /* Returns TOKEN spelt as a null-terminated string.  The string is
2881    freed when the reader is destroyed.  Useful for diagnostics.  */
2882 unsigned char *
cpp_token_as_text(cpp_reader * pfile,const cpp_token * token)2883 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2884 {
2885   unsigned int len = cpp_token_len (token) + 1;
2886   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2887 
2888   end = cpp_spell_token (pfile, token, start, false);
2889   end[0] = '\0';
2890 
2891   return start;
2892 }
2893 
2894 /* Returns a pointer to a string which spells the token defined by
2895    TYPE and FLAGS.  Used by C front ends, which really should move to
2896    using cpp_token_as_text.  */
2897 const char *
cpp_type2name(enum cpp_ttype type,unsigned char flags)2898 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2899 {
2900   if (flags & DIGRAPH)
2901     return (const char *) cpp_digraph2name (type);
2902   else if (flags & NAMED_OP)
2903     return cpp_named_operator2name (type);
2904 
2905   return (const char *) token_spellings[type].name;
2906 }
2907 
2908 /* Writes the spelling of token to FP, without any preceding space.
2909    Separated from cpp_spell_token for efficiency - to avoid stdio
2910    double-buffering.  */
2911 void
cpp_output_token(const cpp_token * token,FILE * fp)2912 cpp_output_token (const cpp_token *token, FILE *fp)
2913 {
2914   switch (TOKEN_SPELL (token))
2915     {
2916     case SPELL_OPERATOR:
2917       {
2918 	const unsigned char *spelling;
2919 	int c;
2920 
2921 	if (token->flags & DIGRAPH)
2922 	  spelling = cpp_digraph2name (token->type);
2923 	else if (token->flags & NAMED_OP)
2924 	  goto spell_ident;
2925 	else
2926 	  spelling = TOKEN_NAME (token);
2927 
2928 	c = *spelling;
2929 	do
2930 	  putc (c, fp);
2931 	while ((c = *++spelling) != '\0');
2932       }
2933       break;
2934 
2935     spell_ident:
2936     case SPELL_IDENT:
2937       {
2938 	size_t i;
2939 	const unsigned char * name = NODE_NAME (token->val.node.node);
2940 
2941 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2942 	  if (name[i] & ~0x7F)
2943 	    {
2944 	      unsigned char buffer[10];
2945 	      i += utf8_to_ucn (buffer, name + i) - 1;
2946 	      fwrite (buffer, 1, 10, fp);
2947 	    }
2948 	  else
2949 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
2950       }
2951       break;
2952 
2953     case SPELL_LITERAL:
2954       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2955       break;
2956 
2957     case SPELL_NONE:
2958       /* An error, most probably.  */
2959       break;
2960     }
2961 }
2962 
2963 /* Compare two tokens.  */
2964 int
_cpp_equiv_tokens(const cpp_token * a,const cpp_token * b)2965 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2966 {
2967   if (a->type == b->type && a->flags == b->flags)
2968     switch (TOKEN_SPELL (a))
2969       {
2970       default:			/* Keep compiler happy.  */
2971       case SPELL_OPERATOR:
2972 	/* token_no is used to track where multiple consecutive ##
2973 	   tokens were originally located.  */
2974 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2975       case SPELL_NONE:
2976 	return (a->type != CPP_MACRO_ARG
2977 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
2978 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
2979       case SPELL_IDENT:
2980 	return (a->val.node.node == b->val.node.node
2981 		&& a->val.node.spelling == b->val.node.spelling);
2982       case SPELL_LITERAL:
2983 	return (a->val.str.len == b->val.str.len
2984 		&& !memcmp (a->val.str.text, b->val.str.text,
2985 			    a->val.str.len));
2986       }
2987 
2988   return 0;
2989 }
2990 
2991 /* Returns nonzero if a space should be inserted to avoid an
2992    accidental token paste for output.  For simplicity, it is
2993    conservative, and occasionally advises a space where one is not
2994    needed, e.g. "." and ".2".  */
2995 int
cpp_avoid_paste(cpp_reader * pfile,const cpp_token * token1,const cpp_token * token2)2996 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2997 		 const cpp_token *token2)
2998 {
2999   enum cpp_ttype a = token1->type, b = token2->type;
3000   cppchar_t c;
3001 
3002   if (token1->flags & NAMED_OP)
3003     a = CPP_NAME;
3004   if (token2->flags & NAMED_OP)
3005     b = CPP_NAME;
3006 
3007   c = EOF;
3008   if (token2->flags & DIGRAPH)
3009     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3010   else if (token_spellings[b].category == SPELL_OPERATOR)
3011     c = token_spellings[b].name[0];
3012 
3013   /* Quickly get everything that can paste with an '='.  */
3014   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3015     return 1;
3016 
3017   switch (a)
3018     {
3019     case CPP_GREATER:	return c == '>';
3020     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
3021     case CPP_PLUS:	return c == '+';
3022     case CPP_MINUS:	return c == '-' || c == '>';
3023     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
3024     case CPP_MOD:	return c == ':' || c == '>';
3025     case CPP_AND:	return c == '&';
3026     case CPP_OR:	return c == '|';
3027     case CPP_COLON:	return c == ':' || c == '>';
3028     case CPP_DEREF:	return c == '*';
3029     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
3030     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
3031     case CPP_NAME:	return ((b == CPP_NUMBER
3032 				 && name_p (pfile, &token2->val.str))
3033 				|| b == CPP_NAME
3034 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
3035     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
3036 				|| c == '.' || c == '+' || c == '-');
3037 				      /* UCNs */
3038     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
3039 				 && b == CPP_NAME)
3040 				|| (CPP_OPTION (pfile, objc)
3041 				    && token1->val.str.text[0] == '@'
3042 				    && (b == CPP_NAME || b == CPP_STRING)));
3043     case CPP_STRING:
3044     case CPP_WSTRING:
3045     case CPP_UTF8STRING:
3046     case CPP_STRING16:
3047     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
3048 				&& (b == CPP_NAME
3049 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
3050 					&& ISIDST (token2->val.str.text[0]))));
3051 
3052     default:		break;
3053     }
3054 
3055   return 0;
3056 }
3057 
3058 /* Output all the remaining tokens on the current line, and a newline
3059    character, to FP.  Leading whitespace is removed.  If there are
3060    macros, special token padding is not performed.  */
3061 void
cpp_output_line(cpp_reader * pfile,FILE * fp)3062 cpp_output_line (cpp_reader *pfile, FILE *fp)
3063 {
3064   const cpp_token *token;
3065 
3066   token = cpp_get_token (pfile);
3067   while (token->type != CPP_EOF)
3068     {
3069       cpp_output_token (token, fp);
3070       token = cpp_get_token (pfile);
3071       if (token->flags & PREV_WHITE)
3072 	putc (' ', fp);
3073     }
3074 
3075   putc ('\n', fp);
3076 }
3077 
3078 /* Return a string representation of all the remaining tokens on the
3079    current line.  The result is allocated using xmalloc and must be
3080    freed by the caller.  */
3081 unsigned char *
cpp_output_line_to_string(cpp_reader * pfile,const unsigned char * dir_name)3082 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3083 {
3084   const cpp_token *token;
3085   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3086   unsigned int alloced = 120 + out;
3087   unsigned char *result = (unsigned char *) xmalloc (alloced);
3088 
3089   /* If DIR_NAME is empty, there are no initial contents.  */
3090   if (dir_name)
3091     {
3092       sprintf ((char *) result, "#%s ", dir_name);
3093       out += 2;
3094     }
3095 
3096   token = cpp_get_token (pfile);
3097   while (token->type != CPP_EOF)
3098     {
3099       unsigned char *last;
3100       /* Include room for a possible space and the terminating nul.  */
3101       unsigned int len = cpp_token_len (token) + 2;
3102 
3103       if (out + len > alloced)
3104 	{
3105 	  alloced *= 2;
3106 	  if (out + len > alloced)
3107 	    alloced = out + len;
3108 	  result = (unsigned char *) xrealloc (result, alloced);
3109 	}
3110 
3111       last = cpp_spell_token (pfile, token, &result[out], 0);
3112       out = last - result;
3113 
3114       token = cpp_get_token (pfile);
3115       if (token->flags & PREV_WHITE)
3116 	result[out++] = ' ';
3117     }
3118 
3119   result[out] = '\0';
3120   return result;
3121 }
3122 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).  */

/* Smallest usable size, in bytes, of a newly created buffer.  */
#define MIN_BUFF_SIZE 8000

/* Largest buffer size considered acceptable for a request of
   MIN_SIZE bytes; anything bigger is left on the free list rather
   than wasted on a small request.  */
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)

/* Size to request when BUFF has to be extended by at least MIN_EXTRA
   bytes: MIN_EXTRA plus twice the space between BUFF's cur and limit
   pointers.  Both arguments are parenthesized so that any expression
   can be passed safely.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3137 
3138 /* Create a new allocation buffer.  Place the control block at the end
3139    of the buffer, so that buffer overflows will cause immediate chaos.  */
3140 static _cpp_buff *
new_buff(size_t len)3141 new_buff (size_t len)
3142 {
3143   _cpp_buff *result;
3144   unsigned char *base;
3145 
3146   if (len < MIN_BUFF_SIZE)
3147     len = MIN_BUFF_SIZE;
3148   len = CPP_ALIGN (len);
3149 
3150 #ifdef ENABLE_VALGRIND_CHECKING
3151   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3152      struct first.  */
3153   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3154   base = XNEWVEC (unsigned char, len + slen);
3155   result = (_cpp_buff *) base;
3156   base += slen;
3157 #else
3158   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3159   result = (_cpp_buff *) (base + len);
3160 #endif
3161   result->base = base;
3162   result->cur = base;
3163   result->limit = base + len;
3164   result->next = NULL;
3165   return result;
3166 }
3167 
3168 /* Place a chain of unwanted allocation buffers on the free list.  */
3169 void
_cpp_release_buff(cpp_reader * pfile,_cpp_buff * buff)3170 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3171 {
3172   _cpp_buff *end = buff;
3173 
3174   while (end->next)
3175     end = end->next;
3176   end->next = pfile->free_buffs;
3177   pfile->free_buffs = buff;
3178 }
3179 
3180 /* Return a free buffer of size at least MIN_SIZE.  */
3181 _cpp_buff *
_cpp_get_buff(cpp_reader * pfile,size_t min_size)3182 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3183 {
3184   _cpp_buff *result, **p;
3185 
3186   for (p = &pfile->free_buffs;; p = &(*p)->next)
3187     {
3188       size_t size;
3189 
3190       if (*p == NULL)
3191 	return new_buff (min_size);
3192       result = *p;
3193       size = result->limit - result->base;
3194       /* Return a buffer that's big enough, but don't waste one that's
3195          way too big.  */
3196       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3197 	break;
3198     }
3199 
3200   *p = result->next;
3201   result->next = NULL;
3202   result->cur = result->base;
3203   return result;
3204 }
3205 
3206 /* Creates a new buffer with enough space to hold the uncommitted
3207    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3208    the excess bytes to the new buffer.  Chains the new buffer after
3209    BUFF, and returns the new buffer.  */
3210 _cpp_buff *
_cpp_append_extend_buff(cpp_reader * pfile,_cpp_buff * buff,size_t min_extra)3211 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3212 {
3213   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3214   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3215 
3216   buff->next = new_buff;
3217   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3218   return new_buff;
3219 }
3220 
3221 /* Creates a new buffer with enough space to hold the uncommitted
3222    remaining bytes of the buffer pointed to by BUFF, and at least
3223    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3224    Chains the new buffer before the buffer pointed to by BUFF, and
3225    updates the pointer to point to the new buffer.  */
3226 void
_cpp_extend_buff(cpp_reader * pfile,_cpp_buff ** pbuff,size_t min_extra)3227 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3228 {
3229   _cpp_buff *new_buff, *old_buff = *pbuff;
3230   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3231 
3232   new_buff = _cpp_get_buff (pfile, size);
3233   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3234   new_buff->next = old_buff;
3235   *pbuff = new_buff;
3236 }
3237 
3238 /* Free a chain of buffers starting at BUFF.  */
3239 void
_cpp_free_buff(_cpp_buff * buff)3240 _cpp_free_buff (_cpp_buff *buff)
3241 {
3242   _cpp_buff *next;
3243 
3244   for (; buff; buff = next)
3245     {
3246       next = buff->next;
3247 #ifdef ENABLE_VALGRIND_CHECKING
3248       free (buff);
3249 #else
3250       free (buff->base);
3251 #endif
3252     }
3253 }
3254 
3255 /* Allocate permanent, unaligned storage of length LEN.  */
3256 unsigned char *
_cpp_unaligned_alloc(cpp_reader * pfile,size_t len)3257 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3258 {
3259   _cpp_buff *buff = pfile->u_buff;
3260   unsigned char *result = buff->cur;
3261 
3262   if (len > (size_t) (buff->limit - result))
3263     {
3264       buff = _cpp_get_buff (pfile, len);
3265       buff->next = pfile->u_buff;
3266       pfile->u_buff = buff;
3267       result = buff->cur;
3268     }
3269 
3270   buff->cur = result + len;
3271   return result;
3272 }
3273 
3274 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3275    That buffer is used for growing allocations when saving macro
3276    replacement lists in a #define, and when parsing an answer to an
3277    assertion in #assert, #unassert or #if (and therefore possibly
3278    whilst expanding macros).  It therefore must not be used by any
3279    code that they might call: specifically the lexer and the guts of
3280    the macro expander.
3281 
3282    All existing other uses clearly fit this restriction: storing
3283    registered pragmas during initialization.  */
3284 unsigned char *
_cpp_aligned_alloc(cpp_reader * pfile,size_t len)3285 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3286 {
3287   _cpp_buff *buff = pfile->a_buff;
3288   unsigned char *result = buff->cur;
3289 
3290   if (len > (size_t) (buff->limit - result))
3291     {
3292       buff = _cpp_get_buff (pfile, len);
3293       buff->next = pfile->a_buff;
3294       pfile->a_buff = buff;
3295       result = buff->cur;
3296     }
3297 
3298   buff->cur = result + len;
3299   return result;
3300 }
3301 
3302 /* Say which field of TOK is in use.  */
3303 
3304 enum cpp_token_fld_kind
cpp_token_val_index(const cpp_token * tok)3305 cpp_token_val_index (const cpp_token *tok)
3306 {
3307   switch (TOKEN_SPELL (tok))
3308     {
3309     case SPELL_IDENT:
3310       return CPP_TOKEN_FLD_NODE;
3311     case SPELL_LITERAL:
3312       return CPP_TOKEN_FLD_STR;
3313     case SPELL_OPERATOR:
3314       if (tok->type == CPP_PASTE)
3315 	return CPP_TOKEN_FLD_TOKEN_NO;
3316       else
3317 	return CPP_TOKEN_FLD_NONE;
3318     case SPELL_NONE:
3319       if (tok->type == CPP_MACRO_ARG)
3320 	return CPP_TOKEN_FLD_ARG_NO;
3321       else if (tok->type == CPP_PADDING)
3322 	return CPP_TOKEN_FLD_SOURCE;
3323       else if (tok->type == CPP_PRAGMA)
3324 	return CPP_TOKEN_FLD_PRAGMA;
3325       /* else fall through */
3326     default:
3327       return CPP_TOKEN_FLD_NONE;
3328     }
3329 }
3330 
3331 /* All tokens lexed in R after calling this function will be forced to have
3332    their source_location the same as the location referenced by P, until
3333    cpp_stop_forcing_token_locations is called for R.  */
3334 
3335 void
cpp_force_token_locations(cpp_reader * r,source_location * p)3336 cpp_force_token_locations (cpp_reader *r, source_location *p)
3337 {
3338   r->forced_token_location_p = p;
3339 }
3340 
3341 /* Go back to assigning locations naturally for lexed tokens.  */
3342 
3343 void
cpp_stop_forcing_token_locations(cpp_reader * r)3344 cpp_stop_forcing_token_locations (cpp_reader *r)
3345 {
3346   r->forced_token_location_p = NULL;
3347 }
3348