1 /* CPP Library - lexical analysis.
2    Copyright (C) 2000-2020 Free Software Foundation, Inc.
3    Contributed by Per Bothner, 1994-95.
4    Based on CCCP program by Paul Rubin, June 1986
5    Adapted to ANSI C, Richard Stallman, Jan 1987
6    Broken out to separate file, Zack Weinberg, Mar 2000
7 
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26 
/* Spelling category of a token type.  The part of each name after the
   SPELL_ prefix is what a TK entry of TTYPE_TABLE pastes in; every OP
   entry uses SPELL_OPERATOR.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT = 1,
  SPELL_LITERAL = 2,
  SPELL_NONE = 3
};

/* One row of the token_spellings table.  */
struct token_spelling
{
  /* Spelling category of the token type.  */
  enum spell_type category;

  /* For an OP entry the operator's spelling; for a TK entry the
     stringified enumerator name.  */
  const unsigned char *name;
};
40 
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43 
44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
45 #define TK(e, s) { SPELL_ ## s,    UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49 
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 			    unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
64 
65 static _cpp_buff *new_buff (size_t);
66 
67 
68 /* Utility routine:
69 
70    Compares, the token TOKEN to the NUL-terminated string STRING.
71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
72 int
cpp_ideq(const cpp_token * token,const char * string)73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75   if (token->type != CPP_NAME)
76     return 0;
77 
78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80 
81 /* Record a note TYPE at byte POS into the current cleaned logical
82    line.  */
83 static void
add_line_note(cpp_buffer * buffer,const uchar * pos,unsigned int type)84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86   if (buffer->notes_used == buffer->notes_cap)
87     {
88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90                                   buffer->notes_cap);
91     }
92 
93   buffer->notes[buffer->notes_used].pos = pos;
94   buffer->notes[buffer->notes_used].type = type;
95   buffer->notes_used++;
96 }
97 
98 
99 /* Fast path to find line special characters using optimized character
100    scanning algorithms.  Anything complicated falls back to the slow
101    path below.  Since this loop is very hot it's worth doing these kinds
102    of optimizations.
103 
104    One of the paths through the ifdefs should provide
105 
106      const uchar *search_line_fast (const uchar *s, const uchar *end);
107 
108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
109    the found character.
110 
111    Note that the last character of the buffer is *always* a newline,
112    as forced by _cpp_convert_input.  This fact can be used to avoid
113    explicitly looking for the end of the buffer.  */
114 
/* Configure only gives us an ifdef test.  Supply a zero value when the
   macro is absent so that it can be used in ordinary C conditions.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif
119 
/* WORD_TYPE should be the largest integer that fits into a register.
   Nothing in <stdint.h> provides that.  On most hosts unsigned long
   does the job, but MS decided on an LLP64 model, so when building with
   GCC ask for the "real" word size through the mode attribute.  */
#ifndef __GNUC__
typedef unsigned long word_type;
#else
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#endif
129 
130 /* The code below is only expecting sizes 4 or 8.
131    Die at compile-time if this expectation is violated.  */
132 typedef char check_word_type_size
133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
134 
135 /* Return X with the first N bytes forced to values that won't match one
136    of the interesting characters.  Note that NUL is not interesting.  */
137 
138 static inline word_type
acc_char_mask_misalign(word_type val,unsigned int n)139 acc_char_mask_misalign (word_type val, unsigned int n)
140 {
141   word_type mask = -1;
142   if (WORDS_BIGENDIAN)
143     mask >>= n * 8;
144   else
145     mask <<= n * 8;
146   return val & mask;
147 }
148 
149 /* Return X replicated to all byte positions within WORD_TYPE.  */
150 
151 static inline word_type
acc_char_replicate(uchar x)152 acc_char_replicate (uchar x)
153 {
154   word_type ret;
155 
156   ret = (x << 24) | (x << 16) | (x << 8) | x;
157   if (sizeof(word_type) == 8)
158     ret = (ret << 16 << 16) | ret;
159   return ret;
160 }
161 
162 /* Return non-zero if some byte of VAL is (probably) C.  */
163 
164 static inline word_type
acc_char_cmp(word_type val,word_type c)165 acc_char_cmp (word_type val, word_type c)
166 {
167 #if defined(__GNUC__) && defined(__alpha__)
168   /* We can get exact results using a compare-bytes instruction.
169      Get (val == c) via (0 >= (val ^ c)).  */
170   return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172   word_type magic = 0x7efefefeU;
173   if (sizeof(word_type) == 8)
174     magic = (magic << 16 << 16) | 0xfefefefeU;
175   magic |= 1;
176 
177   val ^= c;
178   return ((val + magic) ^ ~val) & ~magic;
179 #endif
180 }
181 
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183    the found character.  If this was a false positive, return -1.  */
184 
185 static inline int
acc_char_index(word_type cmp ATTRIBUTE_UNUSED,word_type val ATTRIBUTE_UNUSED)186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 		word_type val ATTRIBUTE_UNUSED)
188 {
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190   /* The cmpbge instruction sets *bits* of the result corresponding to
191      matches in the bytes with no false positives.  */
192   return __builtin_ctzl (cmp);
193 #else
194   unsigned int i;
195 
196   /* ??? It would be nice to force unrolling here,
197      and have all of these constants folded.  */
198   for (i = 0; i < sizeof(word_type); ++i)
199     {
200       uchar c;
201       if (WORDS_BIGENDIAN)
202 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203       else
204 	c = (val >> i * 8) & 0xff;
205 
206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 	return i;
208     }
209 
210   return -1;
211 #endif
212 }
213 
214 /* A version of the fast scanner using bit fiddling techniques.
215 
216    For 32-bit words, one would normally perform 16 comparisons and
217    16 branches.  With this algorithm one performs 24 arithmetic
218    operations and one branch.  Whether this is faster with a 32-bit
219    word size is going to be somewhat system dependent.
220 
221    For 64-bit words, we eliminate twice the number of comparisons
222    and branches without increasing the number of arithmetic operations.
223    It's almost certainly going to be a win with 64-bit word size.  */
224 
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226   ATTRIBUTE_UNUSED;
227 
228 static const uchar *
search_line_acc_char(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230 {
231   const word_type repl_nl = acc_char_replicate ('\n');
232   const word_type repl_cr = acc_char_replicate ('\r');
233   const word_type repl_bs = acc_char_replicate ('\\');
234   const word_type repl_qm = acc_char_replicate ('?');
235 
236   unsigned int misalign;
237   const word_type *p;
238   word_type val, t;
239 
240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242   val = *p;
243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244   if (misalign)
245     val = acc_char_mask_misalign (val, misalign);
246 
247   /* Main loop.  */
248   while (1)
249     {
250       t  = acc_char_cmp (val, repl_nl);
251       t |= acc_char_cmp (val, repl_cr);
252       t |= acc_char_cmp (val, repl_bs);
253       t |= acc_char_cmp (val, repl_qm);
254 
255       if (__builtin_expect (t != 0, 0))
256 	{
257 	  int i = acc_char_index (t, val);
258 	  if (i >= 0)
259 	    return (const uchar *)p + i;
260 	}
261 
262       val = *++p;
263     }
264 }
265 
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267    autoconfed:
268 
269    The Solaris 10+ assembler tags objects with the instruction set
270    extensions used, so SSE4.2 executables cannot run on machines that
271    don't support that extension.  */
272 
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274 
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 holds \n, row 1 \r, row 2 \\ and
   row 3 ?, each repeated sixteen times.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
289 
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
291 
292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
293    which was packaged into SSE1; it is also present in the AMD MMX
294    extension.  Mark the function as using "sse" so that we emit a real
295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
296 
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
search_line_mmx(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
302 {
303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
305 
306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
310 
311   unsigned int misalign, found, mask;
312   const v8qi *p;
313   v8qi data, t, c;
314 
315   /* Align the source pointer.  While MMX doesn't generate unaligned data
316      faults, this allows us to safely scan to the end of the buffer without
317      reading beyond the end of the last page.  */
318   misalign = (uintptr_t)s & 7;
319   p = (const v8qi *)((uintptr_t)s & -8);
320   data = *p;
321 
322   /* Create a mask for the bytes that are valid within the first
323      16-byte block.  The Idea here is that the AND with the mask
324      within the loop is "free", since we need some AND or TEST
325      insn in order to set the flags for the branch anyway.  */
326   mask = -1u << misalign;
327 
328   /* Main loop processing 8 bytes at a time.  */
329   goto start;
330   do
331     {
332       data = *++p;
333       mask = -1;
334 
335     start:
336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343       found = __builtin_ia32_pmovmskb (t);
344       found &= mask;
345     }
346   while (!found);
347 
348   __builtin_ia32_emms ();
349 
350   /* FOUND contains 1 in bits for which we matched a relevant
351      character.  Conversion to the byte index is trivial.  */
352   found = __builtin_ctz(found);
353   return (const uchar *)p + found;
354 }
355 
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
357 
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
search_line_sse2(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
363 {
364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
365 
366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
370 
371   unsigned int misalign, found, mask;
372   const v16qi *p;
373   v16qi data, t;
374 
375   /* Align the source pointer.  */
376   misalign = (uintptr_t)s & 15;
377   p = (const v16qi *)((uintptr_t)s & -16);
378   data = *p;
379 
380   /* Create a mask for the bytes that are valid within the first
381      16-byte block.  The Idea here is that the AND with the mask
382      within the loop is "free", since we need some AND or TEST
383      insn in order to set the flags for the branch anyway.  */
384   mask = -1u << misalign;
385 
386   /* Main loop processing 16 bytes at a time.  */
387   goto start;
388   do
389     {
390       data = *++p;
391       mask = -1;
392 
393     start:
394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
398       found = __builtin_ia32_pmovmskb128 (t);
399       found &= mask;
400     }
401   while (!found);
402 
403   /* FOUND contains 1 in bits for which we matched a relevant
404      character.  Conversion to the byte index is trivial.  */
405   found = __builtin_ctz(found);
406   return (const uchar *)p + found;
407 }
408 
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S is
   safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t)s;
  uintptr_t idx;

  /* Deal with an unaligned start before entering the main loop.  */
  if (addr & 15)
    {
      v16qi head;

      /* With less than 16 bytes left in the buffer and less than 16
	 bytes left on the page, reading 16 bytes here might generate a
	 spurious page fault.  Defer to the SSE2 implementation, which
	 already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
	return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      idx = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      if (__builtin_expect (idx < 16, 0))
	return s + idx;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;; s += 16)
    {
      char carry;

      /* By using inline assembly instead of the builtin,
	 we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
	     : "=c"(idx), "=@ccc"(carry)
	     : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (carry)
	break;
    }
#else
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri\t$0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(idx), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

  return s + idx;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
489 
490 /* Check the CPU capabilities.  */
491 
492 #include "../gcc/config/i386/cpuid.h"
493 
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
496 
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
init_vectorized_lexer(void)499 init_vectorized_lexer (void)
500 {
501   unsigned dummy, ecx = 0, edx = 0;
502   search_line_fast_type impl = search_line_acc_char;
503   int minimum = 0;
504 
505 #if defined(__SSE4_2__)
506   minimum = 3;
507 #elif defined(__SSE2__)
508   minimum = 2;
509 #elif defined(__SSE__)
510   minimum = 1;
511 #endif
512 
513   if (minimum == 3)
514     impl = search_line_sse42;
515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
516     {
517       if (minimum == 3 || (ecx & bit_SSE4_2))
518         impl = search_line_sse42;
519       else if (minimum == 2 || (edx & bit_SSE2))
520 	impl = search_line_sse2;
521       else if (minimum == 1 || (edx & bit_SSE))
522 	impl = search_line_mmx;
523     }
524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
525     {
526       if (minimum == 1
527 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528 	impl = search_line_mmx;
529     }
530 
531   search_line_fast = impl;
532 }
533 
534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
535 
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537    and VSX unaligned loads (when VSX is available).  This is otherwise
538    the same as the AltiVec version.  */
539 
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
543 {
544   typedef __attribute__((altivec(vector))) unsigned char vc;
545 
546   const vc repl_nl = {
547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
549   };
550   const vc repl_cr = {
551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
553   };
554   const vc repl_bs = {
555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
557   };
558   const vc repl_qm = {
559     '?', '?', '?', '?', '?', '?', '?', '?',
560     '?', '?', '?', '?', '?', '?', '?', '?',
561   };
562   const vc zero = { 0 };
563 
564   vc data, t;
565 
566   /* Main loop processing 16 bytes at a time.  */
567   do
568     {
569       vc m_nl, m_cr, m_bs, m_qm;
570 
571       data = __builtin_vec_vsx_ld (0, s);
572       s += 16;
573 
574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578       t = (m_nl | m_cr) | (m_bs | m_qm);
579 
580       /* T now contains 0xff in bytes for which we matched one of the relevant
581 	 characters.  We want to exit the loop if any byte in T is non-zero.
582 	 Below is the expansion of vec_any_ne(t, zero).  */
583     }
584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
585 
586   /* Restore s to to point to the 16 bytes we just processed.  */
587   s -= 16;
588 
589   {
590 #define N  (sizeof(vc) / sizeof(long))
591 
592     union {
593       vc v;
594       /* Statically assert that N is 2 or 4.  */
595       unsigned long l[(N == 2 || N == 4) ? N : -1];
596     } u;
597     unsigned long l, i = 0;
598 
599     u.v = t;
600 
601     /* Find the first word of T that is non-zero.  */
602     switch (N)
603       {
604       case 4:
605 	l = u.l[i++];
606 	if (l != 0)
607 	  break;
608 	s += sizeof(unsigned long);
609 	l = u.l[i++];
610 	if (l != 0)
611 	  break;
612 	s += sizeof(unsigned long);
613 	/* FALLTHRU */
614       case 2:
615 	l = u.l[i++];
616 	if (l != 0)
617 	  break;
618 	s += sizeof(unsigned long);
619 	l = u.l[i];
620       }
621 
622     /* L now contains 0xff in bytes for which we matched one of the
623        relevant characters.  We can find the byte index by finding
624        its bit index and dividing by 8.  */
625 #ifdef __BIG_ENDIAN__
626     l = __builtin_clzl(l) >> 3;
627 #else
628     l = __builtin_ctzl(l) >> 3;
629 #endif
630     return s + l;
631 
632 #undef N
633   }
634 }
635 
636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
637 
638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
639    This cannot be used for little endian because vec_lvsl/lvsr are
640    deprecated for little endian and the code won't work properly.  */
641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642    so we can't compile this function without -maltivec on the command line
643    (or implied by some other switch).  */
644 
645 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
647 {
648   typedef __attribute__((altivec(vector))) unsigned char vc;
649 
650   const vc repl_nl = {
651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
653   };
654   const vc repl_cr = {
655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
657   };
658   const vc repl_bs = {
659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
661   };
662   const vc repl_qm = {
663     '?', '?', '?', '?', '?', '?', '?', '?',
664     '?', '?', '?', '?', '?', '?', '?', '?',
665   };
666   const vc ones = {
667     -1, -1, -1, -1, -1, -1, -1, -1,
668     -1, -1, -1, -1, -1, -1, -1, -1,
669   };
670   const vc zero = { 0 };
671 
672   vc data, mask, t;
673 
674   /* Altivec loads automatically mask addresses with -16.  This lets us
675      issue the first load as early as possible.  */
676   data = __builtin_vec_ld(0, (const vc *)s);
677 
678   /* Discard bytes before the beginning of the buffer.  Do this by
679      beginning with all ones and shifting in zeros according to the
680      mis-alignment.  The LVSR instruction pulls the exact shift we
681      want from the address.  */
682   mask = __builtin_vec_lvsr(0, s);
683   mask = __builtin_vec_perm(zero, ones, mask);
684   data &= mask;
685 
686   /* While altivec loads mask addresses, we still need to align S so
687      that the offset we compute at the end is correct.  */
688   s = (const uchar *)((uintptr_t)s & -16);
689 
690   /* Main loop processing 16 bytes at a time.  */
691   goto start;
692   do
693     {
694       vc m_nl, m_cr, m_bs, m_qm;
695 
696       s += 16;
697       data = __builtin_vec_ld(0, (const vc *)s);
698 
699     start:
700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704       t = (m_nl | m_cr) | (m_bs | m_qm);
705 
706       /* T now contains 0xff in bytes for which we matched one of the relevant
707 	 characters.  We want to exit the loop if any byte in T is non-zero.
708 	 Below is the expansion of vec_any_ne(t, zero).  */
709     }
710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
711 
712   {
713 #define N  (sizeof(vc) / sizeof(long))
714 
715     union {
716       vc v;
717       /* Statically assert that N is 2 or 4.  */
718       unsigned long l[(N == 2 || N == 4) ? N : -1];
719     } u;
720     unsigned long l, i = 0;
721 
722     u.v = t;
723 
724     /* Find the first word of T that is non-zero.  */
725     switch (N)
726       {
727       case 4:
728 	l = u.l[i++];
729 	if (l != 0)
730 	  break;
731 	s += sizeof(unsigned long);
732 	l = u.l[i++];
733 	if (l != 0)
734 	  break;
735 	s += sizeof(unsigned long);
736 	/* FALLTHROUGH */
737       case 2:
738 	l = u.l[i++];
739 	if (l != 0)
740 	  break;
741 	s += sizeof(unsigned long);
742 	l = u.l[i];
743       }
744 
745     /* L now contains 0xff in bytes for which we matched one of the
746        relevant characters.  We can find the byte index by finding
747        its bit index and dividing by 8.  */
748     l = __builtin_clzl(l) >> 3;
749     return s + l;
750 
751 #undef N
752   }
753 }
754 
755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756 #include "arm_neon.h"
757 
758 /* This doesn't have to be the exact page size, but no system may use
759    a size smaller than this.  ARMv8 requires a minimum page size of
760    4k.  The impact of being conservative here is a small number of
761    cases will take the slightly slower entry path into the main
762    loop.  */
763 
764 #define AARCH64_MIN_PAGE_SIZE 4096
765 
766 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
768 {
769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
774 
775 #ifdef __ARM_BIG_ENDIAN
776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777 #else
778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779 #endif
780 
781   unsigned int found;
782   const uint8_t *p;
783   uint8x16_t data;
784   uint8x16_t t;
785   uint16x8_t m;
786   uint8x16_t u, v, w;
787 
788   /* Align the source pointer.  */
789   p = (const uint8_t *)((uintptr_t)s & -16);
790 
791   /* Assuming random string start positions, with a 4k page size we'll take
792      the slow path about 0.37% of the time.  */
793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794 			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795 			< 16, 0))
796     {
797       /* Slow path: the string starts near a possible page boundary.  */
798       uint32_t misalign, mask;
799 
800       misalign = (uintptr_t)s & 15;
801       mask = (-1u << misalign) & 0xffff;
802       data = vld1q_u8 (p);
803       t = vceqq_u8 (data, repl_nl);
804       u = vceqq_u8 (data, repl_cr);
805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807       t = vorrq_u8 (v, w);
808       t = vandq_u8 (t, xmask);
809       m = vpaddlq_u8 (t);
810       m = vshlq_u16 (m, shift);
811       found = vaddvq_u16 (m);
812       found &= mask;
813       if (found)
814 	return (const uchar*)p + __builtin_ctz (found);
815     }
816   else
817     {
818       data = vld1q_u8 ((const uint8_t *) s);
819       t = vceqq_u8 (data, repl_nl);
820       u = vceqq_u8 (data, repl_cr);
821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823       t = vorrq_u8 (v, w);
824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825 	goto done;
826     }
827 
828   do
829     {
830       p += 16;
831       data = vld1q_u8 (p);
832       t = vceqq_u8 (data, repl_nl);
833       u = vceqq_u8 (data, repl_cr);
834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836       t = vorrq_u8 (v, w);
837     } while (!vpaddd_u64 ((uint64x2_t)t));
838 
839 done:
840   /* Now that we've found the terminating substring, work out precisely where
841      we need to stop.  */
842   t = vandq_u8 (t, xmask);
843   m = vpaddlq_u8 (t);
844   m = vshlq_u16 (m, shift);
845   found = vaddvq_u16 (m);
846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847 	  + __builtin_ctz (found));
848 }
849 
850 #elif defined (__ARM_NEON)
851 #include "arm_neon.h"
852 
853 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
855 {
856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
861 
862   unsigned int misalign, found, mask;
863   const uint8_t *p;
864   uint8x16_t data;
865 
866   /* Align the source pointer.  */
867   misalign = (uintptr_t)s & 15;
868   p = (const uint8_t *)((uintptr_t)s & -16);
869   data = vld1q_u8 (p);
870 
871   /* Create a mask for the bytes that are valid within the first
872      16-byte block.  The Idea here is that the AND with the mask
873      within the loop is "free", since we need some AND or TEST
874      insn in order to set the flags for the branch anyway.  */
875   mask = (-1u << misalign) & 0xffff;
876 
877   /* Main loop, processing 16 bytes at a time.  */
878   goto start;
879 
880   do
881     {
882       uint8x8_t l;
883       uint16x4_t m;
884       uint32x2_t n;
885       uint8x16_t t, u, v, w;
886 
887       p += 16;
888       data = vld1q_u8 (p);
889       mask = 0xffff;
890 
891     start:
892       t = vceqq_u8 (data, repl_nl);
893       u = vceqq_u8 (data, repl_cr);
894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898       m = vpaddl_u8 (l);
899       n = vpaddl_u16 (m);
900 
901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903       found &= mask;
904     }
905   while (!found);
906 
907   /* FOUND contains 1 in bits for which we matched a relevant
908      character.  Conversion to the byte index is trivial.  */
909   found = __builtin_ctz (found);
910   return (const uchar *)p + found;
911 }
912 
913 #else
914 
915 /* We only have one accelerated alternative.  Use a direct call so that
916    we encourage inlining.  */
917 
918 #define search_line_fast  search_line_acc_char
919 
920 #endif
921 
/* One-time lexer set-up.  When the build defines
   HAVE_init_vectorized_lexer this calls init_vectorized_lexer ();
   otherwise the function does nothing.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
931 
932 /* Returns with a logical line that contains no escaped newlines or
933    trigraphs.  This is a time-critical inner loop.  */
934 void
_cpp_clean_line(cpp_reader * pfile)935 _cpp_clean_line (cpp_reader *pfile)
936 {
937   cpp_buffer *buffer;
938   const uchar *s;
939   uchar c, *d, *p;
940 
941   buffer = pfile->buffer;
942   buffer->cur_note = buffer->notes_used = 0;
943   buffer->cur = buffer->line_base = buffer->next_line;
944   buffer->need_line = false;
945   s = buffer->next_line;
946 
947   if (!buffer->from_stage3)
948     {
949       const uchar *pbackslash = NULL;
950 
951       /* Fast path.  This is the common case of an un-escaped line with
952 	 no trigraphs.  The primary win here is by not writing any
953 	 data back to memory until we have to.  */
954       while (1)
955 	{
956 	  /* Perform an optimized search for \n, \r, \\, ?.  */
957 	  s = search_line_fast (s, buffer->rlimit);
958 
959 	  c = *s;
960 	  if (c == '\\')
961 	    {
962 	      /* Record the location of the backslash and continue.  */
963 	      pbackslash = s++;
964 	    }
965 	  else if (__builtin_expect (c == '?', 0))
966 	    {
967 	      if (__builtin_expect (s[1] == '?', false)
968 		   && _cpp_trigraph_map[s[2]])
969 		{
970 		  /* Have a trigraph.  We may or may not have to convert
971 		     it.  Add a line note regardless, for -Wtrigraphs.  */
972 		  add_line_note (buffer, s, s[2]);
973 		  if (CPP_OPTION (pfile, trigraphs))
974 		    {
975 		      /* We do, and that means we have to switch to the
976 		         slow path.  */
977 		      d = (uchar *) s;
978 		      *d = _cpp_trigraph_map[s[2]];
979 		      s += 2;
980 		      goto slow_path;
981 		    }
982 		}
983 	      /* Not a trigraph.  Continue on fast-path.  */
984 	      s++;
985 	    }
986 	  else
987 	    break;
988 	}
989 
990       /* This must be \r or \n.  We're either done, or we'll be forced
991 	 to write back to the buffer and continue on the slow path.  */
992       d = (uchar *) s;
993 
994       if (__builtin_expect (s == buffer->rlimit, false))
995 	goto done;
996 
997       /* DOS line ending? */
998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
999 	{
1000 	  s++;
1001 	  if (s == buffer->rlimit)
1002 	    goto done;
1003 	}
1004 
1005       if (__builtin_expect (pbackslash == NULL, true))
1006 	goto done;
1007 
1008       /* Check for escaped newline.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011 	p--;
1012       if (p - 1 != pbackslash)
1013 	goto done;
1014 
1015       /* Have an escaped newline; process it and proceed to
1016 	 the slow path.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020 
1021     slow_path:
1022       while (1)
1023 	{
1024 	  c = *++s;
1025 	  *++d = c;
1026 
1027 	  if (c == '\n' || c == '\r')
1028 	    {
1029 	      /* Handle DOS line endings.  */
1030 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031 		s++;
1032 	      if (s == buffer->rlimit)
1033 		break;
1034 
1035 	      /* Escaped?  */
1036 	      p = d;
1037 	      while (p != buffer->next_line && is_nvspace (p[-1]))
1038 		p--;
1039 	      if (p == buffer->next_line || p[-1] != '\\')
1040 		break;
1041 
1042 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043 	      d = p - 2;
1044 	      buffer->next_line = p - 1;
1045 	    }
1046 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047 	    {
1048 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049 	      add_line_note (buffer, d, s[2]);
1050 	      if (CPP_OPTION (pfile, trigraphs))
1051 		{
1052 		  *d = _cpp_trigraph_map[s[2]];
1053 		  s += 2;
1054 		}
1055 	    }
1056 	}
1057     }
1058   else
1059     {
1060       while (*s != '\n' && *s != '\r')
1061 	s++;
1062       d = (uchar *) s;
1063 
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066 	s++;
1067     }
1068 
1069  done:
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075 
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
warn_in_comment(cpp_reader * pfile,_cpp_line_note * note)1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082 
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088 
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093 
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098 
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103 
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
_cpp_process_line_notes(cpp_reader * pfile,int in_comment)1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110 
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115 
1116       if (note->pos > buffer->cur)
1117 	break;
1118 
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121 
1122       if (note->type == '\\' || note->type == ' ')
1123 	{
1124 	  if (note->type == ' ' && !in_comment)
1125 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126 				 "backslash and newline separated by space");
1127 
1128 	  if (buffer->next_line > buffer->rlimit)
1129 	    {
1130 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131 				   "backslash-newline at end of file");
1132 	      /* Prevent "no newline at end of file" warning.  */
1133 	      buffer->next_line = buffer->rlimit;
1134 	    }
1135 
1136 	  buffer->line_base = note->pos;
1137 	  CPP_INCREMENT_LINE (pfile, 0);
1138 	}
1139       else if (_cpp_trigraph_map[note->type])
1140 	{
1141 	  if (CPP_OPTION (pfile, warn_trigraphs)
1142 	      && (!in_comment || warn_in_comment (pfile, note)))
1143 	    {
1144 	      if (CPP_OPTION (pfile, trigraphs))
1145 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147 				       "trigraph ??%c converted to %c",
1148 				       note->type,
1149 				       (int) _cpp_trigraph_map[note->type]);
1150 	      else
1151 		{
1152 		  cpp_warning_with_line
1153 		    (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155 		     "trigraph ??%c ignored, use -trigraphs to enable",
1156 		     note->type);
1157 		}
1158 	    }
1159 	}
1160       else if (note->type == 0)
1161 	/* Already processed in lex_raw_string.  */;
1162       else
1163 	abort ();
1164     }
1165 }
1166 
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170 
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
_cpp_skip_block_comment(cpp_reader * pfile)1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178 
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182 
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186 	 instead for efficiency.  */
1187       c = *cur++;
1188 
1189       if (c == '/')
1190 	{
1191 	  if (cur[-2] == '*')
1192 	    break;
1193 
1194 	  /* Warn about potential nested comments, but not if the '/'
1195 	     comes immediately before the true comment delimiter.
1196 	     Don't bother to get it right across escaped newlines.  */
1197 	  if (CPP_OPTION (pfile, warn_comments)
1198 	      && cur[0] == '*' && cur[1] != '/')
1199 	    {
1200 	      buffer->cur = cur;
1201 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202 				     pfile->line_table->highest_line,
1203 				     CPP_BUF_COL (buffer),
1204 				     "\"/*\" within comment");
1205 	    }
1206 	}
1207       else if (c == '\n')
1208 	{
1209 	  unsigned int cols;
1210 	  buffer->cur = cur - 1;
1211 	  _cpp_process_line_notes (pfile, true);
1212 	  if (buffer->next_line >= buffer->rlimit)
1213 	    return true;
1214 	  _cpp_clean_line (pfile);
1215 
1216 	  cols = buffer->next_line - buffer->line_base;
1217 	  CPP_INCREMENT_LINE (pfile, cols);
1218 
1219 	  cur = buffer->cur;
1220 	}
1221     }
1222 
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227 
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
skip_line_comment(cpp_reader * pfile)1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236 
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239 
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243 
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
skip_whitespace(cpp_reader * pfile,cppchar_t c)1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250 
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255 	;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258 	saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261 			     CPP_BUF_COL (buffer),
1262 			     "%s in preprocessing directive",
1263 			     c == '\f' ? "form feed" : "vertical tab");
1264 
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269 
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272 
1273   buffer->cur--;
1274 }
1275 
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
name_p(cpp_reader * pfile,const cpp_string * string)1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282 
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286 
1287   return 1;
1288 }
1289 
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
warn_about_normalization(cpp_reader * pfile,const cpp_token * token,const struct normalize_state * s)1293 warn_about_normalization (cpp_reader *pfile,
1294 			  const cpp_token *token,
1295 			  const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301 	 if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304 
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308 			       "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311 			       "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315 
1316 static const cppchar_t utf8_signifier = 0xC0;
1317 
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319    an identifier.  FIRST is TRUE if this starts an identifier.  */
1320 static bool
forms_identifier_p(cpp_reader * pfile,int first,struct normalize_state * state)1321 forms_identifier_p (cpp_reader *pfile, int first,
1322 		    struct normalize_state *state)
1323 {
1324   cpp_buffer *buffer = pfile->buffer;
1325 
1326   if (*buffer->cur == '$')
1327     {
1328       if (!CPP_OPTION (pfile, dollars_in_ident))
1329 	return false;
1330 
1331       buffer->cur++;
1332       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333 	{
1334 	  CPP_OPTION (pfile, warn_dollars) = 0;
1335 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336 	}
1337 
1338       return true;
1339     }
1340 
1341   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1342   if (CPP_OPTION (pfile, extended_identifiers))
1343     {
1344       cppchar_t s;
1345       if (*buffer->cur >= utf8_signifier)
1346 	{
1347 	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348 			       state, &s))
1349 	    return true;
1350 	}
1351       else if (*buffer->cur == '\\'
1352 	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353 	{
1354 	  buffer->cur += 2;
1355 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356 			      state, &s, NULL, NULL))
1357 	    return true;
1358 	  buffer->cur -= 2;
1359 	}
1360     }
1361 
1362   return false;
1363 }
1364 
1365 /* Helper function to issue error about improper __VA_OPT__ use.  */
1366 static void
maybe_va_opt_error(cpp_reader * pfile)1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370     {
1371       /* __VA_OPT__ should not be accepted at all, but allow it in
1372 	 system headers.  */
1373       if (!cpp_in_system_header (pfile))
1374 	cpp_error (pfile, CPP_DL_PEDWARN,
1375 		   "__VA_OPT__ is not available until C++2a");
1376     }
1377   else if (!pfile->state.va_args_ok)
1378     {
1379       /* __VA_OPT__ should only appear in the replacement list of a
1380 	 variadic macro.  */
1381       cpp_error (pfile, CPP_DL_PEDWARN,
1382 		 "__VA_OPT__ can only appear in the expansion"
1383 		 " of a C++2a variadic macro");
1384     }
1385 }
1386 
1387 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1388 static cpp_hashnode *
lex_identifier_intern(cpp_reader * pfile,const uchar * base)1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391   cpp_hashnode *result;
1392   const uchar *cur;
1393   unsigned int len;
1394   unsigned int hash = HT_HASHSTEP (0, *base);
1395 
1396   cur = base + 1;
1397   while (ISIDNUM (*cur))
1398     {
1399       hash = HT_HASHSTEP (hash, *cur);
1400       cur++;
1401     }
1402   len = cur - base;
1403   hash = HT_HASHFINISH (hash, len);
1404   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405 					      base, len, hash, HT_ALLOC));
1406 
1407   /* Rarely, identifiers require diagnostics when lexed.  */
1408   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409 			&& !pfile->state.skipping, 0))
1410     {
1411       /* It is allowed to poison the same identifier twice.  */
1412       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414 		   NODE_NAME (result));
1415 
1416       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417 	 replacement list of a variadic macro.  */
1418       if (result == pfile->spec_nodes.n__VA_ARGS__
1419 	  && !pfile->state.va_args_ok)
1420 	{
1421 	  if (CPP_OPTION (pfile, cplusplus))
1422 	    cpp_error (pfile, CPP_DL_PEDWARN,
1423 		       "__VA_ARGS__ can only appear in the expansion"
1424 		       " of a C++11 variadic macro");
1425 	  else
1426 	    cpp_error (pfile, CPP_DL_PEDWARN,
1427 		       "__VA_ARGS__ can only appear in the expansion"
1428 		       " of a C99 variadic macro");
1429 	}
1430 
1431       if (result == pfile->spec_nodes.n__VA_OPT__)
1432 	maybe_va_opt_error (pfile);
1433 
1434       /* For -Wc++-compat, warn about use of C++ named operators.  */
1435       if (result->flags & NODE_WARN_OPERATOR)
1436 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437 		     "identifier \"%s\" is a special operator name in C++",
1438 		     NODE_NAME (result));
1439     }
1440 
1441   return result;
1442 }
1443 
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445    the current cpp_reader object.  If none is found, NULL is returned.  */
1446 cpp_hashnode *
_cpp_lex_identifier(cpp_reader * pfile,const char * name)1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449   cpp_hashnode *result;
1450   result = lex_identifier_intern (pfile, (uchar *) name);
1451   return result;
1452 }
1453 
1454 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1455 static cpp_hashnode *
lex_identifier(cpp_reader * pfile,const uchar * base,bool starts_ucn,struct normalize_state * nst,cpp_hashnode ** spelling)1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457 		struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459   cpp_hashnode *result;
1460   const uchar *cur;
1461   unsigned int len;
1462   unsigned int hash = HT_HASHSTEP (0, *base);
1463 
1464   cur = pfile->buffer->cur;
1465   if (! starts_ucn)
1466     {
1467       while (ISIDNUM (*cur))
1468 	{
1469 	  hash = HT_HASHSTEP (hash, *cur);
1470 	  cur++;
1471 	}
1472       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473     }
1474   pfile->buffer->cur = cur;
1475   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476     {
1477       /* Slower version for identifiers containing UCNs
1478 	 or extended chars (including $).  */
1479       do {
1480 	while (ISIDNUM (*pfile->buffer->cur))
1481 	  {
1482 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483 	    pfile->buffer->cur++;
1484 	  }
1485       } while (forms_identifier_p (pfile, false, nst));
1486       result = _cpp_interpret_identifier (pfile, base,
1487 					  pfile->buffer->cur - base);
1488       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489     }
1490   else
1491     {
1492       len = cur - base;
1493       hash = HT_HASHFINISH (hash, len);
1494 
1495       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496 						  base, len, hash, HT_ALLOC));
1497       *spelling = result;
1498     }
1499 
1500   /* Rarely, identifiers require diagnostics when lexed.  */
1501   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502 			&& !pfile->state.skipping, 0))
1503     {
1504       /* It is allowed to poison the same identifier twice.  */
1505       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507 		   NODE_NAME (result));
1508 
1509       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510 	 replacement list of a variadic macro.  */
1511       if (result == pfile->spec_nodes.n__VA_ARGS__
1512 	  && !pfile->state.va_args_ok)
1513 	{
1514 	  if (CPP_OPTION (pfile, cplusplus))
1515 	    cpp_error (pfile, CPP_DL_PEDWARN,
1516 		       "__VA_ARGS__ can only appear in the expansion"
1517 		       " of a C++11 variadic macro");
1518 	  else
1519 	    cpp_error (pfile, CPP_DL_PEDWARN,
1520 		       "__VA_ARGS__ can only appear in the expansion"
1521 		       " of a C99 variadic macro");
1522 	}
1523 
1524       /* __VA_OPT__ should only appear in the replacement list of a
1525 	 variadic macro.  */
1526       if (result == pfile->spec_nodes.n__VA_OPT__)
1527 	maybe_va_opt_error (pfile);
1528 
1529       /* For -Wc++-compat, warn about use of C++ named operators.  */
1530       if (result->flags & NODE_WARN_OPERATOR)
1531 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532 		     "identifier \"%s\" is a special operator name in C++",
1533 		     NODE_NAME (result));
1534     }
1535 
1536   return result;
1537 }
1538 
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1540 static void
lex_number(cpp_reader * pfile,cpp_string * number,struct normalize_state * nst)1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542 	    struct normalize_state *nst)
1543 {
1544   const uchar *cur;
1545   const uchar *base;
1546   uchar *dest;
1547 
1548   base = pfile->buffer->cur - 1;
1549   do
1550     {
1551       cur = pfile->buffer->cur;
1552 
1553       /* N.B. ISIDNUM does not include $.  */
1554       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1555 	     || VALID_SIGN (*cur, cur[-1]))
1556 	{
1557 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1558 	  cur++;
1559 	}
1560       /* A number can't end with a digit separator.  */
1561       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1562 	--cur;
1563 
1564       pfile->buffer->cur = cur;
1565     }
1566   while (forms_identifier_p (pfile, false, nst));
1567 
1568   number->len = cur - base;
1569   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1570   memcpy (dest, base, number->len);
1571   dest[number->len] = '\0';
1572   number->text = dest;
1573 }
1574 
1575 /* Create a token of type TYPE with a literal spelling.  */
1576 static void
create_literal(cpp_reader * pfile,cpp_token * token,const uchar * base,unsigned int len,enum cpp_ttype type)1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1578 		unsigned int len, enum cpp_ttype type)
1579 {
1580   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1581 
1582   memcpy (dest, base, len);
1583   dest[len] = '\0';
1584   token->type = type;
1585   token->val.str.len = len;
1586   token->val.str.text = dest;
1587 }
1588 
1589 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1590    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1591 
1592 static void
bufring_append(cpp_reader * pfile,const uchar * base,size_t len,_cpp_buff ** first_buff_p,_cpp_buff ** last_buff_p)1593 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1594 		_cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1595 {
1596   _cpp_buff *first_buff = *first_buff_p;
1597   _cpp_buff *last_buff = *last_buff_p;
1598 
1599   if (first_buff == NULL)
1600     first_buff = last_buff = _cpp_get_buff (pfile, len);
1601   else if (len > BUFF_ROOM (last_buff))
1602     {
1603       size_t room = BUFF_ROOM (last_buff);
1604       memcpy (BUFF_FRONT (last_buff), base, room);
1605       BUFF_FRONT (last_buff) += room;
1606       base += room;
1607       len -= room;
1608       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1609     }
1610 
1611   memcpy (BUFF_FRONT (last_buff), base, len);
1612   BUFF_FRONT (last_buff) += len;
1613 
1614   *first_buff_p = first_buff;
1615   *last_buff_p = last_buff;
1616 }
1617 
1618 
1619 /* Returns true if a macro has been defined.
1620    This might not work if compile with -save-temps,
1621    or preprocess separately from compilation.  */
1622 
1623 static bool
is_macro(cpp_reader * pfile,const uchar * base)1624 is_macro(cpp_reader *pfile, const uchar *base)
1625 {
1626   const uchar *cur = base;
1627   if (! ISIDST (*cur))
1628     return false;
1629   unsigned int hash = HT_HASHSTEP (0, *cur);
1630   ++cur;
1631   while (ISIDNUM (*cur))
1632     {
1633       hash = HT_HASHSTEP (hash, *cur);
1634       ++cur;
1635     }
1636   hash = HT_HASHFINISH (hash, cur - base);
1637 
1638   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1639 					base, cur - base, hash, HT_NO_INSERT));
1640 
1641   return result && cpp_macro_p (result);
1642 }
1643 
1644 /* Returns true if a literal suffix does not have the expected form
1645    and is defined as a macro.  */
1646 
1647 static bool
is_macro_not_literal_suffix(cpp_reader * pfile,const uchar * base)1648 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1649 {
1650   /* User-defined literals outside of namespace std must start with a single
1651      underscore, so assume anything of that form really is a UDL suffix.
1652      We don't need to worry about UDLs defined inside namespace std because
1653      their names are reserved, so cannot be used as macro names in valid
1654      programs.  */
1655   if (base[0] == '_' && base[1] != '_')
1656     return false;
1657   return is_macro (pfile, base);
1658 }
1659 
1660 /* Lexes a raw string.  The stored string contains the spelling, including
1661    double quotes, delimiter string, '(' and ')', any leading
1662    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1663    literal, or CPP_OTHER if it was not properly terminated.
1664 
1665    The spelling is NUL-terminated, but it is not guaranteed that this
1666    is the first NUL since embedded NULs are preserved.  */
1667 
1668 static void
lex_raw_string(cpp_reader * pfile,cpp_token * token,const uchar * base,const uchar * cur)1669 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1670 		const uchar *cur)
1671 {
1672   uchar raw_prefix[17];
1673   uchar temp_buffer[18];
1674   const uchar *orig_base;
1675   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1676   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1677   raw_str_phase phase = RAW_STR_PREFIX;
1678   enum cpp_ttype type;
1679   size_t total_len = 0;
1680   /* Index into temp_buffer during phases other than RAW_STR,
1681      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1682      be appended to temp_buffer.  */
1683   size_t temp_buffer_len = 0;
1684   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1685   size_t raw_prefix_start;
1686   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1687 
1688   type = (*base == 'L' ? CPP_WSTRING :
1689 	  *base == 'U' ? CPP_STRING32 :
1690 	  *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1691 	  : CPP_STRING);
1692 
1693 #define BUF_APPEND(STR,LEN)					\
1694       do {							\
1695 	bufring_append (pfile, (const uchar *)(STR), (LEN),	\
1696 			&first_buff, &last_buff);		\
1697 	total_len += (LEN);					\
1698 	if (__builtin_expect (temp_buffer_len < 17, 0)		\
1699 	    && (const uchar *)(STR) != base			\
1700 	    && (LEN) <= 2)					\
1701 	  {							\
1702 	    memcpy (temp_buffer + temp_buffer_len,		\
1703 		    (const uchar *)(STR), (LEN));		\
1704 	    temp_buffer_len += (LEN);				\
1705 	  }							\
1706       } while (0)
1707 
1708   orig_base = base;
1709   ++cur;
1710   raw_prefix_start = cur - base;
1711   for (;;)
1712     {
1713       cppchar_t c;
1714 
1715       /* If we previously performed any trigraph or line splicing
1716 	 transformations, undo them in between the opening and closing
1717 	 double quote.  */
1718       while (note->pos < cur)
1719 	++note;
1720       for (; note->pos == cur; ++note)
1721 	{
1722 	  switch (note->type)
1723 	    {
1724 	    case '\\':
1725 	    case ' ':
1726 	      /* Restore backslash followed by newline.  */
1727 	      BUF_APPEND (base, cur - base);
1728 	      base = cur;
1729 	      BUF_APPEND ("\\", 1);
1730 	    after_backslash:
1731 	      if (note->type == ' ')
1732 		{
1733 		  /* GNU backslash whitespace newline extension.  FIXME
1734 		     could be any sequence of non-vertical space.  When we
1735 		     can properly restore any such sequence, we should mark
1736 		     this note as handled so _cpp_process_line_notes
1737 		     doesn't warn.  */
1738 		  BUF_APPEND (" ", 1);
1739 		}
1740 
1741 	      BUF_APPEND ("\n", 1);
1742 	      break;
1743 
1744 	    case 0:
1745 	      /* Already handled.  */
1746 	      break;
1747 
1748 	    default:
1749 	      if (_cpp_trigraph_map[note->type])
1750 		{
1751 		  /* Don't warn about this trigraph in
1752 		     _cpp_process_line_notes, since trigraphs show up as
1753 		     trigraphs in raw strings.  */
1754 		  uchar type = note->type;
1755 		  note->type = 0;
1756 
1757 		  if (!CPP_OPTION (pfile, trigraphs))
1758 		    /* If we didn't convert the trigraph in the first
1759 		       place, don't do anything now either.  */
1760 		    break;
1761 
1762 		  BUF_APPEND (base, cur - base);
1763 		  base = cur;
1764 		  BUF_APPEND ("??", 2);
1765 
1766 		  /* ??/ followed by newline gets two line notes, one for
1767 		     the trigraph and one for the backslash/newline.  */
1768 		  if (type == '/' && note[1].pos == cur)
1769 		    {
1770 		      if (note[1].type != '\\'
1771 			  && note[1].type != ' ')
1772 			abort ();
1773 		      BUF_APPEND ("/", 1);
1774 		      ++note;
1775 		      goto after_backslash;
1776 		    }
1777 		  else
1778 		    {
1779 		      /* Skip the replacement character.  */
1780 		      base = ++cur;
1781 		      BUF_APPEND (&type, 1);
1782 		      c = type;
1783 		      goto check_c;
1784 		    }
1785 		}
1786 	      else
1787 		abort ();
1788 	      break;
1789 	    }
1790 	}
1791       c = *cur++;
1792       if (__builtin_expect (temp_buffer_len < 17, 0))
1793 	temp_buffer[temp_buffer_len++] = c;
1794 
1795      check_c:
1796       if (phase == RAW_STR_PREFIX)
1797 	{
1798 	  while (raw_prefix_len < temp_buffer_len)
1799 	    {
1800 	      raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1801 	      switch (raw_prefix[raw_prefix_len])
1802 		{
1803 		case ' ': case '(': case ')': case '\\': case '\t':
1804 		case '\v': case '\f': case '\n': default:
1805 		  break;
1806 		/* Basic source charset except the above chars.  */
1807 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1808 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1809 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1810 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1811 		case 'y': case 'z':
1812 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1813 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1814 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1815 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1816 		case 'Y': case 'Z':
1817 		case '0': case '1': case '2': case '3': case '4': case '5':
1818 		case '6': case '7': case '8': case '9':
1819 		case '_': case '{': case '}': case '#': case '[': case ']':
1820 		case '<': case '>': case '%': case ':': case ';': case '.':
1821 		case '?': case '*': case '+': case '-': case '/': case '^':
1822 		case '&': case '|': case '~': case '!': case '=': case ',':
1823 		case '"': case '\'':
1824 		  if (raw_prefix_len < 16)
1825 		    {
1826 		      raw_prefix_len++;
1827 		      continue;
1828 		    }
1829 		  break;
1830 		}
1831 
1832 	      if (raw_prefix[raw_prefix_len] != '(')
1833 		{
1834 		  int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1835 		  if (raw_prefix_len == 16)
1836 		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1837 					 col, "raw string delimiter longer "
1838 					      "than 16 characters");
1839 		  else if (raw_prefix[raw_prefix_len] == '\n')
1840 		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1841 					 col, "invalid new-line in raw "
1842 					      "string delimiter");
1843 		  else
1844 		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1845 					 col, "invalid character '%c' in "
1846 					      "raw string delimiter",
1847 					 (int) raw_prefix[raw_prefix_len]);
1848 		  pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1849 		  create_literal (pfile, token, orig_base,
1850 				  raw_prefix_start - 1, CPP_OTHER);
1851 		  if (first_buff)
1852 		    _cpp_release_buff (pfile, first_buff);
1853 		  return;
1854 		}
1855 	      raw_prefix[raw_prefix_len] = '"';
1856 	      phase = RAW_STR;
1857 	      /* Nothing should be appended to temp_buffer during
1858 		 RAW_STR phase.  */
1859 	      temp_buffer_len = 17;
1860 	      break;
1861 	    }
1862 	  continue;
1863 	}
1864       else if (phase == RAW_STR_SUFFIX)
1865 	{
1866 	  while (raw_suffix_len <= raw_prefix_len
1867 		 && raw_suffix_len < temp_buffer_len
1868 		 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1869 	    raw_suffix_len++;
1870 	  if (raw_suffix_len > raw_prefix_len)
1871 	    break;
1872 	  if (raw_suffix_len == temp_buffer_len)
1873 	    continue;
1874 	  phase = RAW_STR;
1875 	  /* Nothing should be appended to temp_buffer during
1876 	     RAW_STR phase.  */
1877 	  temp_buffer_len = 17;
1878 	}
1879       if (c == ')')
1880 	{
1881 	  phase = RAW_STR_SUFFIX;
1882 	  raw_suffix_len = 0;
1883 	  temp_buffer_len = 0;
1884 	}
1885       else if (c == '\n')
1886 	{
1887 	  if (pfile->state.in_directive
1888 	      || (pfile->state.parsing_args
1889 		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
1890 	    {
1891 	      cur--;
1892 	      type = CPP_OTHER;
1893 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1894 				   "unterminated raw string");
1895 	      break;
1896 	    }
1897 
1898 	  BUF_APPEND (base, cur - base);
1899 
1900 	  if (pfile->buffer->cur < pfile->buffer->rlimit)
1901 	    CPP_INCREMENT_LINE (pfile, 0);
1902 	  pfile->buffer->need_line = true;
1903 
1904 	  pfile->buffer->cur = cur-1;
1905 	  _cpp_process_line_notes (pfile, false);
1906 	  if (!_cpp_get_fresh_line (pfile))
1907 	    {
1908 	      location_t src_loc = token->src_loc;
1909 	      token->type = CPP_EOF;
1910 	      /* Tell the compiler the line number of the EOF token.  */
1911 	      token->src_loc = pfile->line_table->highest_line;
1912 	      token->flags = BOL;
1913 	      if (first_buff != NULL)
1914 		_cpp_release_buff (pfile, first_buff);
1915 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1916 				   "unterminated raw string");
1917 	      return;
1918 	    }
1919 
1920 	  cur = base = pfile->buffer->cur;
1921 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
1922 	}
1923     }
1924 
1925   if (CPP_OPTION (pfile, user_literals))
1926     {
1927       /* If a string format macro, say from inttypes.h, is placed touching
1928 	 a string literal it could be parsed as a C++11 user-defined string
1929 	 literal thus breaking the program.  */
1930       if (is_macro_not_literal_suffix (pfile, cur))
1931 	{
1932 	  /* Raise a warning, but do not consume subsequent tokens.  */
1933 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1934 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1935 				   token->src_loc, 0,
1936 				   "invalid suffix on literal; C++11 requires "
1937 				   "a space between literal and string macro");
1938 	}
1939       /* Grab user defined literal suffix.  */
1940       else if (ISIDST (*cur))
1941 	{
1942 	  type = cpp_userdef_string_add_type (type);
1943 	  ++cur;
1944 
1945 	  while (ISIDNUM (*cur))
1946 	    ++cur;
1947 	}
1948     }
1949 
1950   pfile->buffer->cur = cur;
1951   if (first_buff == NULL)
1952     create_literal (pfile, token, base, cur - base, type);
1953   else
1954     {
1955       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1956 
1957       token->type = type;
1958       token->val.str.len = total_len + (cur - base);
1959       token->val.str.text = dest;
1960       last_buff = first_buff;
1961       while (last_buff != NULL)
1962 	{
1963 	  memcpy (dest, last_buff->base,
1964 		  BUFF_FRONT (last_buff) - last_buff->base);
1965 	  dest += BUFF_FRONT (last_buff) - last_buff->base;
1966 	  last_buff = last_buff->next;
1967 	}
1968       _cpp_release_buff (pfile, first_buff);
1969       memcpy (dest, base, cur - base);
1970       dest[cur - base] = '\0';
1971     }
1972 }
1973 
1974 /* Lexes a string, character constant, or angle-bracketed header file
1975    name.  The stored string contains the spelling, including opening
1976    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1977    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1978    if it was not properly terminated, or CPP_LESS for an unterminated
1979    header name which must be relexed as normal tokens.
1980 
1981    The spelling is NUL-terminated, but it is not guaranteed that this
1982    is the first NUL since embedded NULs are preserved.  */
1983 static void
lex_string(cpp_reader * pfile,cpp_token * token,const uchar * base)1984 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1985 {
1986   bool saw_NUL = false;
1987   const uchar *cur;
1988   cppchar_t terminator;
1989   enum cpp_ttype type;
1990 
1991   cur = base;
1992   terminator = *cur++;
1993   if (terminator == 'L' || terminator == 'U')
1994     terminator = *cur++;
1995   else if (terminator == 'u')
1996     {
1997       terminator = *cur++;
1998       if (terminator == '8')
1999 	terminator = *cur++;
2000     }
2001   if (terminator == 'R')
2002     {
2003       lex_raw_string (pfile, token, base, cur);
2004       return;
2005     }
2006   if (terminator == '"')
2007     type = (*base == 'L' ? CPP_WSTRING :
2008 	    *base == 'U' ? CPP_STRING32 :
2009 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2010 			 : CPP_STRING);
2011   else if (terminator == '\'')
2012     type = (*base == 'L' ? CPP_WCHAR :
2013 	    *base == 'U' ? CPP_CHAR32 :
2014 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2015 			 : CPP_CHAR);
2016   else
2017     terminator = '>', type = CPP_HEADER_NAME;
2018 
2019   for (;;)
2020     {
2021       cppchar_t c = *cur++;
2022 
2023       /* In #include-style directives, terminators are not escapable.  */
2024       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2025 	cur++;
2026       else if (c == terminator)
2027 	break;
2028       else if (c == '\n')
2029 	{
2030 	  cur--;
2031 	  /* Unmatched quotes always yield undefined behavior, but
2032 	     greedy lexing means that what appears to be an unterminated
2033 	     header name may actually be a legitimate sequence of tokens.  */
2034 	  if (terminator == '>')
2035 	    {
2036 	      token->type = CPP_LESS;
2037 	      return;
2038 	    }
2039 	  type = CPP_OTHER;
2040 	  break;
2041 	}
2042       else if (c == '\0')
2043 	saw_NUL = true;
2044     }
2045 
2046   if (saw_NUL && !pfile->state.skipping)
2047     cpp_error (pfile, CPP_DL_WARNING,
2048 	       "null character(s) preserved in literal");
2049 
2050   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2051     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2052 	       (int) terminator);
2053 
2054   if (CPP_OPTION (pfile, user_literals))
2055     {
2056       /* If a string format macro, say from inttypes.h, is placed touching
2057 	 a string literal it could be parsed as a C++11 user-defined string
2058 	 literal thus breaking the program.  */
2059       if (is_macro_not_literal_suffix (pfile, cur))
2060 	{
2061 	  /* Raise a warning, but do not consume subsequent tokens.  */
2062 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2063 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2064 				   token->src_loc, 0,
2065 				   "invalid suffix on literal; C++11 requires "
2066 				   "a space between literal and string macro");
2067 	}
2068       /* Grab user defined literal suffix.  */
2069       else if (ISIDST (*cur))
2070 	{
2071 	  type = cpp_userdef_char_add_type (type);
2072 	  type = cpp_userdef_string_add_type (type);
2073           ++cur;
2074 
2075 	  while (ISIDNUM (*cur))
2076 	    ++cur;
2077 	}
2078     }
2079   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2080 	   && is_macro (pfile, cur)
2081 	   && !pfile->state.skipping)
2082     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2083 			   token->src_loc, 0, "C++11 requires a space "
2084 			   "between string literal and macro");
2085 
2086   pfile->buffer->cur = cur;
2087   create_literal (pfile, token, base, cur - base, type);
2088 }
2089 
2090 /* Return the comment table. The client may not make any assumption
2091    about the ordering of the table.  */
2092 cpp_comment_table *
cpp_get_comments(cpp_reader * pfile)2093 cpp_get_comments (cpp_reader *pfile)
2094 {
2095   return &pfile->comments;
2096 }
2097 
2098 /* Append a comment to the end of the comment table. */
2099 static void
store_comment(cpp_reader * pfile,cpp_token * token)2100 store_comment (cpp_reader *pfile, cpp_token *token)
2101 {
2102   int len;
2103 
2104   if (pfile->comments.allocated == 0)
2105     {
2106       pfile->comments.allocated = 256;
2107       pfile->comments.entries = (cpp_comment *) xmalloc
2108 	(pfile->comments.allocated * sizeof (cpp_comment));
2109     }
2110 
2111   if (pfile->comments.count == pfile->comments.allocated)
2112     {
2113       pfile->comments.allocated *= 2;
2114       pfile->comments.entries = (cpp_comment *) xrealloc
2115 	(pfile->comments.entries,
2116 	 pfile->comments.allocated * sizeof (cpp_comment));
2117     }
2118 
2119   len = token->val.str.len;
2120 
2121   /* Copy comment. Note, token may not be NULL terminated. */
2122   pfile->comments.entries[pfile->comments.count].comment =
2123     (char *) xmalloc (sizeof (char) * (len + 1));
2124   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2125 	  token->val.str.text, len);
2126   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2127 
2128   /* Set source location. */
2129   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2130 
2131   /* Increment the count of entries in the comment table. */
2132   pfile->comments.count++;
2133 }
2134 
2135 /* The stored comment includes the comment start and any terminator.  */
2136 static void
save_comment(cpp_reader * pfile,cpp_token * token,const unsigned char * from,cppchar_t type)2137 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2138 	      cppchar_t type)
2139 {
2140   unsigned char *buffer;
2141   unsigned int len, clen, i;
2142 
2143   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2144 
2145   /* C++ comments probably (not definitely) have moved past a new
2146      line, which we don't want to save in the comment.  */
2147   if (is_vspace (pfile->buffer->cur[-1]))
2148     len--;
2149 
2150   /* If we are currently in a directive or in argument parsing, then
2151      we need to store all C++ comments as C comments internally, and
2152      so we need to allocate a little extra space in that case.
2153 
2154      Note that the only time we encounter a directive here is
2155      when we are saving comments in a "#define".  */
2156   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2157 	  && type == '/') ? len + 2 : len;
2158 
2159   buffer = _cpp_unaligned_alloc (pfile, clen);
2160 
2161   token->type = CPP_COMMENT;
2162   token->val.str.len = clen;
2163   token->val.str.text = buffer;
2164 
2165   buffer[0] = '/';
2166   memcpy (buffer + 1, from, len - 1);
2167 
2168   /* Finish conversion to a C comment, if necessary.  */
2169   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2170     {
2171       buffer[1] = '*';
2172       buffer[clen - 2] = '*';
2173       buffer[clen - 1] = '/';
2174       /* As there can be in a C++ comments illegal sequences for C comments
2175          we need to filter them out.  */
2176       for (i = 2; i < (clen - 2); i++)
2177         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2178           buffer[i] = '|';
2179     }
2180 
2181   /* Finally store this comment for use by clients of libcpp. */
2182   store_comment (pfile, token);
2183 }
2184 
2185 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2186    comment.  */
2187 
2188 static bool
fallthrough_comment_p(cpp_reader * pfile,const unsigned char * comment_start)2189 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2190 {
2191   const unsigned char *from = comment_start + 1;
2192 
2193   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2194     {
2195       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2196 	 don't recognize any comments.  The latter only checks attributes,
2197 	 the former doesn't warn.  */
2198     case 0:
2199     default:
2200       return false;
2201       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2202 	 content it has.  */
2203     case 1:
2204       return true;
2205     case 2:
2206       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2207 	 .*falls?[ \t-]*thr(u|ough).* regex.  */
2208       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2209 	   from++)
2210 	{
2211 	  /* Is there anything like strpbrk with upper boundary, or
2212 	     memchr looking for 2 characters rather than just one?  */
2213 	  if (from[0] != 'f' && from[0] != 'F')
2214 	    continue;
2215 	  if (from[1] != 'a' && from[1] != 'A')
2216 	    continue;
2217 	  if (from[2] != 'l' && from[2] != 'L')
2218 	    continue;
2219 	  if (from[3] != 'l' && from[3] != 'L')
2220 	    continue;
2221 	  from += sizeof "fall" - 1;
2222 	  if (from[0] == 's' || from[0] == 'S')
2223 	    from++;
2224 	  while (*from == ' ' || *from == '\t' || *from == '-')
2225 	    from++;
2226 	  if (from[0] != 't' && from[0] != 'T')
2227 	    continue;
2228 	  if (from[1] != 'h' && from[1] != 'H')
2229 	    continue;
2230 	  if (from[2] != 'r' && from[2] != 'R')
2231 	    continue;
2232 	  if (from[3] == 'u' || from[3] == 'U')
2233 	    return true;
2234 	  if (from[3] != 'o' && from[3] != 'O')
2235 	    continue;
2236 	  if (from[4] != 'u' && from[4] != 'U')
2237 	    continue;
2238 	  if (from[5] != 'g' && from[5] != 'G')
2239 	    continue;
2240 	  if (from[6] != 'h' && from[6] != 'H')
2241 	    continue;
2242 	  return true;
2243 	}
2244       return false;
2245     case 3:
2246     case 4:
2247       break;
2248     }
2249 
2250   /* Whole comment contents:
2251      -fallthrough
2252      @fallthrough@
2253    */
2254   if (*from == '-' || *from == '@')
2255     {
2256       size_t len = sizeof "fallthrough" - 1;
2257       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2258 	return false;
2259       if (memcmp (from + 1, "fallthrough", len))
2260 	return false;
2261       if (*from == '@')
2262 	{
2263 	  if (from[len + 1] != '@')
2264 	    return false;
2265 	  len++;
2266 	}
2267       from += 1 + len;
2268     }
2269   /* Whole comment contents (regex):
2270      lint -fallthrough[ \t]*
2271    */
2272   else if (*from == 'l')
2273     {
2274       size_t len = sizeof "int -fallthrough" - 1;
2275       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2276 	return false;
2277       if (memcmp (from + 1, "int -fallthrough", len))
2278 	return false;
2279       from += 1 + len;
2280       while (*from == ' ' || *from == '\t')
2281 	from++;
2282     }
2283   /* Whole comment contents (regex):
2284      [ \t]*FALLTHR(U|OUGH)[ \t]*
2285    */
2286   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2287     {
2288       while (*from == ' ' || *from == '\t')
2289 	from++;
2290       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2291 	return false;
2292       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2293 	return false;
2294       from += sizeof "FALLTHR" - 1;
2295       if (*from == 'U')
2296 	from++;
2297       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2298 	return false;
2299       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2300 	return false;
2301       else
2302 	from += sizeof "OUGH" - 1;
2303       while (*from == ' ' || *from == '\t')
2304 	from++;
2305     }
2306   /* Whole comment contents (regex):
2307      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2308      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2309      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2310    */
2311   else
2312     {
2313       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2314 	from++;
2315       unsigned char f = *from;
2316       bool all_upper = false;
2317       if (f == 'E' || f == 'e')
2318 	{
2319 	  if ((size_t) (pfile->buffer->cur - from)
2320 	      < sizeof "else fallthru" - 1)
2321 	    return false;
2322 	  if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2323 	    all_upper = true;
2324 	  else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2325 	    return false;
2326 	  from += sizeof "else" - 1;
2327 	  if (*from == ',')
2328 	    from++;
2329 	  if (*from != ' ')
2330 	    return false;
2331 	  from++;
2332 	  if (all_upper && *from == 'f')
2333 	    return false;
2334 	  if (f == 'e' && *from == 'F')
2335 	    return false;
2336 	  f = *from;
2337 	}
2338       else if (f == 'I' || f == 'i')
2339 	{
2340 	  if ((size_t) (pfile->buffer->cur - from)
2341 	      < sizeof "intentional fallthru" - 1)
2342 	    return false;
2343 	  if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2344 				  sizeof "NTENTIONAL" - 1) == 0)
2345 	    all_upper = true;
2346 	  else if (memcmp (from + 1, "ntentional",
2347 			   sizeof "ntentional" - 1))
2348 	    return false;
2349 	  from += sizeof "intentional" - 1;
2350 	  if (*from == ' ')
2351 	    {
2352 	      from++;
2353 	      if (all_upper && *from == 'f')
2354 		return false;
2355 	    }
2356 	  else if (all_upper)
2357 	    {
2358 	      if (memcmp (from, "LY F", sizeof "LY F" - 1))
2359 		return false;
2360 	      from += sizeof "LY " - 1;
2361 	    }
2362 	  else
2363 	    {
2364 	      if (memcmp (from, "ly ", sizeof "ly " - 1))
2365 		return false;
2366 	      from += sizeof "ly " - 1;
2367 	    }
2368 	  if (f == 'i' && *from == 'F')
2369 	    return false;
2370 	  f = *from;
2371 	}
2372       if (f != 'F' && f != 'f')
2373 	return false;
2374       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2375 	return false;
2376       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2377 	all_upper = true;
2378       else if (all_upper)
2379 	return false;
2380       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2381 	return false;
2382       from += sizeof "fall" - 1;
2383       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2384 	from += 2;
2385       else if (*from == ' ' || *from == '-')
2386 	from++;
2387       else if (*from != (all_upper ? 'T' : 't'))
2388 	return false;
2389       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2390 	return false;
2391       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2392 	return false;
2393       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2394 	{
2395 	  if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2396 	    return false;
2397 	  if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2398 		      sizeof "hrough" - 1))
2399 	    return false;
2400 	  from += sizeof "through" - 1;
2401 	}
2402       else
2403 	from += sizeof "thru" - 1;
2404       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2405 	from++;
2406       if (*from == '-')
2407 	{
2408 	  from++;
2409 	  if (*comment_start == '*')
2410 	    {
2411 	      do
2412 		{
2413 		  while (*from && *from != '*'
2414 			 && *from != '\n' && *from != '\r')
2415 		    from++;
2416 		  if (*from != '*' || from[1] == '/')
2417 		    break;
2418 		  from++;
2419 		}
2420 	      while (1);
2421 	    }
2422 	  else
2423 	    while (*from && *from != '\n' && *from != '\r')
2424 	      from++;
2425 	}
2426     }
2427   /* C block comment.  */
2428   if (*comment_start == '*')
2429     {
2430       if (*from != '*' || from[1] != '/')
2431 	return false;
2432     }
2433   /* C++ line comment.  */
2434   else if (*from != '\n')
2435     return false;
2436 
2437   return true;
2438 }
2439 
2440 /* Allocate COUNT tokens for RUN.  */
2441 void
_cpp_init_tokenrun(tokenrun * run,unsigned int count)2442 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2443 {
2444   run->base = XNEWVEC (cpp_token, count);
2445   run->limit = run->base + count;
2446   run->next = NULL;
2447 }
2448 
2449 /* Returns the next tokenrun, or creates one if there is none.  */
2450 static tokenrun *
next_tokenrun(tokenrun * run)2451 next_tokenrun (tokenrun *run)
2452 {
2453   if (run->next == NULL)
2454     {
2455       run->next = XNEW (tokenrun);
2456       run->next->prev = run;
2457       _cpp_init_tokenrun (run->next, 250);
2458     }
2459 
2460   return run->next;
2461 }
2462 
2463 /* Return the number of not yet processed token in a given
2464    context.  */
2465 int
_cpp_remaining_tokens_num_in_context(cpp_context * context)2466 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2467 {
2468   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2469     return (LAST (context).token - FIRST (context).token);
2470   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2471 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2472     return (LAST (context).ptoken - FIRST (context).ptoken);
2473   else
2474       abort ();
2475 }
2476 
2477 /* Returns the token present at index INDEX in a given context.  If
2478    INDEX is zero, the next token to be processed is returned.  */
2479 static const cpp_token*
_cpp_token_from_context_at(cpp_context * context,int index)2480 _cpp_token_from_context_at (cpp_context *context, int index)
2481 {
2482   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2483     return &(FIRST (context).token[index]);
2484   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2485 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2486     return FIRST (context).ptoken[index];
2487  else
2488    abort ();
2489 }
2490 
2491 /* Look ahead in the input stream.  */
2492 const cpp_token *
cpp_peek_token(cpp_reader * pfile,int index)2493 cpp_peek_token (cpp_reader *pfile, int index)
2494 {
2495   cpp_context *context = pfile->context;
2496   const cpp_token *peektok;
2497   int count;
2498 
2499   /* First, scan through any pending cpp_context objects.  */
2500   while (context->prev)
2501     {
2502       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2503 
2504       if (index < (int) sz)
2505         return _cpp_token_from_context_at (context, index);
2506       index -= (int) sz;
2507       context = context->prev;
2508     }
2509 
2510   /* We will have to read some new tokens after all (and do so
2511      without invalidating preceding tokens).  */
2512   count = index;
2513   pfile->keep_tokens++;
2514 
2515   /* For peeked tokens temporarily disable line_change reporting,
2516      until the tokens are parsed for real.  */
2517   void (*line_change) (cpp_reader *, const cpp_token *, int)
2518     = pfile->cb.line_change;
2519   pfile->cb.line_change = NULL;
2520 
2521   do
2522     {
2523       peektok = _cpp_lex_token (pfile);
2524       if (peektok->type == CPP_EOF)
2525 	{
2526 	  index--;
2527 	  break;
2528 	}
2529     }
2530   while (index--);
2531 
2532   _cpp_backup_tokens_direct (pfile, count - index);
2533   pfile->keep_tokens--;
2534   pfile->cb.line_change = line_change;
2535 
2536   return peektok;
2537 }
2538 
2539 /* Allocate a single token that is invalidated at the same time as the
2540    rest of the tokens on the line.  Has its line and col set to the
2541    same as the last lexed token, so that diagnostics appear in the
2542    right place.  */
2543 cpp_token *
_cpp_temp_token(cpp_reader * pfile)2544 _cpp_temp_token (cpp_reader *pfile)
2545 {
2546   cpp_token *old, *result;
2547   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2548   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2549 
2550   old = pfile->cur_token - 1;
2551   /* Any pre-existing lookaheads must not be clobbered.  */
2552   if (la)
2553     {
2554       if (sz <= la)
2555         {
2556           tokenrun *next = next_tokenrun (pfile->cur_run);
2557 
2558           if (sz < la)
2559             memmove (next->base + 1, next->base,
2560                      (la - sz) * sizeof (cpp_token));
2561 
2562           next->base[0] = pfile->cur_run->limit[-1];
2563         }
2564 
2565       if (sz > 1)
2566         memmove (pfile->cur_token + 1, pfile->cur_token,
2567                  MIN (la, sz - 1) * sizeof (cpp_token));
2568     }
2569 
2570   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2571     {
2572       pfile->cur_run = next_tokenrun (pfile->cur_run);
2573       pfile->cur_token = pfile->cur_run->base;
2574     }
2575 
2576   result = pfile->cur_token++;
2577   result->src_loc = old->src_loc;
2578   return result;
2579 }
2580 
2581 /* Lex a token into RESULT (external interface).  Takes care of issues
2582    like directive handling, token lookahead, multiple include
2583    optimization and skipping.  */
2584 const cpp_token *
_cpp_lex_token(cpp_reader * pfile)2585 _cpp_lex_token (cpp_reader *pfile)
2586 {
2587   cpp_token *result;
2588 
2589   for (;;)
2590     {
2591       if (pfile->cur_token == pfile->cur_run->limit)
2592 	{
2593 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
2594 	  pfile->cur_token = pfile->cur_run->base;
2595 	}
2596       /* We assume that the current token is somewhere in the current
2597 	 run.  */
2598       if (pfile->cur_token < pfile->cur_run->base
2599 	  || pfile->cur_token >= pfile->cur_run->limit)
2600 	abort ();
2601 
2602       if (pfile->lookaheads)
2603 	{
2604 	  pfile->lookaheads--;
2605 	  result = pfile->cur_token++;
2606 	}
2607       else
2608 	result = _cpp_lex_direct (pfile);
2609 
2610       if (result->flags & BOL)
2611 	{
2612 	  /* Is this a directive.  If _cpp_handle_directive returns
2613 	     false, it is an assembler #.  */
2614 	  if (result->type == CPP_HASH
2615 	      /* 6.10.3 p 11: Directives in a list of macro arguments
2616 		 gives undefined behavior.  This implementation
2617 		 handles the directive as normal.  */
2618 	      && pfile->state.parsing_args != 1)
2619 	    {
2620 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2621 		{
2622 		  if (pfile->directive_result.type == CPP_PADDING)
2623 		    continue;
2624 		  result = &pfile->directive_result;
2625 		}
2626 	    }
2627 	  else if (pfile->state.in_deferred_pragma)
2628 	    result = &pfile->directive_result;
2629 
2630 	  if (pfile->cb.line_change && !pfile->state.skipping)
2631 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2632 	}
2633 
2634       /* We don't skip tokens in directives.  */
2635       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2636 	break;
2637 
2638       /* Outside a directive, invalidate controlling macros.  At file
2639 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2640 	 get here and MI optimization works.  */
2641       pfile->mi_valid = false;
2642 
2643       if (!pfile->state.skipping || result->type == CPP_EOF)
2644 	break;
2645     }
2646 
2647   return result;
2648 }
2649 
2650 /* Returns true if a fresh line has been loaded.  */
2651 bool
_cpp_get_fresh_line(cpp_reader * pfile)2652 _cpp_get_fresh_line (cpp_reader *pfile)
2653 {
2654   int return_at_eof;
2655 
2656   /* We can't get a new line until we leave the current directive.  */
2657   if (pfile->state.in_directive)
2658     return false;
2659 
2660   for (;;)
2661     {
2662       cpp_buffer *buffer = pfile->buffer;
2663 
2664       if (!buffer->need_line)
2665 	return true;
2666 
2667       if (buffer->next_line < buffer->rlimit)
2668 	{
2669 	  _cpp_clean_line (pfile);
2670 	  return true;
2671 	}
2672 
2673       /* First, get out of parsing arguments state.  */
2674       if (pfile->state.parsing_args)
2675 	return false;
2676 
2677       /* End of buffer.  Non-empty files should end in a newline.  */
2678       if (buffer->buf != buffer->rlimit
2679 	  && buffer->next_line > buffer->rlimit
2680 	  && !buffer->from_stage3)
2681 	{
2682 	  /* Clip to buffer size.  */
2683 	  buffer->next_line = buffer->rlimit;
2684 	}
2685 
2686       return_at_eof = buffer->return_at_eof;
2687       _cpp_pop_buffer (pfile);
2688       if (pfile->buffer == NULL || return_at_eof)
2689 	return false;
2690     }
2691 }
2692 
/* If the next unread character of `buffer' equals CHAR, consume it and
   set the type of `result' to THEN_TYPE; otherwise consume nothing and
   set the type to ELSE_TYPE.  Variables named `buffer' and `result'
   must be in scope where the macro is used.  The arguments are
   parenthesized so that an expression such as a conditional can be
   passed safely; each is evaluated at most once.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
2701 
2702 /* Lex a token into pfile->cur_token, which is also incremented, to
2703    get diagnostics pointing to the correct location.
2704 
2705    Does not handle issues such as token lookahead, multiple-include
2706    optimization, directives, skipping etc.  This function is only
2707    suitable for use by _cpp_lex_token, and in special cases like
2708    lex_expansion_token which doesn't care for any of these issues.
2709 
2710    When meeting a newline, returns CPP_EOF if parsing a directive,
2711    otherwise returns to the start of the token buffer if permissible.
2712    Returns the location of the lexed token.  */
2713 cpp_token *
_cpp_lex_direct(cpp_reader * pfile)2714 _cpp_lex_direct (cpp_reader *pfile)
2715 {
2716   cppchar_t c;
2717   cpp_buffer *buffer;
2718   const unsigned char *comment_start;
2719   bool fallthrough_comment = false;
2720   cpp_token *result = pfile->cur_token++;
2721 
2722  fresh_line:
2723   result->flags = 0;
2724   buffer = pfile->buffer;
2725   if (buffer->need_line)
2726     {
2727       if (pfile->state.in_deferred_pragma)
2728 	{
2729 	  result->type = CPP_PRAGMA_EOL;
2730 	  pfile->state.in_deferred_pragma = false;
2731 	  if (!pfile->state.pragma_allow_expansion)
2732 	    pfile->state.prevent_expansion--;
2733 	  return result;
2734 	}
2735       if (!_cpp_get_fresh_line (pfile))
2736 	{
2737 	  result->type = CPP_EOF;
2738 	  if (!pfile->state.in_directive)
2739 	    {
2740 	      /* Tell the compiler the line number of the EOF token.  */
2741 	      result->src_loc = pfile->line_table->highest_line;
2742 	      result->flags = BOL;
2743 	    }
2744 	  return result;
2745 	}
2746       if (buffer != pfile->buffer)
2747 	fallthrough_comment = false;
2748       if (!pfile->keep_tokens)
2749 	{
2750 	  pfile->cur_run = &pfile->base_run;
2751 	  result = pfile->base_run.base;
2752 	  pfile->cur_token = result + 1;
2753 	}
2754       result->flags = BOL;
2755       if (pfile->state.parsing_args == 2)
2756 	result->flags |= PREV_WHITE;
2757     }
2758   buffer = pfile->buffer;
2759  update_tokens_line:
2760   result->src_loc = pfile->line_table->highest_line;
2761 
2762  skipped_white:
2763   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2764       && !pfile->overlaid_buffer)
2765     {
2766       _cpp_process_line_notes (pfile, false);
2767       result->src_loc = pfile->line_table->highest_line;
2768     }
2769   c = *buffer->cur++;
2770 
2771   if (pfile->forced_token_location)
2772     result->src_loc = pfile->forced_token_location;
2773   else
2774     result->src_loc = linemap_position_for_column (pfile->line_table,
2775 					  CPP_BUF_COLUMN (buffer, buffer->cur));
2776 
2777   switch (c)
2778     {
2779     case ' ': case '\t': case '\f': case '\v': case '\0':
2780       result->flags |= PREV_WHITE;
2781       skip_whitespace (pfile, c);
2782       goto skipped_white;
2783 
2784     case '\n':
2785       /* Increment the line, unless this is the last line ...  */
2786       if (buffer->cur < buffer->rlimit
2787 	  /* ... or this is a #include, (where _cpp_stack_file needs to
2788 	     unwind by one line) ...  */
2789 	  || (pfile->state.in_directive > 1
2790 	      /* ... except traditional-cpp increments this elsewhere.  */
2791 	      && !CPP_OPTION (pfile, traditional)))
2792 	CPP_INCREMENT_LINE (pfile, 0);
2793       buffer->need_line = true;
2794       goto fresh_line;
2795 
2796     case '0': case '1': case '2': case '3': case '4':
2797     case '5': case '6': case '7': case '8': case '9':
2798       {
2799 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2800 	result->type = CPP_NUMBER;
2801 	lex_number (pfile, &result->val.str, &nst);
2802 	warn_about_normalization (pfile, result, &nst);
2803 	break;
2804       }
2805 
2806     case 'L':
2807     case 'u':
2808     case 'U':
2809     case 'R':
2810       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2811 	 wide strings or raw strings.  */
2812       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2813 	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2814 	{
2815 	  if ((*buffer->cur == '\'' && c != 'R')
2816 	      || *buffer->cur == '"'
2817 	      || (*buffer->cur == 'R'
2818 		  && c != 'R'
2819 		  && buffer->cur[1] == '"'
2820 		  && CPP_OPTION (pfile, rliterals))
2821 	      || (*buffer->cur == '8'
2822 		  && c == 'u'
2823 		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2824 				&& CPP_OPTION (pfile, utf8_char_literals)))
2825 		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2826 			  && CPP_OPTION (pfile, rliterals)))))
2827 	    {
2828 	      lex_string (pfile, result, buffer->cur - 1);
2829 	      break;
2830 	    }
2831 	}
2832       /* Fall through.  */
2833 
2834     case '_':
2835     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2836     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2837     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2838     case 's': case 't':           case 'v': case 'w': case 'x':
2839     case 'y': case 'z':
2840     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2841     case 'G': case 'H': case 'I': case 'J': case 'K':
2842     case 'M': case 'N': case 'O': case 'P': case 'Q':
2843     case 'S': case 'T':           case 'V': case 'W': case 'X':
2844     case 'Y': case 'Z':
2845       result->type = CPP_NAME;
2846       {
2847 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2848 	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2849 						&nst,
2850 						&result->val.node.spelling);
2851 	warn_about_normalization (pfile, result, &nst);
2852       }
2853 
2854       /* Convert named operators to their proper types.  */
2855       if (result->val.node.node->flags & NODE_OPERATOR)
2856 	{
2857 	  result->flags |= NAMED_OP;
2858 	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2859 	}
2860 
2861       /* Signal FALLTHROUGH comment followed by another token.  */
2862       if (fallthrough_comment)
2863 	result->flags |= PREV_FALLTHROUGH;
2864       break;
2865 
2866     case '\'':
2867     case '"':
2868       lex_string (pfile, result, buffer->cur - 1);
2869       break;
2870 
2871     case '/':
2872       /* A potential block or line comment.  */
2873       comment_start = buffer->cur;
2874       c = *buffer->cur;
2875 
2876       if (c == '*')
2877 	{
2878 	  if (_cpp_skip_block_comment (pfile))
2879 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2880 	}
2881       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2882 	{
2883 	  /* Don't warn for system headers.  */
2884 	  if (cpp_in_system_header (pfile))
2885 	    ;
2886 	  /* Warn about comments if pedantically GNUC89, and not
2887 	     in system headers.  */
2888 	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2889 		   && CPP_PEDANTIC (pfile)
2890 		   && ! buffer->warned_cplusplus_comments)
2891 	    {
2892 	      if (cpp_error (pfile, CPP_DL_PEDWARN,
2893 			     "C++ style comments are not allowed in ISO C90"))
2894 		cpp_error (pfile, CPP_DL_NOTE,
2895 			   "(this will be reported only once per input file)");
2896 	      buffer->warned_cplusplus_comments = 1;
2897 	    }
2898 	  /* Or if specifically desired via -Wc90-c99-compat.  */
2899 	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2900 		   && ! CPP_OPTION (pfile, cplusplus)
2901 		   && ! buffer->warned_cplusplus_comments)
2902 	    {
2903 	      if (cpp_error (pfile, CPP_DL_WARNING,
2904 			     "C++ style comments are incompatible with C90"))
2905 		cpp_error (pfile, CPP_DL_NOTE,
2906 			   "(this will be reported only once per input file)");
2907 	      buffer->warned_cplusplus_comments = 1;
2908 	    }
2909 	  /* In C89/C94, C++ style comments are forbidden.  */
2910 	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2911 		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
2912 	    {
2913 	      /* But don't be confused about valid code such as
2914 	         - // immediately followed by *,
2915 		 - // in a preprocessing directive,
2916 		 - // in an #if 0 block.  */
2917 	      if (buffer->cur[1] == '*'
2918 		  || pfile->state.in_directive
2919 		  || pfile->state.skipping)
2920 		{
2921 		  result->type = CPP_DIV;
2922 		  break;
2923 		}
2924 	      else if (! buffer->warned_cplusplus_comments)
2925 		{
2926 		  if (cpp_error (pfile, CPP_DL_ERROR,
2927 				 "C++ style comments are not allowed in "
2928 				 "ISO C90"))
2929 		    cpp_error (pfile, CPP_DL_NOTE,
2930 			       "(this will be reported only once per input "
2931 			       "file)");
2932 		  buffer->warned_cplusplus_comments = 1;
2933 		}
2934 	    }
2935 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2936 	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2937 	}
2938       else if (c == '=')
2939 	{
2940 	  buffer->cur++;
2941 	  result->type = CPP_DIV_EQ;
2942 	  break;
2943 	}
2944       else
2945 	{
2946 	  result->type = CPP_DIV;
2947 	  break;
2948 	}
2949 
2950       if (fallthrough_comment_p (pfile, comment_start))
2951 	fallthrough_comment = true;
2952 
2953       if (pfile->cb.comment)
2954 	{
2955 	  size_t len = pfile->buffer->cur - comment_start;
2956 	  pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
2957 			     len + 1);
2958 	}
2959 
2960       if (!pfile->state.save_comments)
2961 	{
2962 	  result->flags |= PREV_WHITE;
2963 	  goto update_tokens_line;
2964 	}
2965 
2966       if (fallthrough_comment)
2967 	result->flags |= PREV_FALLTHROUGH;
2968 
2969       /* Save the comment as a token in its own right.  */
2970       save_comment (pfile, result, comment_start, c);
2971       break;
2972 
2973     case '<':
2974       if (pfile->state.angled_headers)
2975 	{
2976 	  lex_string (pfile, result, buffer->cur - 1);
2977 	  if (result->type != CPP_LESS)
2978 	    break;
2979 	}
2980 
2981       result->type = CPP_LESS;
2982       if (*buffer->cur == '=')
2983 	{
2984 	  buffer->cur++, result->type = CPP_LESS_EQ;
2985 	  if (*buffer->cur == '>'
2986 	      && CPP_OPTION (pfile, cplusplus)
2987 	      && CPP_OPTION (pfile, lang) >= CLK_GNUCXX2A)
2988 	    buffer->cur++, result->type = CPP_SPACESHIP;
2989 	}
2990       else if (*buffer->cur == '<')
2991 	{
2992 	  buffer->cur++;
2993 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2994 	}
2995       else if (CPP_OPTION (pfile, digraphs))
2996 	{
2997 	  if (*buffer->cur == ':')
2998 	    {
2999 	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3000 		 three characters are <:: and the subsequent character
3001 		 is neither : nor >, the < is treated as a preprocessor
3002 		 token by itself".  */
3003 	      if (CPP_OPTION (pfile, cplusplus)
3004 		  && CPP_OPTION (pfile, lang) != CLK_CXX98
3005 		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3006 		  && buffer->cur[1] == ':'
3007 		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3008 		break;
3009 
3010 	      buffer->cur++;
3011 	      result->flags |= DIGRAPH;
3012 	      result->type = CPP_OPEN_SQUARE;
3013 	    }
3014 	  else if (*buffer->cur == '%')
3015 	    {
3016 	      buffer->cur++;
3017 	      result->flags |= DIGRAPH;
3018 	      result->type = CPP_OPEN_BRACE;
3019 	    }
3020 	}
3021       break;
3022 
3023     case '>':
3024       result->type = CPP_GREATER;
3025       if (*buffer->cur == '=')
3026 	buffer->cur++, result->type = CPP_GREATER_EQ;
3027       else if (*buffer->cur == '>')
3028 	{
3029 	  buffer->cur++;
3030 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3031 	}
3032       break;
3033 
3034     case '%':
3035       result->type = CPP_MOD;
3036       if (*buffer->cur == '=')
3037 	buffer->cur++, result->type = CPP_MOD_EQ;
3038       else if (CPP_OPTION (pfile, digraphs))
3039 	{
3040 	  if (*buffer->cur == ':')
3041 	    {
3042 	      buffer->cur++;
3043 	      result->flags |= DIGRAPH;
3044 	      result->type = CPP_HASH;
3045 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
3046 		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3047 	    }
3048 	  else if (*buffer->cur == '>')
3049 	    {
3050 	      buffer->cur++;
3051 	      result->flags |= DIGRAPH;
3052 	      result->type = CPP_CLOSE_BRACE;
3053 	    }
3054 	}
3055       break;
3056 
3057     case '.':
3058       result->type = CPP_DOT;
3059       if (ISDIGIT (*buffer->cur))
3060 	{
3061 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3062 	  result->type = CPP_NUMBER;
3063 	  lex_number (pfile, &result->val.str, &nst);
3064 	  warn_about_normalization (pfile, result, &nst);
3065 	}
3066       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3067 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
3068       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3069 	buffer->cur++, result->type = CPP_DOT_STAR;
3070       break;
3071 
3072     case '+':
3073       result->type = CPP_PLUS;
3074       if (*buffer->cur == '+')
3075 	buffer->cur++, result->type = CPP_PLUS_PLUS;
3076       else if (*buffer->cur == '=')
3077 	buffer->cur++, result->type = CPP_PLUS_EQ;
3078       break;
3079 
3080     case '-':
3081       result->type = CPP_MINUS;
3082       if (*buffer->cur == '>')
3083 	{
3084 	  buffer->cur++;
3085 	  result->type = CPP_DEREF;
3086 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3087 	    buffer->cur++, result->type = CPP_DEREF_STAR;
3088 	}
3089       else if (*buffer->cur == '-')
3090 	buffer->cur++, result->type = CPP_MINUS_MINUS;
3091       else if (*buffer->cur == '=')
3092 	buffer->cur++, result->type = CPP_MINUS_EQ;
3093       break;
3094 
3095     case '&':
3096       result->type = CPP_AND;
3097       if (*buffer->cur == '&')
3098 	buffer->cur++, result->type = CPP_AND_AND;
3099       else if (*buffer->cur == '=')
3100 	buffer->cur++, result->type = CPP_AND_EQ;
3101       break;
3102 
3103     case '|':
3104       result->type = CPP_OR;
3105       if (*buffer->cur == '|')
3106 	buffer->cur++, result->type = CPP_OR_OR;
3107       else if (*buffer->cur == '=')
3108 	buffer->cur++, result->type = CPP_OR_EQ;
3109       break;
3110 
3111     case ':':
3112       result->type = CPP_COLON;
3113       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3114 	buffer->cur++, result->type = CPP_SCOPE;
3115       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3116 	{
3117 	  buffer->cur++;
3118 	  result->flags |= DIGRAPH;
3119 	  result->type = CPP_CLOSE_SQUARE;
3120 	}
3121       break;
3122 
3123     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3124     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3125     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3126     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3127     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3128 
3129     case '?': result->type = CPP_QUERY; break;
3130     case '~': result->type = CPP_COMPL; break;
3131     case ',': result->type = CPP_COMMA; break;
3132     case '(': result->type = CPP_OPEN_PAREN; break;
3133     case ')': result->type = CPP_CLOSE_PAREN; break;
3134     case '[': result->type = CPP_OPEN_SQUARE; break;
3135     case ']': result->type = CPP_CLOSE_SQUARE; break;
3136     case '{': result->type = CPP_OPEN_BRACE; break;
3137     case '}': result->type = CPP_CLOSE_BRACE; break;
3138     case ';': result->type = CPP_SEMICOLON; break;
3139 
3140       /* @ is a punctuator in Objective-C.  */
3141     case '@': result->type = CPP_ATSIGN; break;
3142 
3143     default:
3144       {
3145 	const uchar *base = --buffer->cur;
3146 
3147 	/* Check for an extended identifier ($ or UCN or UTF-8).  */
3148 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3149 	if (forms_identifier_p (pfile, true, &nst))
3150 	  {
3151 	    result->type = CPP_NAME;
3152 	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
3153 						    &result->val.node.spelling);
3154 	    warn_about_normalization (pfile, result, &nst);
3155 	    break;
3156 	  }
3157 
3158 	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
3159 	   single token.  */
3160 	buffer->cur++;
3161 	if (c >= utf8_signifier)
3162 	  {
3163 	    const uchar *pstr = base;
3164 	    cppchar_t s;
3165 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3166 	      buffer->cur = pstr;
3167 	  }
3168 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
3169 	break;
3170       }
3171 
3172     }
3173 
3174   /* Potentially convert the location of the token to a range.  */
3175   if (result->src_loc >= RESERVED_LOCATION_COUNT
3176       && result->type != CPP_EOF)
3177     {
3178       /* Ensure that any line notes are processed, so that we have the
3179 	 correct physical line/column for the end-point of the token even
3180 	 when a logical line is split via one or more backslashes.  */
3181       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3182 	  && !pfile->overlaid_buffer)
3183 	_cpp_process_line_notes (pfile, false);
3184 
3185       source_range tok_range;
3186       tok_range.m_start = result->src_loc;
3187       tok_range.m_finish
3188 	= linemap_position_for_column (pfile->line_table,
3189 				       CPP_BUF_COLUMN (buffer, buffer->cur));
3190 
3191       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3192 					       result->src_loc,
3193 					       tok_range, NULL);
3194     }
3195 
3196   return result;
3197 }
3198 
3199 /* An upper bound on the number of bytes needed to spell TOKEN.
3200    Does not include preceding whitespace.  */
3201 unsigned int
cpp_token_len(const cpp_token * token)3202 cpp_token_len (const cpp_token *token)
3203 {
3204   unsigned int len;
3205 
3206   switch (TOKEN_SPELL (token))
3207     {
3208     default:		len = 6;				break;
3209     case SPELL_LITERAL:	len = token->val.str.len;		break;
3210     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
3211     }
3212 
3213   return len;
3214 }
3215 
/* Decode the single UTF-8 sequence that starts at NAME and store its
   code point in BUFFER as a \UXXXXXXXX escape: a backslash, 'U' and
   eight lowercase hexadecimal digits.  Exactly ten bytes are written
   and no terminating NUL is added.

   Returns the number of bytes of NAME the sequence occupies, as
   announced by the number of leading 1 bits in its first byte.  Calls
   abort () if one of the following bytes is not a continuation byte
   (10xxxxxx).  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned int lead = *name;
  unsigned int probe;
  unsigned long code_point;
  int seq_len = 0;
  int remaining;
  int shift;

  /* Every leading 1 bit of the first byte stands for one byte of the
     sequence.  */
  probe = lead;
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* What is left of the lead byte after the length marker holds the
     most significant payload bits.  */
  code_point = lead & (0x7F >> seq_len);

  /* Fold in six payload bits from each continuation byte.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      unsigned int cont = *++name;

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';

  /* Emit the eight nibbles, most significant first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3249 
3250 /* Given a token TYPE corresponding to a digraph, return a pointer to
3251    the spelling of the digraph.  */
3252 static const unsigned char *
cpp_digraph2name(enum cpp_ttype type)3253 cpp_digraph2name (enum cpp_ttype type)
3254 {
3255   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3256 }
3257 
3258 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3259    The buffer must already contain the enough space to hold the
3260    token's spelling.  Returns a pointer to the character after the
3261    last character written.  */
3262 unsigned char *
_cpp_spell_ident_ucns(unsigned char * buffer,cpp_hashnode * ident)3263 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3264 {
3265   size_t i;
3266   const unsigned char *name = NODE_NAME (ident);
3267 
3268   for (i = 0; i < NODE_LEN (ident); i++)
3269     if (name[i] & ~0x7F)
3270       {
3271 	i += utf8_to_ucn (buffer, name + i) - 1;
3272 	buffer += 10;
3273       }
3274     else
3275       *buffer++ = name[i];
3276 
3277   return buffer;
3278 }
3279 
3280 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3281    already contain the enough space to hold the token's spelling.
3282    Returns a pointer to the character after the last character written.
3283    FORSTRING is true if this is to be the spelling after translation
3284    phase 1 (with the original spelling of extended identifiers), false
3285    if extended identifiers should always be written using UCNs (there is
3286    no option for always writing them in the internal UTF-8 form).
3287    FIXME: Would be nice if we didn't need the PFILE argument.  */
3288 unsigned char *
cpp_spell_token(cpp_reader * pfile,const cpp_token * token,unsigned char * buffer,bool forstring)3289 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3290 		 unsigned char *buffer, bool forstring)
3291 {
3292   switch (TOKEN_SPELL (token))
3293     {
3294     case SPELL_OPERATOR:
3295       {
3296 	const unsigned char *spelling;
3297 	unsigned char c;
3298 
3299 	if (token->flags & DIGRAPH)
3300 	  spelling = cpp_digraph2name (token->type);
3301 	else if (token->flags & NAMED_OP)
3302 	  goto spell_ident;
3303 	else
3304 	  spelling = TOKEN_NAME (token);
3305 
3306 	while ((c = *spelling++) != '\0')
3307 	  *buffer++ = c;
3308       }
3309       break;
3310 
3311     spell_ident:
3312     case SPELL_IDENT:
3313       if (forstring)
3314 	{
3315 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
3316 		  NODE_LEN (token->val.node.spelling));
3317 	  buffer += NODE_LEN (token->val.node.spelling);
3318 	}
3319       else
3320 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3321       break;
3322 
3323     case SPELL_LITERAL:
3324       memcpy (buffer, token->val.str.text, token->val.str.len);
3325       buffer += token->val.str.len;
3326       break;
3327 
3328     case SPELL_NONE:
3329       cpp_error (pfile, CPP_DL_ICE,
3330 		 "unspellable token %s", TOKEN_NAME (token));
3331       break;
3332     }
3333 
3334   return buffer;
3335 }
3336 
3337 /* Returns TOKEN spelt as a null-terminated string.  The string is
3338    freed when the reader is destroyed.  Useful for diagnostics.  */
3339 unsigned char *
cpp_token_as_text(cpp_reader * pfile,const cpp_token * token)3340 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3341 {
3342   unsigned int len = cpp_token_len (token) + 1;
3343   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3344 
3345   end = cpp_spell_token (pfile, token, start, false);
3346   end[0] = '\0';
3347 
3348   return start;
3349 }
3350 
3351 /* Returns a pointer to a string which spells the token defined by
3352    TYPE and FLAGS.  Used by C front ends, which really should move to
3353    using cpp_token_as_text.  */
3354 const char *
cpp_type2name(enum cpp_ttype type,unsigned char flags)3355 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3356 {
3357   if (flags & DIGRAPH)
3358     return (const char *) cpp_digraph2name (type);
3359   else if (flags & NAMED_OP)
3360     return cpp_named_operator2name (type);
3361 
3362   return (const char *) token_spellings[type].name;
3363 }
3364 
3365 /* Writes the spelling of token to FP, without any preceding space.
3366    Separated from cpp_spell_token for efficiency - to avoid stdio
3367    double-buffering.  */
3368 void
cpp_output_token(const cpp_token * token,FILE * fp)3369 cpp_output_token (const cpp_token *token, FILE *fp)
3370 {
3371   switch (TOKEN_SPELL (token))
3372     {
3373     case SPELL_OPERATOR:
3374       {
3375 	const unsigned char *spelling;
3376 	int c;
3377 
3378 	if (token->flags & DIGRAPH)
3379 	  spelling = cpp_digraph2name (token->type);
3380 	else if (token->flags & NAMED_OP)
3381 	  goto spell_ident;
3382 	else
3383 	  spelling = TOKEN_NAME (token);
3384 
3385 	c = *spelling;
3386 	do
3387 	  putc (c, fp);
3388 	while ((c = *++spelling) != '\0');
3389       }
3390       break;
3391 
3392     spell_ident:
3393     case SPELL_IDENT:
3394       {
3395 	size_t i;
3396 	const unsigned char * name = NODE_NAME (token->val.node.node);
3397 
3398 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3399 	  if (name[i] & ~0x7F)
3400 	    {
3401 	      unsigned char buffer[10];
3402 	      i += utf8_to_ucn (buffer, name + i) - 1;
3403 	      fwrite (buffer, 1, 10, fp);
3404 	    }
3405 	  else
3406 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
3407       }
3408       break;
3409 
3410     case SPELL_LITERAL:
3411       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3412       break;
3413 
3414     case SPELL_NONE:
3415       /* An error, most probably.  */
3416       break;
3417     }
3418 }
3419 
3420 /* Compare two tokens.  */
3421 int
_cpp_equiv_tokens(const cpp_token * a,const cpp_token * b)3422 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3423 {
3424   if (a->type == b->type && a->flags == b->flags)
3425     switch (TOKEN_SPELL (a))
3426       {
3427       default:			/* Keep compiler happy.  */
3428       case SPELL_OPERATOR:
3429 	/* token_no is used to track where multiple consecutive ##
3430 	   tokens were originally located.  */
3431 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3432       case SPELL_NONE:
3433 	return (a->type != CPP_MACRO_ARG
3434 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3435 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3436       case SPELL_IDENT:
3437 	return (a->val.node.node == b->val.node.node
3438 		&& a->val.node.spelling == b->val.node.spelling);
3439       case SPELL_LITERAL:
3440 	return (a->val.str.len == b->val.str.len
3441 		&& !memcmp (a->val.str.text, b->val.str.text,
3442 			    a->val.str.len));
3443       }
3444 
3445   return 0;
3446 }
3447 
3448 /* Returns nonzero if a space should be inserted to avoid an
3449    accidental token paste for output.  For simplicity, it is
3450    conservative, and occasionally advises a space where one is not
3451    needed, e.g. "." and ".2".  */
3452 int
cpp_avoid_paste(cpp_reader * pfile,const cpp_token * token1,const cpp_token * token2)3453 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3454 		 const cpp_token *token2)
3455 {
3456   enum cpp_ttype a = token1->type, b = token2->type;
3457   cppchar_t c;
3458 
3459   if (token1->flags & NAMED_OP)
3460     a = CPP_NAME;
3461   if (token2->flags & NAMED_OP)
3462     b = CPP_NAME;
3463 
3464   c = EOF;
3465   if (token2->flags & DIGRAPH)
3466     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3467   else if (token_spellings[b].category == SPELL_OPERATOR)
3468     c = token_spellings[b].name[0];
3469 
3470   /* Quickly get everything that can paste with an '='.  */
3471   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3472     return 1;
3473 
3474   switch (a)
3475     {
3476     case CPP_GREATER:	return c == '>';
3477     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
3478     case CPP_PLUS:	return c == '+';
3479     case CPP_MINUS:	return c == '-' || c == '>';
3480     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
3481     case CPP_MOD:	return c == ':' || c == '>';
3482     case CPP_AND:	return c == '&';
3483     case CPP_OR:	return c == '|';
3484     case CPP_COLON:	return c == ':' || c == '>';
3485     case CPP_DEREF:	return c == '*';
3486     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
3487     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
3488     case CPP_NAME:	return ((b == CPP_NUMBER
3489 				 && name_p (pfile, &token2->val.str))
3490 				|| b == CPP_NAME
3491 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
3492     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
3493 				|| c == '.' || c == '+' || c == '-');
3494 				      /* UCNs */
3495     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
3496 				 && b == CPP_NAME)
3497 				|| (CPP_OPTION (pfile, objc)
3498 				    && token1->val.str.text[0] == '@'
3499 				    && (b == CPP_NAME || b == CPP_STRING)));
3500     case CPP_LESS_EQ:	return c == '>';
3501     case CPP_STRING:
3502     case CPP_WSTRING:
3503     case CPP_UTF8STRING:
3504     case CPP_STRING16:
3505     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
3506 				&& (b == CPP_NAME
3507 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
3508 					&& ISIDST (token2->val.str.text[0]))));
3509 
3510     default:		break;
3511     }
3512 
3513   return 0;
3514 }
3515 
3516 /* Output all the remaining tokens on the current line, and a newline
3517    character, to FP.  Leading whitespace is removed.  If there are
3518    macros, special token padding is not performed.  */
3519 void
cpp_output_line(cpp_reader * pfile,FILE * fp)3520 cpp_output_line (cpp_reader *pfile, FILE *fp)
3521 {
3522   const cpp_token *token;
3523 
3524   token = cpp_get_token (pfile);
3525   while (token->type != CPP_EOF)
3526     {
3527       cpp_output_token (token, fp);
3528       token = cpp_get_token (pfile);
3529       if (token->flags & PREV_WHITE)
3530 	putc (' ', fp);
3531     }
3532 
3533   putc ('\n', fp);
3534 }
3535 
3536 /* Return a string representation of all the remaining tokens on the
3537    current line.  The result is allocated using xmalloc and must be
3538    freed by the caller.  */
3539 unsigned char *
cpp_output_line_to_string(cpp_reader * pfile,const unsigned char * dir_name)3540 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3541 {
3542   const cpp_token *token;
3543   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3544   unsigned int alloced = 120 + out;
3545   unsigned char *result = (unsigned char *) xmalloc (alloced);
3546 
3547   /* If DIR_NAME is empty, there are no initial contents.  */
3548   if (dir_name)
3549     {
3550       sprintf ((char *) result, "#%s ", dir_name);
3551       out += 2;
3552     }
3553 
3554   token = cpp_get_token (pfile);
3555   while (token->type != CPP_EOF)
3556     {
3557       unsigned char *last;
3558       /* Include room for a possible space and the terminating nul.  */
3559       unsigned int len = cpp_token_len (token) + 2;
3560 
3561       if (out + len > alloced)
3562 	{
3563 	  alloced *= 2;
3564 	  if (out + len > alloced)
3565 	    alloced = out + len;
3566 	  result = (unsigned char *) xrealloc (result, alloced);
3567 	}
3568 
3569       last = cpp_spell_token (pfile, token, &result[out], 0);
3570       out = last - result;
3571 
3572       token = cpp_get_token (pfile);
3573       if (token->flags & PREV_WHITE)
3574 	result[out++] = ' ';
3575     }
3576 
3577   result[out] = '\0';
3578   return result;
3579 }
3580 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload size new_buff will allocate.

   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.

   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the number of bytes between
   BUFF->cur and BUFF->limit.  BUFF is evaluated twice, so it must not
   have side effects.  Both arguments are parenthesized in the
   expansion so that any expression can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3595 
3596 /* Create a new allocation buffer.  Place the control block at the end
3597    of the buffer, so that buffer overflows will cause immediate chaos.  */
3598 static _cpp_buff *
new_buff(size_t len)3599 new_buff (size_t len)
3600 {
3601   _cpp_buff *result;
3602   unsigned char *base;
3603 
3604   if (len < MIN_BUFF_SIZE)
3605     len = MIN_BUFF_SIZE;
3606   len = CPP_ALIGN (len);
3607 
3608 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3609   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3610      struct first.  */
3611   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3612   base = XNEWVEC (unsigned char, len + slen);
3613   result = (_cpp_buff *) base;
3614   base += slen;
3615 #else
3616   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3617   result = (_cpp_buff *) (base + len);
3618 #endif
3619   result->base = base;
3620   result->cur = base;
3621   result->limit = base + len;
3622   result->next = NULL;
3623   return result;
3624 }
3625 
3626 /* Place a chain of unwanted allocation buffers on the free list.  */
3627 void
_cpp_release_buff(cpp_reader * pfile,_cpp_buff * buff)3628 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3629 {
3630   _cpp_buff *end = buff;
3631 
3632   while (end->next)
3633     end = end->next;
3634   end->next = pfile->free_buffs;
3635   pfile->free_buffs = buff;
3636 }
3637 
3638 /* Return a free buffer of size at least MIN_SIZE.  */
3639 _cpp_buff *
_cpp_get_buff(cpp_reader * pfile,size_t min_size)3640 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3641 {
3642   _cpp_buff *result, **p;
3643 
3644   for (p = &pfile->free_buffs;; p = &(*p)->next)
3645     {
3646       size_t size;
3647 
3648       if (*p == NULL)
3649 	return new_buff (min_size);
3650       result = *p;
3651       size = result->limit - result->base;
3652       /* Return a buffer that's big enough, but don't waste one that's
3653          way too big.  */
3654       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3655 	break;
3656     }
3657 
3658   *p = result->next;
3659   result->next = NULL;
3660   result->cur = result->base;
3661   return result;
3662 }
3663 
3664 /* Creates a new buffer with enough space to hold the uncommitted
3665    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3666    the excess bytes to the new buffer.  Chains the new buffer after
3667    BUFF, and returns the new buffer.  */
3668 _cpp_buff *
_cpp_append_extend_buff(cpp_reader * pfile,_cpp_buff * buff,size_t min_extra)3669 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3670 {
3671   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3672   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3673 
3674   buff->next = new_buff;
3675   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3676   return new_buff;
3677 }
3678 
3679 /* Creates a new buffer with enough space to hold the uncommitted
3680    remaining bytes of the buffer pointed to by BUFF, and at least
3681    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3682    Chains the new buffer before the buffer pointed to by BUFF, and
3683    updates the pointer to point to the new buffer.  */
3684 void
_cpp_extend_buff(cpp_reader * pfile,_cpp_buff ** pbuff,size_t min_extra)3685 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3686 {
3687   _cpp_buff *new_buff, *old_buff = *pbuff;
3688   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3689 
3690   new_buff = _cpp_get_buff (pfile, size);
3691   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3692   new_buff->next = old_buff;
3693   *pbuff = new_buff;
3694 }
3695 
3696 /* Free a chain of buffers starting at BUFF.  */
3697 void
_cpp_free_buff(_cpp_buff * buff)3698 _cpp_free_buff (_cpp_buff *buff)
3699 {
3700   _cpp_buff *next;
3701 
3702   for (; buff; buff = next)
3703     {
3704       next = buff->next;
3705 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3706       free (buff);
3707 #else
3708       free (buff->base);
3709 #endif
3710     }
3711 }
3712 
3713 /* Allocate permanent, unaligned storage of length LEN.  */
3714 unsigned char *
_cpp_unaligned_alloc(cpp_reader * pfile,size_t len)3715 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3716 {
3717   _cpp_buff *buff = pfile->u_buff;
3718   unsigned char *result = buff->cur;
3719 
3720   if (len > (size_t) (buff->limit - result))
3721     {
3722       buff = _cpp_get_buff (pfile, len);
3723       buff->next = pfile->u_buff;
3724       pfile->u_buff = buff;
3725       result = buff->cur;
3726     }
3727 
3728   buff->cur = result + len;
3729   return result;
3730 }
3731 
3732 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3733    That buffer is used for growing allocations when saving macro
3734    replacement lists in a #define, and when parsing an answer to an
3735    assertion in #assert, #unassert or #if (and therefore possibly
3736    whilst expanding macros).  It therefore must not be used by any
3737    code that they might call: specifically the lexer and the guts of
3738    the macro expander.
3739 
3740    All existing other uses clearly fit this restriction: storing
3741    registered pragmas during initialization.  */
3742 unsigned char *
_cpp_aligned_alloc(cpp_reader * pfile,size_t len)3743 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3744 {
3745   _cpp_buff *buff = pfile->a_buff;
3746   unsigned char *result = buff->cur;
3747 
3748   if (len > (size_t) (buff->limit - result))
3749     {
3750       buff = _cpp_get_buff (pfile, len);
3751       buff->next = pfile->a_buff;
3752       pfile->a_buff = buff;
3753       result = buff->cur;
3754     }
3755 
3756   buff->cur = result + len;
3757   return result;
3758 }
3759 
3760 /* Commit or allocate storage from a buffer.  */
3761 
3762 void *
_cpp_commit_buff(cpp_reader * pfile,size_t size)3763 _cpp_commit_buff (cpp_reader *pfile, size_t size)
3764 {
3765   void *ptr = BUFF_FRONT (pfile->a_buff);
3766 
3767   if (pfile->hash_table->alloc_subobject)
3768     {
3769       void *copy = pfile->hash_table->alloc_subobject (size);
3770       memcpy (copy, ptr, size);
3771       ptr = copy;
3772     }
3773   else
3774     BUFF_FRONT (pfile->a_buff) += size;
3775 
3776   return ptr;
3777 }
3778 
3779 /* Say which field of TOK is in use.  */
3780 
3781 enum cpp_token_fld_kind
cpp_token_val_index(const cpp_token * tok)3782 cpp_token_val_index (const cpp_token *tok)
3783 {
3784   switch (TOKEN_SPELL (tok))
3785     {
3786     case SPELL_IDENT:
3787       return CPP_TOKEN_FLD_NODE;
3788     case SPELL_LITERAL:
3789       return CPP_TOKEN_FLD_STR;
3790     case SPELL_OPERATOR:
3791       /* Operands which were originally spelled as ident keep around
3792          the node for the exact spelling.  */
3793       if (tok->flags & NAMED_OP)
3794 	return CPP_TOKEN_FLD_NODE;
3795       else if (tok->type == CPP_PASTE)
3796 	return CPP_TOKEN_FLD_TOKEN_NO;
3797       else
3798 	return CPP_TOKEN_FLD_NONE;
3799     case SPELL_NONE:
3800       if (tok->type == CPP_MACRO_ARG)
3801 	return CPP_TOKEN_FLD_ARG_NO;
3802       else if (tok->type == CPP_PADDING)
3803 	return CPP_TOKEN_FLD_SOURCE;
3804       else if (tok->type == CPP_PRAGMA)
3805 	return CPP_TOKEN_FLD_PRAGMA;
3806       /* fall through */
3807     default:
3808       return CPP_TOKEN_FLD_NONE;
3809     }
3810 }
3811 
3812 /* All tokens lexed in R after calling this function will be forced to
3813    have their location_t to be P, until
3814    cpp_stop_forcing_token_locations is called for R.  */
3815 
3816 void
cpp_force_token_locations(cpp_reader * r,location_t loc)3817 cpp_force_token_locations (cpp_reader *r, location_t loc)
3818 {
3819   r->forced_token_location = loc;
3820 }
3821 
3822 /* Go back to assigning locations naturally for lexed tokens.  */
3823 
3824 void
cpp_stop_forcing_token_locations(cpp_reader * r)3825 cpp_stop_forcing_token_locations (cpp_reader *r)
3826 {
3827   r->forced_token_location = 0;
3828 }
3829