xref: /dragonfly/contrib/gcc-8.0/libcpp/lex.c (revision 38fd1498)
1*38fd1498Szrj /* CPP Library - lexical analysis.
2*38fd1498Szrj    Copyright (C) 2000-2018 Free Software Foundation, Inc.
3*38fd1498Szrj    Contributed by Per Bothner, 1994-95.
4*38fd1498Szrj    Based on CCCP program by Paul Rubin, June 1986
5*38fd1498Szrj    Adapted to ANSI C, Richard Stallman, Jan 1987
6*38fd1498Szrj    Broken out to separate file, Zack Weinberg, Mar 2000
7*38fd1498Szrj 
8*38fd1498Szrj This program is free software; you can redistribute it and/or modify it
9*38fd1498Szrj under the terms of the GNU General Public License as published by the
10*38fd1498Szrj Free Software Foundation; either version 3, or (at your option) any
11*38fd1498Szrj later version.
12*38fd1498Szrj 
13*38fd1498Szrj This program is distributed in the hope that it will be useful,
14*38fd1498Szrj but WITHOUT ANY WARRANTY; without even the implied warranty of
15*38fd1498Szrj MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16*38fd1498Szrj GNU General Public License for more details.
17*38fd1498Szrj 
18*38fd1498Szrj You should have received a copy of the GNU General Public License
19*38fd1498Szrj along with this program; see the file COPYING3.  If not see
20*38fd1498Szrj <http://www.gnu.org/licenses/>.  */
21*38fd1498Szrj 
22*38fd1498Szrj #include "config.h"
23*38fd1498Szrj #include "system.h"
24*38fd1498Szrj #include "cpplib.h"
25*38fd1498Szrj #include "internal.h"
26*38fd1498Szrj 
/* Spelling category of a token type.  One of these is recorded for
   every token type in the token_spellings table: rows generated by
   OP () always get SPELL_OPERATOR, rows generated by TK () get the
   category named by TK's second argument.  */
enum spell_type
{
  SPELL_OPERATOR,	/* First enumerator, value 0.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
34*38fd1498Szrj 
35*38fd1498Szrj struct token_spelling
36*38fd1498Szrj {
37*38fd1498Szrj   enum spell_type category;
38*38fd1498Szrj   const unsigned char *name;
39*38fd1498Szrj };
40*38fd1498Szrj 
41*38fd1498Szrj static const unsigned char *const digraph_spellings[] =
42*38fd1498Szrj { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43*38fd1498Szrj 
44*38fd1498Szrj #define OP(e, s) { SPELL_OPERATOR, UC s  },
45*38fd1498Szrj #define TK(e, s) { SPELL_ ## s,    UC #e },
46*38fd1498Szrj static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47*38fd1498Szrj #undef OP
48*38fd1498Szrj #undef TK
49*38fd1498Szrj 
/* Accessors into token_spellings, indexed by the type of the token
   pointed to by TOK: its spelling category and its name string.  */
#define TOKEN_SPELL(tok) (token_spellings[(tok)->type].category)
#define TOKEN_NAME(tok)  (token_spellings[(tok)->type].name)
52*38fd1498Szrj 
53*38fd1498Szrj static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54*38fd1498Szrj static int skip_line_comment (cpp_reader *);
55*38fd1498Szrj static void skip_whitespace (cpp_reader *, cppchar_t);
56*38fd1498Szrj static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57*38fd1498Szrj static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58*38fd1498Szrj static void store_comment (cpp_reader *, cpp_token *);
59*38fd1498Szrj static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60*38fd1498Szrj 			    unsigned int, enum cpp_ttype);
61*38fd1498Szrj static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62*38fd1498Szrj static int name_p (cpp_reader *, const cpp_string *);
63*38fd1498Szrj static tokenrun *next_tokenrun (tokenrun *);
64*38fd1498Szrj 
65*38fd1498Szrj static _cpp_buff *new_buff (size_t);
66*38fd1498Szrj 
67*38fd1498Szrj 
68*38fd1498Szrj /* Utility routine:
69*38fd1498Szrj 
70*38fd1498Szrj    Compares, the token TOKEN to the NUL-terminated string STRING.
71*38fd1498Szrj    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
72*38fd1498Szrj int
cpp_ideq(const cpp_token * token,const char * string)73*38fd1498Szrj cpp_ideq (const cpp_token *token, const char *string)
74*38fd1498Szrj {
75*38fd1498Szrj   if (token->type != CPP_NAME)
76*38fd1498Szrj     return 0;
77*38fd1498Szrj 
78*38fd1498Szrj   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79*38fd1498Szrj }
80*38fd1498Szrj 
81*38fd1498Szrj /* Record a note TYPE at byte POS into the current cleaned logical
82*38fd1498Szrj    line.  */
83*38fd1498Szrj static void
add_line_note(cpp_buffer * buffer,const uchar * pos,unsigned int type)84*38fd1498Szrj add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85*38fd1498Szrj {
86*38fd1498Szrj   if (buffer->notes_used == buffer->notes_cap)
87*38fd1498Szrj     {
88*38fd1498Szrj       buffer->notes_cap = buffer->notes_cap * 2 + 200;
89*38fd1498Szrj       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90*38fd1498Szrj                                   buffer->notes_cap);
91*38fd1498Szrj     }
92*38fd1498Szrj 
93*38fd1498Szrj   buffer->notes[buffer->notes_used].pos = pos;
94*38fd1498Szrj   buffer->notes[buffer->notes_used].type = type;
95*38fd1498Szrj   buffer->notes_used++;
96*38fd1498Szrj }
97*38fd1498Szrj 
98*38fd1498Szrj 
99*38fd1498Szrj /* Fast path to find line special characters using optimized character
100*38fd1498Szrj    scanning algorithms.  Anything complicated falls back to the slow
101*38fd1498Szrj    path below.  Since this loop is very hot it's worth doing these kinds
102*38fd1498Szrj    of optimizations.
103*38fd1498Szrj 
104*38fd1498Szrj    One of the paths through the ifdefs should provide
105*38fd1498Szrj 
106*38fd1498Szrj      const uchar *search_line_fast (const uchar *s, const uchar *end);
107*38fd1498Szrj 
108*38fd1498Szrj    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
109*38fd1498Szrj    the found character.
110*38fd1498Szrj 
111*38fd1498Szrj    Note that the last character of the buffer is *always* a newline,
112*38fd1498Szrj    as forced by _cpp_convert_input.  This fact can be used to avoid
113*38fd1498Szrj    explicitly looking for the end of the buffer.  */
114*38fd1498Szrj 
/* Configure provides WORDS_BIGENDIAN as an ifdef-style test.  Give it
   the value 0 when it is not defined so that the code below can use
   it in ordinary conditions such as `if (WORDS_BIGENDIAN)'.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif
119*38fd1498Szrj 
/* WORD_TYPE should be the largest integer that fits into a register.
   <stdint.h> has no such type.  On most hosts unsigned long does the
   job, but not under the LLP64 model chosen by MS.  When building with
   GCC the real word size is available through the __word__ mode.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif
129*38fd1498Szrj 
130*38fd1498Szrj /* The code below is only expecting sizes 4 or 8.
131*38fd1498Szrj    Die at compile-time if this expectation is violated.  */
132*38fd1498Szrj typedef char check_word_type_size
133*38fd1498Szrj   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
134*38fd1498Szrj 
135*38fd1498Szrj /* Return X with the first N bytes forced to values that won't match one
136*38fd1498Szrj    of the interesting characters.  Note that NUL is not interesting.  */
137*38fd1498Szrj 
138*38fd1498Szrj static inline word_type
acc_char_mask_misalign(word_type val,unsigned int n)139*38fd1498Szrj acc_char_mask_misalign (word_type val, unsigned int n)
140*38fd1498Szrj {
141*38fd1498Szrj   word_type mask = -1;
142*38fd1498Szrj   if (WORDS_BIGENDIAN)
143*38fd1498Szrj     mask >>= n * 8;
144*38fd1498Szrj   else
145*38fd1498Szrj     mask <<= n * 8;
146*38fd1498Szrj   return val & mask;
147*38fd1498Szrj }
148*38fd1498Szrj 
149*38fd1498Szrj /* Return X replicated to all byte positions within WORD_TYPE.  */
150*38fd1498Szrj 
151*38fd1498Szrj static inline word_type
acc_char_replicate(uchar x)152*38fd1498Szrj acc_char_replicate (uchar x)
153*38fd1498Szrj {
154*38fd1498Szrj   word_type ret;
155*38fd1498Szrj 
156*38fd1498Szrj   ret = (x << 24) | (x << 16) | (x << 8) | x;
157*38fd1498Szrj   if (sizeof(word_type) == 8)
158*38fd1498Szrj     ret = (ret << 16 << 16) | ret;
159*38fd1498Szrj   return ret;
160*38fd1498Szrj }
161*38fd1498Szrj 
162*38fd1498Szrj /* Return non-zero if some byte of VAL is (probably) C.  */
163*38fd1498Szrj 
164*38fd1498Szrj static inline word_type
acc_char_cmp(word_type val,word_type c)165*38fd1498Szrj acc_char_cmp (word_type val, word_type c)
166*38fd1498Szrj {
167*38fd1498Szrj #if defined(__GNUC__) && defined(__alpha__)
168*38fd1498Szrj   /* We can get exact results using a compare-bytes instruction.
169*38fd1498Szrj      Get (val == c) via (0 >= (val ^ c)).  */
170*38fd1498Szrj   return __builtin_alpha_cmpbge (0, val ^ c);
171*38fd1498Szrj #else
172*38fd1498Szrj   word_type magic = 0x7efefefeU;
173*38fd1498Szrj   if (sizeof(word_type) == 8)
174*38fd1498Szrj     magic = (magic << 16 << 16) | 0xfefefefeU;
175*38fd1498Szrj   magic |= 1;
176*38fd1498Szrj 
177*38fd1498Szrj   val ^= c;
178*38fd1498Szrj   return ((val + magic) ^ ~val) & ~magic;
179*38fd1498Szrj #endif
180*38fd1498Szrj }
181*38fd1498Szrj 
182*38fd1498Szrj /* Given the result of acc_char_cmp is non-zero, return the index of
183*38fd1498Szrj    the found character.  If this was a false positive, return -1.  */
184*38fd1498Szrj 
185*38fd1498Szrj static inline int
acc_char_index(word_type cmp ATTRIBUTE_UNUSED,word_type val ATTRIBUTE_UNUSED)186*38fd1498Szrj acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187*38fd1498Szrj 		word_type val ATTRIBUTE_UNUSED)
188*38fd1498Szrj {
189*38fd1498Szrj #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190*38fd1498Szrj   /* The cmpbge instruction sets *bits* of the result corresponding to
191*38fd1498Szrj      matches in the bytes with no false positives.  */
192*38fd1498Szrj   return __builtin_ctzl (cmp);
193*38fd1498Szrj #else
194*38fd1498Szrj   unsigned int i;
195*38fd1498Szrj 
196*38fd1498Szrj   /* ??? It would be nice to force unrolling here,
197*38fd1498Szrj      and have all of these constants folded.  */
198*38fd1498Szrj   for (i = 0; i < sizeof(word_type); ++i)
199*38fd1498Szrj     {
200*38fd1498Szrj       uchar c;
201*38fd1498Szrj       if (WORDS_BIGENDIAN)
202*38fd1498Szrj 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203*38fd1498Szrj       else
204*38fd1498Szrj 	c = (val >> i * 8) & 0xff;
205*38fd1498Szrj 
206*38fd1498Szrj       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207*38fd1498Szrj 	return i;
208*38fd1498Szrj     }
209*38fd1498Szrj 
210*38fd1498Szrj   return -1;
211*38fd1498Szrj #endif
212*38fd1498Szrj }
213*38fd1498Szrj 
214*38fd1498Szrj /* A version of the fast scanner using bit fiddling techniques.
215*38fd1498Szrj 
216*38fd1498Szrj    For 32-bit words, one would normally perform 16 comparisons and
217*38fd1498Szrj    16 branches.  With this algorithm one performs 24 arithmetic
218*38fd1498Szrj    operations and one branch.  Whether this is faster with a 32-bit
219*38fd1498Szrj    word size is going to be somewhat system dependent.
220*38fd1498Szrj 
221*38fd1498Szrj    For 64-bit words, we eliminate twice the number of comparisons
222*38fd1498Szrj    and branches without increasing the number of arithmetic operations.
223*38fd1498Szrj    It's almost certainly going to be a win with 64-bit word size.  */
224*38fd1498Szrj 
225*38fd1498Szrj static const uchar * search_line_acc_char (const uchar *, const uchar *)
226*38fd1498Szrj   ATTRIBUTE_UNUSED;
227*38fd1498Szrj 
228*38fd1498Szrj static const uchar *
search_line_acc_char(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)229*38fd1498Szrj search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230*38fd1498Szrj {
231*38fd1498Szrj   const word_type repl_nl = acc_char_replicate ('\n');
232*38fd1498Szrj   const word_type repl_cr = acc_char_replicate ('\r');
233*38fd1498Szrj   const word_type repl_bs = acc_char_replicate ('\\');
234*38fd1498Szrj   const word_type repl_qm = acc_char_replicate ('?');
235*38fd1498Szrj 
236*38fd1498Szrj   unsigned int misalign;
237*38fd1498Szrj   const word_type *p;
238*38fd1498Szrj   word_type val, t;
239*38fd1498Szrj 
240*38fd1498Szrj   /* Align the buffer.  Mask out any bytes from before the beginning.  */
241*38fd1498Szrj   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242*38fd1498Szrj   val = *p;
243*38fd1498Szrj   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244*38fd1498Szrj   if (misalign)
245*38fd1498Szrj     val = acc_char_mask_misalign (val, misalign);
246*38fd1498Szrj 
247*38fd1498Szrj   /* Main loop.  */
248*38fd1498Szrj   while (1)
249*38fd1498Szrj     {
250*38fd1498Szrj       t  = acc_char_cmp (val, repl_nl);
251*38fd1498Szrj       t |= acc_char_cmp (val, repl_cr);
252*38fd1498Szrj       t |= acc_char_cmp (val, repl_bs);
253*38fd1498Szrj       t |= acc_char_cmp (val, repl_qm);
254*38fd1498Szrj 
255*38fd1498Szrj       if (__builtin_expect (t != 0, 0))
256*38fd1498Szrj 	{
257*38fd1498Szrj 	  int i = acc_char_index (t, val);
258*38fd1498Szrj 	  if (i >= 0)
259*38fd1498Szrj 	    return (const uchar *)p + i;
260*38fd1498Szrj 	}
261*38fd1498Szrj 
262*38fd1498Szrj       val = *++p;
263*38fd1498Szrj     }
264*38fd1498Szrj }
265*38fd1498Szrj 
266*38fd1498Szrj /* Disable on Solaris 2/x86 until the following problem can be properly
267*38fd1498Szrj    autoconfed:
268*38fd1498Szrj 
269*38fd1498Szrj    The Solaris 10+ assembler tags objects with the instruction set
270*38fd1498Szrj    extensions used, so SSE4.2 executables cannot run on machines that
271*38fd1498Szrj    don't support that extension.  */
272*38fd1498Szrj 
273*38fd1498Szrj #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274*38fd1498Szrj 
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds sixteen '\n', row 1 sixteen '\r', row 2 sixteen '\\'
   and row 3 sixteen '?'.  */
#define REPL16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
289*38fd1498Szrj 
290*38fd1498Szrj /* A version of the fast scanner using MMX vectorized byte compare insns.
291*38fd1498Szrj 
292*38fd1498Szrj    This uses the PMOVMSKB instruction which was introduced with "MMX2",
293*38fd1498Szrj    which was packaged into SSE1; it is also present in the AMD MMX
294*38fd1498Szrj    extension.  Mark the function as using "sse" so that we emit a real
295*38fd1498Szrj    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
296*38fd1498Szrj 
297*38fd1498Szrj static const uchar *
298*38fd1498Szrj #ifndef __SSE__
299*38fd1498Szrj __attribute__((__target__("sse")))
300*38fd1498Szrj #endif
search_line_mmx(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)301*38fd1498Szrj search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
302*38fd1498Szrj {
303*38fd1498Szrj   typedef char v8qi __attribute__ ((__vector_size__ (8)));
304*38fd1498Szrj   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
305*38fd1498Szrj 
306*38fd1498Szrj   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307*38fd1498Szrj   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308*38fd1498Szrj   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309*38fd1498Szrj   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
310*38fd1498Szrj 
311*38fd1498Szrj   unsigned int misalign, found, mask;
312*38fd1498Szrj   const v8qi *p;
313*38fd1498Szrj   v8qi data, t, c;
314*38fd1498Szrj 
315*38fd1498Szrj   /* Align the source pointer.  While MMX doesn't generate unaligned data
316*38fd1498Szrj      faults, this allows us to safely scan to the end of the buffer without
317*38fd1498Szrj      reading beyond the end of the last page.  */
318*38fd1498Szrj   misalign = (uintptr_t)s & 7;
319*38fd1498Szrj   p = (const v8qi *)((uintptr_t)s & -8);
320*38fd1498Szrj   data = *p;
321*38fd1498Szrj 
322*38fd1498Szrj   /* Create a mask for the bytes that are valid within the first
323*38fd1498Szrj      16-byte block.  The Idea here is that the AND with the mask
324*38fd1498Szrj      within the loop is "free", since we need some AND or TEST
325*38fd1498Szrj      insn in order to set the flags for the branch anyway.  */
326*38fd1498Szrj   mask = -1u << misalign;
327*38fd1498Szrj 
328*38fd1498Szrj   /* Main loop processing 8 bytes at a time.  */
329*38fd1498Szrj   goto start;
330*38fd1498Szrj   do
331*38fd1498Szrj     {
332*38fd1498Szrj       data = *++p;
333*38fd1498Szrj       mask = -1;
334*38fd1498Szrj 
335*38fd1498Szrj     start:
336*38fd1498Szrj       t = __builtin_ia32_pcmpeqb(data, repl_nl);
337*38fd1498Szrj       c = __builtin_ia32_pcmpeqb(data, repl_cr);
338*38fd1498Szrj       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339*38fd1498Szrj       c = __builtin_ia32_pcmpeqb(data, repl_bs);
340*38fd1498Szrj       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341*38fd1498Szrj       c = __builtin_ia32_pcmpeqb(data, repl_qm);
342*38fd1498Szrj       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343*38fd1498Szrj       found = __builtin_ia32_pmovmskb (t);
344*38fd1498Szrj       found &= mask;
345*38fd1498Szrj     }
346*38fd1498Szrj   while (!found);
347*38fd1498Szrj 
348*38fd1498Szrj   __builtin_ia32_emms ();
349*38fd1498Szrj 
350*38fd1498Szrj   /* FOUND contains 1 in bits for which we matched a relevant
351*38fd1498Szrj      character.  Conversion to the byte index is trivial.  */
352*38fd1498Szrj   found = __builtin_ctz(found);
353*38fd1498Szrj   return (const uchar *)p + found;
354*38fd1498Szrj }
355*38fd1498Szrj 
356*38fd1498Szrj /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
357*38fd1498Szrj 
358*38fd1498Szrj static const uchar *
359*38fd1498Szrj #ifndef __SSE2__
360*38fd1498Szrj __attribute__((__target__("sse2")))
361*38fd1498Szrj #endif
search_line_sse2(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)362*38fd1498Szrj search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
363*38fd1498Szrj {
364*38fd1498Szrj   typedef char v16qi __attribute__ ((__vector_size__ (16)));
365*38fd1498Szrj 
366*38fd1498Szrj   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367*38fd1498Szrj   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368*38fd1498Szrj   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369*38fd1498Szrj   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
370*38fd1498Szrj 
371*38fd1498Szrj   unsigned int misalign, found, mask;
372*38fd1498Szrj   const v16qi *p;
373*38fd1498Szrj   v16qi data, t;
374*38fd1498Szrj 
375*38fd1498Szrj   /* Align the source pointer.  */
376*38fd1498Szrj   misalign = (uintptr_t)s & 15;
377*38fd1498Szrj   p = (const v16qi *)((uintptr_t)s & -16);
378*38fd1498Szrj   data = *p;
379*38fd1498Szrj 
380*38fd1498Szrj   /* Create a mask for the bytes that are valid within the first
381*38fd1498Szrj      16-byte block.  The Idea here is that the AND with the mask
382*38fd1498Szrj      within the loop is "free", since we need some AND or TEST
383*38fd1498Szrj      insn in order to set the flags for the branch anyway.  */
384*38fd1498Szrj   mask = -1u << misalign;
385*38fd1498Szrj 
386*38fd1498Szrj   /* Main loop processing 16 bytes at a time.  */
387*38fd1498Szrj   goto start;
388*38fd1498Szrj   do
389*38fd1498Szrj     {
390*38fd1498Szrj       data = *++p;
391*38fd1498Szrj       mask = -1;
392*38fd1498Szrj 
393*38fd1498Szrj     start:
394*38fd1498Szrj       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
395*38fd1498Szrj       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
396*38fd1498Szrj       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
397*38fd1498Szrj       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
398*38fd1498Szrj       found = __builtin_ia32_pmovmskb128 (t);
399*38fd1498Szrj       found &= mask;
400*38fd1498Szrj     }
401*38fd1498Szrj   while (!found);
402*38fd1498Szrj 
403*38fd1498Szrj   /* FOUND contains 1 in bits for which we matched a relevant
404*38fd1498Szrj      character.  Conversion to the byte index is trivial.  */
405*38fd1498Szrj   found = __builtin_ctz(found);
406*38fd1498Szrj   return (const uchar *)p + found;
407*38fd1498Szrj }
408*38fd1498Szrj 
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first '\n', '\r', '?' or
   '\\'.  END is only used to decide whether an unaligned 16-byte read
   at S is safe; the main loop stops only on a match.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (addr & 15)
    {
      v16qi head;

      /* There are less than 16 bytes left in the buffer, and less
         than 16 bytes left on the page.  Reading 16 bytes at this
         point might generate a spurious page fault.  Defer to the
         SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
        return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      /* An index below 16 is a match within the unaligned block.  */
      if (__builtin_expect (index < 16, 0))
        return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;; s += 16)
    {
      char carry;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(carry)
             : "m"(*s), "x"(search), "a"(4), "d"(16));

      /* Stop at the first block for which the carry flag is set.  */
      if (carry)
        break;
    }
#else
  /* The assembly loop advances S before each comparison, so start one
     block early.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri\t$0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
489*38fd1498Szrj 
490*38fd1498Szrj /* Check the CPU capabilities.  */
491*38fd1498Szrj 
492*38fd1498Szrj #include "../gcc/config/i386/cpuid.h"
493*38fd1498Szrj 
494*38fd1498Szrj typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495*38fd1498Szrj static search_line_fast_type search_line_fast;
496*38fd1498Szrj 
497*38fd1498Szrj #define HAVE_init_vectorized_lexer 1
498*38fd1498Szrj static inline void
init_vectorized_lexer(void)499*38fd1498Szrj init_vectorized_lexer (void)
500*38fd1498Szrj {
501*38fd1498Szrj   unsigned dummy, ecx = 0, edx = 0;
502*38fd1498Szrj   search_line_fast_type impl = search_line_acc_char;
503*38fd1498Szrj   int minimum = 0;
504*38fd1498Szrj 
505*38fd1498Szrj #if defined(__SSE4_2__)
506*38fd1498Szrj   minimum = 3;
507*38fd1498Szrj #elif defined(__SSE2__)
508*38fd1498Szrj   minimum = 2;
509*38fd1498Szrj #elif defined(__SSE__)
510*38fd1498Szrj   minimum = 1;
511*38fd1498Szrj #endif
512*38fd1498Szrj 
513*38fd1498Szrj   if (minimum == 3)
514*38fd1498Szrj     impl = search_line_sse42;
515*38fd1498Szrj   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
516*38fd1498Szrj     {
517*38fd1498Szrj       if (minimum == 3 || (ecx & bit_SSE4_2))
518*38fd1498Szrj         impl = search_line_sse42;
519*38fd1498Szrj       else if (minimum == 2 || (edx & bit_SSE2))
520*38fd1498Szrj 	impl = search_line_sse2;
521*38fd1498Szrj       else if (minimum == 1 || (edx & bit_SSE))
522*38fd1498Szrj 	impl = search_line_mmx;
523*38fd1498Szrj     }
524*38fd1498Szrj   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
525*38fd1498Szrj     {
526*38fd1498Szrj       if (minimum == 1
527*38fd1498Szrj 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528*38fd1498Szrj 	impl = search_line_mmx;
529*38fd1498Szrj     }
530*38fd1498Szrj 
531*38fd1498Szrj   search_line_fast = impl;
532*38fd1498Szrj }
533*38fd1498Szrj 
534*38fd1498Szrj #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
535*38fd1498Szrj 
536*38fd1498Szrj /* A vection of the fast scanner using AltiVec vectorized byte compares
537*38fd1498Szrj    and VSX unaligned loads (when VSX is available).  This is otherwise
538*38fd1498Szrj    the same as the pre-GCC 5 version.  */
539*38fd1498Szrj 
540*38fd1498Szrj ATTRIBUTE_NO_SANITIZE_UNDEFINED
541*38fd1498Szrj static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)542*38fd1498Szrj search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
543*38fd1498Szrj {
544*38fd1498Szrj   typedef __attribute__((altivec(vector))) unsigned char vc;
545*38fd1498Szrj 
546*38fd1498Szrj   const vc repl_nl = {
547*38fd1498Szrj     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548*38fd1498Szrj     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
549*38fd1498Szrj   };
550*38fd1498Szrj   const vc repl_cr = {
551*38fd1498Szrj     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552*38fd1498Szrj     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
553*38fd1498Szrj   };
554*38fd1498Szrj   const vc repl_bs = {
555*38fd1498Szrj     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556*38fd1498Szrj     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
557*38fd1498Szrj   };
558*38fd1498Szrj   const vc repl_qm = {
559*38fd1498Szrj     '?', '?', '?', '?', '?', '?', '?', '?',
560*38fd1498Szrj     '?', '?', '?', '?', '?', '?', '?', '?',
561*38fd1498Szrj   };
562*38fd1498Szrj   const vc zero = { 0 };
563*38fd1498Szrj 
564*38fd1498Szrj   vc data, t;
565*38fd1498Szrj 
566*38fd1498Szrj   /* Main loop processing 16 bytes at a time.  */
567*38fd1498Szrj   do
568*38fd1498Szrj     {
569*38fd1498Szrj       vc m_nl, m_cr, m_bs, m_qm;
570*38fd1498Szrj 
571*38fd1498Szrj       data = __builtin_vec_vsx_ld (0, s);
572*38fd1498Szrj       s += 16;
573*38fd1498Szrj 
574*38fd1498Szrj       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575*38fd1498Szrj       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576*38fd1498Szrj       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577*38fd1498Szrj       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578*38fd1498Szrj       t = (m_nl | m_cr) | (m_bs | m_qm);
579*38fd1498Szrj 
580*38fd1498Szrj       /* T now contains 0xff in bytes for which we matched one of the relevant
581*38fd1498Szrj 	 characters.  We want to exit the loop if any byte in T is non-zero.
582*38fd1498Szrj 	 Below is the expansion of vec_any_ne(t, zero).  */
583*38fd1498Szrj     }
584*38fd1498Szrj   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
585*38fd1498Szrj 
586*38fd1498Szrj   /* Restore s to to point to the 16 bytes we just processed.  */
587*38fd1498Szrj   s -= 16;
588*38fd1498Szrj 
589*38fd1498Szrj   {
590*38fd1498Szrj #define N  (sizeof(vc) / sizeof(long))
591*38fd1498Szrj 
592*38fd1498Szrj     union {
593*38fd1498Szrj       vc v;
594*38fd1498Szrj       /* Statically assert that N is 2 or 4.  */
595*38fd1498Szrj       unsigned long l[(N == 2 || N == 4) ? N : -1];
596*38fd1498Szrj     } u;
597*38fd1498Szrj     unsigned long l, i = 0;
598*38fd1498Szrj 
599*38fd1498Szrj     u.v = t;
600*38fd1498Szrj 
601*38fd1498Szrj     /* Find the first word of T that is non-zero.  */
602*38fd1498Szrj     switch (N)
603*38fd1498Szrj       {
604*38fd1498Szrj       case 4:
605*38fd1498Szrj 	l = u.l[i++];
606*38fd1498Szrj 	if (l != 0)
607*38fd1498Szrj 	  break;
608*38fd1498Szrj 	s += sizeof(unsigned long);
609*38fd1498Szrj 	l = u.l[i++];
610*38fd1498Szrj 	if (l != 0)
611*38fd1498Szrj 	  break;
612*38fd1498Szrj 	s += sizeof(unsigned long);
613*38fd1498Szrj 	/* FALLTHRU */
614*38fd1498Szrj       case 2:
615*38fd1498Szrj 	l = u.l[i++];
616*38fd1498Szrj 	if (l != 0)
617*38fd1498Szrj 	  break;
618*38fd1498Szrj 	s += sizeof(unsigned long);
619*38fd1498Szrj 	l = u.l[i];
620*38fd1498Szrj       }
621*38fd1498Szrj 
622*38fd1498Szrj     /* L now contains 0xff in bytes for which we matched one of the
623*38fd1498Szrj        relevant characters.  We can find the byte index by finding
624*38fd1498Szrj        its bit index and dividing by 8.  */
625*38fd1498Szrj #ifdef __BIG_ENDIAN__
626*38fd1498Szrj     l = __builtin_clzl(l) >> 3;
627*38fd1498Szrj #else
628*38fd1498Szrj     l = __builtin_ctzl(l) >> 3;
629*38fd1498Szrj #endif
630*38fd1498Szrj     return s + l;
631*38fd1498Szrj 
632*38fd1498Szrj #undef N
633*38fd1498Szrj   }
634*38fd1498Szrj }
635*38fd1498Szrj 
636*38fd1498Szrj #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
637*38fd1498Szrj 
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
641*38fd1498Szrj /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642*38fd1498Szrj    so we can't compile this function without -maltivec on the command line
643*38fd1498Szrj    (or implied by some other switch).  */
644*38fd1498Szrj 
/* AltiVec variant of search_line_fast.  Starting at S, examine memory in
   16-byte aligned chunks and return a pointer to the first byte that is
   '\n', '\r', '\\' or '?'.  END is unused: the loop below has no bounds
   check and exits only when one of those four bytes is seen.  */
645*38fd1498Szrj static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)646*38fd1498Szrj search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
647*38fd1498Szrj {
648*38fd1498Szrj   typedef __attribute__((altivec(vector))) unsigned char vc;
649*38fd1498Szrj 
  /* One 16-lane constant per character of interest, so a single vector
     compare tests all 16 bytes of a chunk against that character.  */
650*38fd1498Szrj   const vc repl_nl = {
651*38fd1498Szrj     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652*38fd1498Szrj     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
653*38fd1498Szrj   };
654*38fd1498Szrj   const vc repl_cr = {
655*38fd1498Szrj     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656*38fd1498Szrj     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
657*38fd1498Szrj   };
658*38fd1498Szrj   const vc repl_bs = {
659*38fd1498Szrj     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660*38fd1498Szrj     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
661*38fd1498Szrj   };
662*38fd1498Szrj   const vc repl_qm = {
663*38fd1498Szrj     '?', '?', '?', '?', '?', '?', '?', '?',
664*38fd1498Szrj     '?', '?', '?', '?', '?', '?', '?', '?',
665*38fd1498Szrj   };
666*38fd1498Szrj   const vc ones = {
667*38fd1498Szrj     -1, -1, -1, -1, -1, -1, -1, -1,
668*38fd1498Szrj     -1, -1, -1, -1, -1, -1, -1, -1,
669*38fd1498Szrj   };
670*38fd1498Szrj   const vc zero = { 0 };
671*38fd1498Szrj 
672*38fd1498Szrj   vc data, mask, t;
673*38fd1498Szrj 
674*38fd1498Szrj   /* Altivec loads automatically mask addresses with -16.  This lets us
675*38fd1498Szrj      issue the first load as early as possible.  */
676*38fd1498Szrj   data = __builtin_vec_ld(0, (const vc *)s);
677*38fd1498Szrj 
678*38fd1498Szrj   /* Discard bytes before the beginning of the buffer.  Do this by
679*38fd1498Szrj      beginning with all ones and shifting in zeros according to the
680*38fd1498Szrj      mis-alignment.  The LVSR instruction pulls the exact shift we
681*38fd1498Szrj      want from the address.  */
682*38fd1498Szrj   mask = __builtin_vec_lvsr(0, s);
683*38fd1498Szrj   mask = __builtin_vec_perm(zero, ones, mask);
684*38fd1498Szrj   data &= mask;
685*38fd1498Szrj 
686*38fd1498Szrj   /* While altivec loads mask addresses, we still need to align S so
687*38fd1498Szrj      that the offset we compute at the end is correct.  */
688*38fd1498Szrj   s = (const uchar *)((uintptr_t)s & -16);
689*38fd1498Szrj 
690*38fd1498Szrj   /* Main loop processing 16 bytes at a time.  */
  /* Enter the loop at START so that the chunk already loaded and masked
     above is tested before S is advanced to the next chunk.  */
691*38fd1498Szrj   goto start;
692*38fd1498Szrj   do
693*38fd1498Szrj     {
694*38fd1498Szrj       vc m_nl, m_cr, m_bs, m_qm;
695*38fd1498Szrj 
696*38fd1498Szrj       s += 16;
697*38fd1498Szrj       data = __builtin_vec_ld(0, (const vc *)s);
698*38fd1498Szrj 
699*38fd1498Szrj     start:
700*38fd1498Szrj       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701*38fd1498Szrj       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702*38fd1498Szrj       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703*38fd1498Szrj       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704*38fd1498Szrj       t = (m_nl | m_cr) | (m_bs | m_qm);
705*38fd1498Szrj 
706*38fd1498Szrj       /* T now contains 0xff in bytes for which we matched one of the relevant
707*38fd1498Szrj 	 characters.  We want to exit the loop if any byte in T is non-zero.
708*38fd1498Szrj 	 Below is the expansion of vec_any_ne(t, zero).  */
709*38fd1498Szrj     }
710*38fd1498Szrj   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
711*38fd1498Szrj 
712*38fd1498Szrj   {
    /* N is the number of unsigned longs that make up one vector.  */
713*38fd1498Szrj #define N  (sizeof(vc) / sizeof(long))
714*38fd1498Szrj 
    /* View the match vector T as N unsigned longs so that the first
       non-zero word can be located with scalar code.  */
715*38fd1498Szrj     union {
716*38fd1498Szrj       vc v;
717*38fd1498Szrj       /* Statically assert that N is 2 or 4.  */
718*38fd1498Szrj       unsigned long l[(N == 2 || N == 4) ? N : -1];
719*38fd1498Szrj     } u;
720*38fd1498Szrj     unsigned long l, i = 0;
721*38fd1498Szrj 
722*38fd1498Szrj     u.v = t;
723*38fd1498Szrj 
724*38fd1498Szrj     /* Find the first word of T that is non-zero.  */
    /* S is advanced past every all-zero word that is skipped, so it ends
       up addressing the start of the word held in L.  When N is 2 the
       switch enters directly at "case 2".  */
725*38fd1498Szrj     switch (N)
726*38fd1498Szrj       {
727*38fd1498Szrj       case 4:
728*38fd1498Szrj 	l = u.l[i++];
729*38fd1498Szrj 	if (l != 0)
730*38fd1498Szrj 	  break;
731*38fd1498Szrj 	s += sizeof(unsigned long);
732*38fd1498Szrj 	l = u.l[i++];
733*38fd1498Szrj 	if (l != 0)
734*38fd1498Szrj 	  break;
735*38fd1498Szrj 	s += sizeof(unsigned long);
736*38fd1498Szrj 	/* FALLTHROUGH */
737*38fd1498Szrj       case 2:
738*38fd1498Szrj 	l = u.l[i++];
739*38fd1498Szrj 	if (l != 0)
740*38fd1498Szrj 	  break;
741*38fd1498Szrj 	s += sizeof(unsigned long);
742*38fd1498Szrj 	l = u.l[i];
743*38fd1498Szrj       }
744*38fd1498Szrj 
745*38fd1498Szrj     /* L now contains 0xff in bytes for which we matched one of the
746*38fd1498Szrj        relevant characters.  We can find the byte index by finding
747*38fd1498Szrj        its bit index and dividing by 8.  */
    /* This variant is only compiled for big-endian targets (see the
       enclosing #elif), where the lowest-addressed byte is the most
       significant one, hence the count of leading zeros.  */
748*38fd1498Szrj     l = __builtin_clzl(l) >> 3;
749*38fd1498Szrj     return s + l;
750*38fd1498Szrj 
751*38fd1498Szrj #undef N
752*38fd1498Szrj   }
753*38fd1498Szrj }
754*38fd1498Szrj 
755*38fd1498Szrj #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756*38fd1498Szrj #include "arm_neon.h"
757*38fd1498Szrj 
758*38fd1498Szrj /* This doesn't have to be the exact page size, but no system may use
759*38fd1498Szrj    a size smaller than this.  ARMv8 requires a minimum page size of
760*38fd1498Szrj    4k.  The impact of being conservative here is a small number of
761*38fd1498Szrj    cases will take the slightly slower entry path into the main
762*38fd1498Szrj    loop.  */
763*38fd1498Szrj 
764*38fd1498Szrj #define AARCH64_MIN_PAGE_SIZE 4096
765*38fd1498Szrj 
/* AArch64 Advanced SIMD variant of search_line_fast.  Returns a pointer
   to the first '\n', '\r', '\\' or '?' byte at or after S.  END is unused:
   the main loop has no bounds check and stops only when one of those
   bytes is seen.  */
766*38fd1498Szrj static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)767*38fd1498Szrj search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
768*38fd1498Szrj {
769*38fd1498Szrj   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770*38fd1498Szrj   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771*38fd1498Szrj   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772*38fd1498Szrj   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Each byte of XMASK holds one distinct bit (0x01 .. 0x80 within each
     64-bit half).  ANDing a 0x00/0xff compare result with it leaves a
     unique bit for every matching byte of a half.  */
773*38fd1498Szrj   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
774*38fd1498Szrj 
  /* After the pairwise byte add there are eight 16-bit lanes, four per
     half.  Shifting the lanes of one half left by 8 lets a plain
     across-vector add assemble a single 16-bit match mask.  */
775*38fd1498Szrj #ifdef __ARM_BIG_ENDIAN
776*38fd1498Szrj   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777*38fd1498Szrj #else
778*38fd1498Szrj   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779*38fd1498Szrj #endif
780*38fd1498Szrj 
781*38fd1498Szrj   unsigned int found;
782*38fd1498Szrj   const uint8_t *p;
783*38fd1498Szrj   uint8x16_t data;
784*38fd1498Szrj   uint8x16_t t;
785*38fd1498Szrj   uint16x8_t m;
786*38fd1498Szrj   uint8x16_t u, v, w;
787*38fd1498Szrj 
788*38fd1498Szrj   /* Align the source pointer.  */
789*38fd1498Szrj   p = (const uint8_t *)((uintptr_t)s & -16);
790*38fd1498Szrj 
791*38fd1498Szrj   /* Assuming random string start positions, with a 4k page size we'll take
792*38fd1498Szrj      the slow path about 0.37% of the time.  */
  /* The test is true when fewer than 16 bytes remain between S and the
     end of its AARCH64_MIN_PAGE_SIZE-sized page.  */
793*38fd1498Szrj   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794*38fd1498Szrj 			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795*38fd1498Szrj 			< 16, 0))
796*38fd1498Szrj     {
797*38fd1498Szrj       /* Slow path: the string starts near a possible page boundary.  */
      /* Load from the aligned address P instead of S, then use MASK to
	 drop the match bits belonging to the MISALIGN bytes that precede
	 S in that chunk.  */
798*38fd1498Szrj       uint32_t misalign, mask;
799*38fd1498Szrj 
800*38fd1498Szrj       misalign = (uintptr_t)s & 15;
801*38fd1498Szrj       mask = (-1u << misalign) & 0xffff;
802*38fd1498Szrj       data = vld1q_u8 (p);
803*38fd1498Szrj       t = vceqq_u8 (data, repl_nl);
804*38fd1498Szrj       u = vceqq_u8 (data, repl_cr);
805*38fd1498Szrj       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806*38fd1498Szrj       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807*38fd1498Szrj       t = vorrq_u8 (v, w);
808*38fd1498Szrj       t = vandq_u8 (t, xmask);
809*38fd1498Szrj       m = vpaddlq_u8 (t);
810*38fd1498Szrj       m = vshlq_u16 (m, shift);
811*38fd1498Szrj       found = vaddvq_u16 (m);
812*38fd1498Szrj       found &= mask;
813*38fd1498Szrj       if (found)
814*38fd1498Szrj 	return (const uchar*)p + __builtin_ctz (found);
815*38fd1498Szrj     }
816*38fd1498Szrj   else
817*38fd1498Szrj     {
      /* Fast path: at least 16 bytes remain in the page, so load the
	 first chunk directly from the (possibly unaligned) address S.  */
818*38fd1498Szrj       data = vld1q_u8 ((const uint8_t *) s);
819*38fd1498Szrj       t = vceqq_u8 (data, repl_nl);
820*38fd1498Szrj       u = vceqq_u8 (data, repl_cr);
821*38fd1498Szrj       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822*38fd1498Szrj       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823*38fd1498Szrj       t = vorrq_u8 (v, w);
      /* Adding the two 64-bit halves gives a non-zero value when any
	 byte of T is set.  */
824*38fd1498Szrj       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825*38fd1498Szrj 	goto done;
826*38fd1498Szrj     }
827*38fd1498Szrj 
  /* Main loop over aligned 16-byte chunks.  P was rounded down from S, so
     after the unaligned fast-path load the first iteration may look again
     at bytes that were already checked and did not match.  */
828*38fd1498Szrj   do
829*38fd1498Szrj     {
830*38fd1498Szrj       p += 16;
831*38fd1498Szrj       data = vld1q_u8 (p);
832*38fd1498Szrj       t = vceqq_u8 (data, repl_nl);
833*38fd1498Szrj       u = vceqq_u8 (data, repl_cr);
834*38fd1498Szrj       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835*38fd1498Szrj       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836*38fd1498Szrj       t = vorrq_u8 (v, w);
837*38fd1498Szrj     } while (!vpaddd_u64 ((uint64x2_t)t));
838*38fd1498Szrj 
839*38fd1498Szrj done:
840*38fd1498Szrj   /* Now that we've found the terminating substring, work out precisely where
841*38fd1498Szrj      we need to stop.  */
842*38fd1498Szrj   t = vandq_u8 (t, xmask);
843*38fd1498Szrj   m = vpaddlq_u8 (t);
844*38fd1498Szrj   m = vshlq_u16 (m, shift);
845*38fd1498Szrj   found = vaddvq_u16 (m);
  /* P is below S only when the match came from the unaligned first load
     of the fast path; the bit index is then relative to S.  Otherwise it
     is relative to the aligned chunk at P.  */
846*38fd1498Szrj   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847*38fd1498Szrj 	  + __builtin_ctz (found));
848*38fd1498Szrj }
849*38fd1498Szrj 
850*38fd1498Szrj #elif defined (__ARM_NEON)
851*38fd1498Szrj #include "arm_neon.h"
852*38fd1498Szrj 
/* ARM NEON variant of search_line_fast.  Scans aligned 16-byte chunks
   beginning with the one that contains S and returns a pointer to the
   first '\n', '\r', '\\' or '?' byte at or after S.  END is unused: the
   loop has no bounds check and stops only when one of those bytes is
   seen.  */
853*38fd1498Szrj static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)854*38fd1498Szrj search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
855*38fd1498Szrj {
856*38fd1498Szrj   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857*38fd1498Szrj   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858*38fd1498Szrj   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859*38fd1498Szrj   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Each byte of XMASK holds one distinct bit (0x01 .. 0x80 within each
     64-bit half), so a masked compare result keeps one unique bit per
     matching byte of a half.  */
860*38fd1498Szrj   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
861*38fd1498Szrj 
862*38fd1498Szrj   unsigned int misalign, found, mask;
863*38fd1498Szrj   const uint8_t *p;
864*38fd1498Szrj   uint8x16_t data;
865*38fd1498Szrj 
866*38fd1498Szrj   /* Align the source pointer.  */
867*38fd1498Szrj   misalign = (uintptr_t)s & 15;
868*38fd1498Szrj   p = (const uint8_t *)((uintptr_t)s & -16);
869*38fd1498Szrj   data = vld1q_u8 (p);
870*38fd1498Szrj 
871*38fd1498Szrj   /* Create a mask for the bytes that are valid within the first
872*38fd1498Szrj      16-byte block.  The Idea here is that the AND with the mask
873*38fd1498Szrj      within the loop is "free", since we need some AND or TEST
874*38fd1498Szrj      insn in order to set the flags for the branch anyway.  */
875*38fd1498Szrj   mask = (-1u << misalign) & 0xffff;
876*38fd1498Szrj 
877*38fd1498Szrj   /* Main loop, processing 16 bytes at a time.  */
  /* Enter at START so that the chunk loaded above is tested with the
     partial MASK before P is advanced.  */
878*38fd1498Szrj   goto start;
879*38fd1498Szrj 
880*38fd1498Szrj   do
881*38fd1498Szrj     {
882*38fd1498Szrj       uint8x8_t l;
883*38fd1498Szrj       uint16x4_t m;
884*38fd1498Szrj       uint32x2_t n;
885*38fd1498Szrj       uint8x16_t t, u, v, w;
886*38fd1498Szrj 
887*38fd1498Szrj       p += 16;
888*38fd1498Szrj       data = vld1q_u8 (p);
      /* Every byte of the second and later chunks is valid.  */
889*38fd1498Szrj       mask = 0xffff;
890*38fd1498Szrj 
891*38fd1498Szrj     start:
892*38fd1498Szrj       t = vceqq_u8 (data, repl_nl);
893*38fd1498Szrj       u = vceqq_u8 (data, repl_cr);
894*38fd1498Szrj       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895*38fd1498Szrj       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896*38fd1498Szrj       t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Three rounds of pairwise adds fold the per-byte bits of each
	 8-byte half into one 32-bit lane of N.  */
897*38fd1498Szrj       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898*38fd1498Szrj       m = vpaddl_u8 (l);
899*38fd1498Szrj       n = vpaddl_u16 (m);
900*38fd1498Szrj 
      /* Treat N as one 64-bit value and OR in a copy shifted right by
	 24, which places the upper lane's byte at bit 8 of lane 0.  Lane
	 0 then holds the 16-bit match mask.  */
901*38fd1498Szrj       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902*38fd1498Szrj 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903*38fd1498Szrj       found &= mask;
904*38fd1498Szrj     }
905*38fd1498Szrj   while (!found);
906*38fd1498Szrj 
907*38fd1498Szrj   /* FOUND contains 1 in bits for which we matched a relevant
908*38fd1498Szrj      character.  Conversion to the byte index is trivial.  */
909*38fd1498Szrj   found = __builtin_ctz (found);
910*38fd1498Szrj   return (const uchar *)p + found;
911*38fd1498Szrj }
912*38fd1498Szrj 
913*38fd1498Szrj #else
914*38fd1498Szrj 
915*38fd1498Szrj /* We only have one accelerated alternative.  Use a direct call so that
916*38fd1498Szrj    we encourage inlining.
   This is the final #else of the target-specific chain above: no
   vectorized scanner was selected, so search_line_fast is simply an
   alias for search_line_acc_char.  */
917*38fd1498Szrj 
918*38fd1498Szrj #define search_line_fast  search_line_acc_char
919*38fd1498Szrj 
920*38fd1498Szrj #endif
921*38fd1498Szrj 
/* Set up the lexer.  When HAVE_init_vectorized_lexer is defined this
   calls init_vectorized_lexer; otherwise there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
931*38fd1498Szrj 
932*38fd1498Szrj /* Returns with a logical line that contains no escaped newlines or
933*38fd1498Szrj    trigraphs.  This is a time-critical inner loop.

   The line starts at buffer->next_line.  On return buffer->cur and
   buffer->line_base point at that start, the cleaned line has been
   terminated in place with '\n', and buffer->next_line points just past
   the line ending that was consumed.  A line note is recorded for every
   trigraph and escaped newline encountered, followed by a sentinel note.
   When buffer->from_stage3 is set, only the end of the line is located;
   no trigraph or escaped-newline processing is done.  */
934*38fd1498Szrj void
_cpp_clean_line(cpp_reader * pfile)935*38fd1498Szrj _cpp_clean_line (cpp_reader *pfile)
936*38fd1498Szrj {
937*38fd1498Szrj   cpp_buffer *buffer;
  /* S is the read position, D the write position (where the terminating
     '\n' is finally stored) and P a scratch pointer used when scanning
     backwards for an escaping backslash.  */
938*38fd1498Szrj   const uchar *s;
939*38fd1498Szrj   uchar c, *d, *p;
940*38fd1498Szrj 
941*38fd1498Szrj   buffer = pfile->buffer;
942*38fd1498Szrj   buffer->cur_note = buffer->notes_used = 0;
943*38fd1498Szrj   buffer->cur = buffer->line_base = buffer->next_line;
944*38fd1498Szrj   buffer->need_line = false;
945*38fd1498Szrj   s = buffer->next_line;
946*38fd1498Szrj 
947*38fd1498Szrj   if (!buffer->from_stage3)
948*38fd1498Szrj     {
      /* Location of the most recent backslash seen on the fast path, or
	 NULL if there has been none.  */
949*38fd1498Szrj       const uchar *pbackslash = NULL;
950*38fd1498Szrj 
951*38fd1498Szrj       /* Fast path.  This is the common case of an un-escaped line with
952*38fd1498Szrj 	 no trigraphs.  The primary win here is by not writing any
953*38fd1498Szrj 	 data back to memory until we have to.  */
954*38fd1498Szrj       while (1)
955*38fd1498Szrj 	{
956*38fd1498Szrj 	  /* Perform an optimized search for \n, \r, \\, ?.  */
957*38fd1498Szrj 	  s = search_line_fast (s, buffer->rlimit);
958*38fd1498Szrj 
959*38fd1498Szrj 	  c = *s;
960*38fd1498Szrj 	  if (c == '\\')
961*38fd1498Szrj 	    {
962*38fd1498Szrj 	      /* Record the location of the backslash and continue.  */
963*38fd1498Szrj 	      pbackslash = s++;
964*38fd1498Szrj 	    }
965*38fd1498Szrj 	  else if (__builtin_expect (c == '?', 0))
966*38fd1498Szrj 	    {
967*38fd1498Szrj 	      if (__builtin_expect (s[1] == '?', false)
968*38fd1498Szrj 		   && _cpp_trigraph_map[s[2]])
969*38fd1498Szrj 		{
970*38fd1498Szrj 		  /* Have a trigraph.  We may or may not have to convert
971*38fd1498Szrj 		     it.  Add a line note regardless, for -Wtrigraphs.  */
972*38fd1498Szrj 		  add_line_note (buffer, s, s[2]);
973*38fd1498Szrj 		  if (CPP_OPTION (pfile, trigraphs))
974*38fd1498Szrj 		    {
975*38fd1498Szrj 		      /* We do, and that means we have to switch to the
976*38fd1498Szrj 		         slow path.  */
		      /* The replacement character overwrites the first
			 '?'.  S is left on the last character of the
			 trigraph because the slow path pre-increments
			 it.  */
977*38fd1498Szrj 		      d = (uchar *) s;
978*38fd1498Szrj 		      *d = _cpp_trigraph_map[s[2]];
979*38fd1498Szrj 		      s += 2;
980*38fd1498Szrj 		      goto slow_path;
981*38fd1498Szrj 		    }
982*38fd1498Szrj 		}
983*38fd1498Szrj 	      /* Not a trigraph.  Continue on fast-path.  */
984*38fd1498Szrj 	      s++;
985*38fd1498Szrj 	    }
986*38fd1498Szrj 	  else
987*38fd1498Szrj 	    break;
988*38fd1498Szrj 	}
989*38fd1498Szrj 
990*38fd1498Szrj       /* This must be \r or \n.  We're either done, or we'll be forced
991*38fd1498Szrj 	 to write back to the buffer and continue on the slow path.  */
992*38fd1498Szrj       d = (uchar *) s;
993*38fd1498Szrj 
994*38fd1498Szrj       if (__builtin_expect (s == buffer->rlimit, false))
995*38fd1498Szrj 	goto done;
996*38fd1498Szrj 
997*38fd1498Szrj       /* DOS line ending? */
998*38fd1498Szrj       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
999*38fd1498Szrj 	{
1000*38fd1498Szrj 	  s++;
1001*38fd1498Szrj 	  if (s == buffer->rlimit)
1002*38fd1498Szrj 	    goto done;
1003*38fd1498Szrj 	}
1004*38fd1498Szrj 
      /* No backslash on this line means the newline cannot be escaped.  */
1005*38fd1498Szrj       if (__builtin_expect (pbackslash == NULL, true))
1006*38fd1498Szrj 	goto done;
1007*38fd1498Szrj 
1008*38fd1498Szrj       /* Check for escaped newline.  */
      /* Step back over non-vertical whitespace before the line ending;
	 the newline is escaped only if the character just before that
	 whitespace is the last backslash recorded.  */
1009*38fd1498Szrj       p = d;
1010*38fd1498Szrj       while (is_nvspace (p[-1]))
1011*38fd1498Szrj 	p--;
1012*38fd1498Szrj       if (p - 1 != pbackslash)
1013*38fd1498Szrj 	goto done;
1014*38fd1498Szrj 
1015*38fd1498Szrj       /* Have an escaped newline; process it and proceed to
1016*38fd1498Szrj 	 the slow path.  */
      /* The note type is ' ' when whitespace separated the backslash
	 from the newline, '\\' otherwise.  D is set to p - 2 because the
	 slow path pre-increments it, so the next store overwrites the
	 backslash.  next_line is used as the lower bound for the
	 slow path's backward scans.  */
1017*38fd1498Szrj       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018*38fd1498Szrj       d = p - 2;
1019*38fd1498Szrj       buffer->next_line = p - 1;
1020*38fd1498Szrj 
      /* Slow path: copy characters from S down to D one at a time,
	 dropping escaped newlines and replacing trigraphs as we go.  */
1021*38fd1498Szrj     slow_path:
1022*38fd1498Szrj       while (1)
1023*38fd1498Szrj 	{
1024*38fd1498Szrj 	  c = *++s;
1025*38fd1498Szrj 	  *++d = c;
1026*38fd1498Szrj 
1027*38fd1498Szrj 	  if (c == '\n' || c == '\r')
1028*38fd1498Szrj 	    {
1029*38fd1498Szrj 	      /* Handle DOS line endings.  */
1030*38fd1498Szrj 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031*38fd1498Szrj 		s++;
1032*38fd1498Szrj 	      if (s == buffer->rlimit)
1033*38fd1498Szrj 		break;
1034*38fd1498Szrj 
1035*38fd1498Szrj 	      /* Escaped?  */
1036*38fd1498Szrj 	      p = d;
1037*38fd1498Szrj 	      while (p != buffer->next_line && is_nvspace (p[-1]))
1038*38fd1498Szrj 		p--;
1039*38fd1498Szrj 	      if (p == buffer->next_line || p[-1] != '\\')
1040*38fd1498Szrj 		break;
1041*38fd1498Szrj 
1042*38fd1498Szrj 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043*38fd1498Szrj 	      d = p - 2;
1044*38fd1498Szrj 	      buffer->next_line = p - 1;
1045*38fd1498Szrj 	    }
1046*38fd1498Szrj 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047*38fd1498Szrj 	    {
1048*38fd1498Szrj 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049*38fd1498Szrj 	      add_line_note (buffer, d, s[2]);
1050*38fd1498Szrj 	      if (CPP_OPTION (pfile, trigraphs))
1051*38fd1498Szrj 		{
1052*38fd1498Szrj 		  *d = _cpp_trigraph_map[s[2]];
1053*38fd1498Szrj 		  s += 2;
1054*38fd1498Szrj 		}
1055*38fd1498Szrj 	    }
1056*38fd1498Szrj 	}
1057*38fd1498Szrj     }
1058*38fd1498Szrj   else
1059*38fd1498Szrj     {
      /* from_stage3 is set: just find the end of the line.  */
1060*38fd1498Szrj       while (*s != '\n' && *s != '\r')
1061*38fd1498Szrj 	s++;
1062*38fd1498Szrj       d = (uchar *) s;
1063*38fd1498Szrj 
1064*38fd1498Szrj       /* Handle DOS line endings.  */
1065*38fd1498Szrj       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066*38fd1498Szrj 	s++;
1067*38fd1498Szrj     }
1068*38fd1498Szrj 
1069*38fd1498Szrj  done:
  /* Whatever line ending was found, the cleaned line ends with a single
     '\n' at D.  */
1070*38fd1498Szrj   *d = '\n';
1071*38fd1498Szrj   /* A sentinel note that should never be processed.  */
1072*38fd1498Szrj   add_line_note (buffer, d + 1, '\n');
1073*38fd1498Szrj   buffer->next_line = s + 1;
1074*38fd1498Szrj }
1075*38fd1498Szrj 
1076*38fd1498Szrj /* Return true if the trigraph indicated by NOTE should be warned
1077*38fd1498Szrj    about in a comment.  */
1078*38fd1498Szrj static bool
warn_in_comment(cpp_reader * pfile,_cpp_line_note * note)1079*38fd1498Szrj warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080*38fd1498Szrj {
1081*38fd1498Szrj   const uchar *p;
1082*38fd1498Szrj 
1083*38fd1498Szrj   /* Within comments we don't warn about trigraphs, unless the
1084*38fd1498Szrj      trigraph forms an escaped newline, as that may change
1085*38fd1498Szrj      behavior.  */
1086*38fd1498Szrj   if (note->type != '/')
1087*38fd1498Szrj     return false;
1088*38fd1498Szrj 
1089*38fd1498Szrj   /* If -trigraphs, then this was an escaped newline iff the next note
1090*38fd1498Szrj      is coincident.  */
1091*38fd1498Szrj   if (CPP_OPTION (pfile, trigraphs))
1092*38fd1498Szrj     return note[1].pos == note->pos;
1093*38fd1498Szrj 
1094*38fd1498Szrj   /* Otherwise, see if this forms an escaped newline.  */
1095*38fd1498Szrj   p = note->pos + 3;
1096*38fd1498Szrj   while (is_nvspace (*p))
1097*38fd1498Szrj     p++;
1098*38fd1498Szrj 
1099*38fd1498Szrj   /* There might have been escaped newlines between the trigraph and the
1100*38fd1498Szrj      newline we found.  Hence the position test.  */
1101*38fd1498Szrj   return (*p == '\n' && p < note[1].pos);
1102*38fd1498Szrj }
1103*38fd1498Szrj 
1104*38fd1498Szrj /* Process the notes created by add_line_note as far as the current
1105*38fd1498Szrj    location.  */
1106*38fd1498Szrj void
_cpp_process_line_notes(cpp_reader * pfile,int in_comment)1107*38fd1498Szrj _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108*38fd1498Szrj {
1109*38fd1498Szrj   cpp_buffer *buffer = pfile->buffer;
1110*38fd1498Szrj 
1111*38fd1498Szrj   for (;;)
1112*38fd1498Szrj     {
1113*38fd1498Szrj       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114*38fd1498Szrj       unsigned int col;
1115*38fd1498Szrj 
1116*38fd1498Szrj       if (note->pos > buffer->cur)
1117*38fd1498Szrj 	break;
1118*38fd1498Szrj 
1119*38fd1498Szrj       buffer->cur_note++;
1120*38fd1498Szrj       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121*38fd1498Szrj 
1122*38fd1498Szrj       if (note->type == '\\' || note->type == ' ')
1123*38fd1498Szrj 	{
1124*38fd1498Szrj 	  if (note->type == ' ' && !in_comment)
1125*38fd1498Szrj 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126*38fd1498Szrj 				 "backslash and newline separated by space");
1127*38fd1498Szrj 
1128*38fd1498Szrj 	  if (buffer->next_line > buffer->rlimit)
1129*38fd1498Szrj 	    {
1130*38fd1498Szrj 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131*38fd1498Szrj 				   "backslash-newline at end of file");
1132*38fd1498Szrj 	      /* Prevent "no newline at end of file" warning.  */
1133*38fd1498Szrj 	      buffer->next_line = buffer->rlimit;
1134*38fd1498Szrj 	    }
1135*38fd1498Szrj 
1136*38fd1498Szrj 	  buffer->line_base = note->pos;
1137*38fd1498Szrj 	  CPP_INCREMENT_LINE (pfile, 0);
1138*38fd1498Szrj 	}
1139*38fd1498Szrj       else if (_cpp_trigraph_map[note->type])
1140*38fd1498Szrj 	{
1141*38fd1498Szrj 	  if (CPP_OPTION (pfile, warn_trigraphs)
1142*38fd1498Szrj 	      && (!in_comment || warn_in_comment (pfile, note)))
1143*38fd1498Szrj 	    {
1144*38fd1498Szrj 	      if (CPP_OPTION (pfile, trigraphs))
1145*38fd1498Szrj 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146*38fd1498Szrj                                        pfile->line_table->highest_line, col,
1147*38fd1498Szrj 				       "trigraph ??%c converted to %c",
1148*38fd1498Szrj 				       note->type,
1149*38fd1498Szrj 				       (int) _cpp_trigraph_map[note->type]);
1150*38fd1498Szrj 	      else
1151*38fd1498Szrj 		{
1152*38fd1498Szrj 		  cpp_warning_with_line
1153*38fd1498Szrj 		    (pfile, CPP_W_TRIGRAPHS,
1154*38fd1498Szrj                      pfile->line_table->highest_line, col,
1155*38fd1498Szrj 		     "trigraph ??%c ignored, use -trigraphs to enable",
1156*38fd1498Szrj 		     note->type);
1157*38fd1498Szrj 		}
1158*38fd1498Szrj 	    }
1159*38fd1498Szrj 	}
1160*38fd1498Szrj       else if (note->type == 0)
1161*38fd1498Szrj 	/* Already processed in lex_raw_string.  */;
1162*38fd1498Szrj       else
1163*38fd1498Szrj 	abort ();
1164*38fd1498Szrj     }
1165*38fd1498Szrj }
1166*38fd1498Szrj 
1167*38fd1498Szrj /* Skip a C-style block comment.  We find the end of the comment by
1168*38fd1498Szrj    seeing if an asterisk is before every '/' we encounter.  Returns
1169*38fd1498Szrj    nonzero if comment terminated by EOF, zero otherwise.
1170*38fd1498Szrj 
1171*38fd1498Szrj    Buffer->cur points to the initial asterisk of the comment.  */
1172*38fd1498Szrj bool
_cpp_skip_block_comment(cpp_reader * pfile)1173*38fd1498Szrj _cpp_skip_block_comment (cpp_reader *pfile)
1174*38fd1498Szrj {
1175*38fd1498Szrj   cpp_buffer *buffer = pfile->buffer;
1176*38fd1498Szrj   const uchar *cur = buffer->cur;
1177*38fd1498Szrj   uchar c;
1178*38fd1498Szrj 
1179*38fd1498Szrj   cur++;
1180*38fd1498Szrj   if (*cur == '/')
1181*38fd1498Szrj     cur++;
1182*38fd1498Szrj 
1183*38fd1498Szrj   for (;;)
1184*38fd1498Szrj     {
1185*38fd1498Szrj       /* People like decorating comments with '*', so check for '/'
1186*38fd1498Szrj 	 instead for efficiency.  */
1187*38fd1498Szrj       c = *cur++;
1188*38fd1498Szrj 
1189*38fd1498Szrj       if (c == '/')
1190*38fd1498Szrj 	{
1191*38fd1498Szrj 	  if (cur[-2] == '*')
1192*38fd1498Szrj 	    break;
1193*38fd1498Szrj 
1194*38fd1498Szrj 	  /* Warn about potential nested comments, but not if the '/'
1195*38fd1498Szrj 	     comes immediately before the true comment delimiter.
1196*38fd1498Szrj 	     Don't bother to get it right across escaped newlines.  */
1197*38fd1498Szrj 	  if (CPP_OPTION (pfile, warn_comments)
1198*38fd1498Szrj 	      && cur[0] == '*' && cur[1] != '/')
1199*38fd1498Szrj 	    {
1200*38fd1498Szrj 	      buffer->cur = cur;
1201*38fd1498Szrj 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202*38fd1498Szrj 				     pfile->line_table->highest_line,
1203*38fd1498Szrj 				     CPP_BUF_COL (buffer),
1204*38fd1498Szrj 				     "\"/*\" within comment");
1205*38fd1498Szrj 	    }
1206*38fd1498Szrj 	}
1207*38fd1498Szrj       else if (c == '\n')
1208*38fd1498Szrj 	{
1209*38fd1498Szrj 	  unsigned int cols;
1210*38fd1498Szrj 	  buffer->cur = cur - 1;
1211*38fd1498Szrj 	  _cpp_process_line_notes (pfile, true);
1212*38fd1498Szrj 	  if (buffer->next_line >= buffer->rlimit)
1213*38fd1498Szrj 	    return true;
1214*38fd1498Szrj 	  _cpp_clean_line (pfile);
1215*38fd1498Szrj 
1216*38fd1498Szrj 	  cols = buffer->next_line - buffer->line_base;
1217*38fd1498Szrj 	  CPP_INCREMENT_LINE (pfile, cols);
1218*38fd1498Szrj 
1219*38fd1498Szrj 	  cur = buffer->cur;
1220*38fd1498Szrj 	}
1221*38fd1498Szrj     }
1222*38fd1498Szrj 
1223*38fd1498Szrj   buffer->cur = cur;
1224*38fd1498Szrj   _cpp_process_line_notes (pfile, true);
1225*38fd1498Szrj   return false;
1226*38fd1498Szrj }
1227*38fd1498Szrj 
1228*38fd1498Szrj /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229*38fd1498Szrj    terminating newline.  Handles escaped newlines.  Returns nonzero
1230*38fd1498Szrj    if a multiline comment.  */
1231*38fd1498Szrj static int
skip_line_comment(cpp_reader * pfile)1232*38fd1498Szrj skip_line_comment (cpp_reader *pfile)
1233*38fd1498Szrj {
1234*38fd1498Szrj   cpp_buffer *buffer = pfile->buffer;
1235*38fd1498Szrj   source_location orig_line = pfile->line_table->highest_line;
1236*38fd1498Szrj 
1237*38fd1498Szrj   while (*buffer->cur != '\n')
1238*38fd1498Szrj     buffer->cur++;
1239*38fd1498Szrj 
1240*38fd1498Szrj   _cpp_process_line_notes (pfile, true);
1241*38fd1498Szrj   return orig_line != pfile->line_table->highest_line;
1242*38fd1498Szrj }
1243*38fd1498Szrj 
1244*38fd1498Szrj /* Skips whitespace, saving the next non-whitespace character.  */
1245*38fd1498Szrj static void
skip_whitespace(cpp_reader * pfile,cppchar_t c)1246*38fd1498Szrj skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247*38fd1498Szrj {
1248*38fd1498Szrj   cpp_buffer *buffer = pfile->buffer;
1249*38fd1498Szrj   bool saw_NUL = false;
1250*38fd1498Szrj 
1251*38fd1498Szrj   do
1252*38fd1498Szrj     {
1253*38fd1498Szrj       /* Horizontal space always OK.  */
1254*38fd1498Szrj       if (c == ' ' || c == '\t')
1255*38fd1498Szrj 	;
1256*38fd1498Szrj       /* Just \f \v or \0 left.  */
1257*38fd1498Szrj       else if (c == '\0')
1258*38fd1498Szrj 	saw_NUL = true;
1259*38fd1498Szrj       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260*38fd1498Szrj 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261*38fd1498Szrj 			     CPP_BUF_COL (buffer),
1262*38fd1498Szrj 			     "%s in preprocessing directive",
1263*38fd1498Szrj 			     c == '\f' ? "form feed" : "vertical tab");
1264*38fd1498Szrj 
1265*38fd1498Szrj       c = *buffer->cur++;
1266*38fd1498Szrj     }
1267*38fd1498Szrj   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268*38fd1498Szrj   while (is_nvspace (c));
1269*38fd1498Szrj 
1270*38fd1498Szrj   if (saw_NUL)
1271*38fd1498Szrj     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272*38fd1498Szrj 
1273*38fd1498Szrj   buffer->cur--;
1274*38fd1498Szrj }
1275*38fd1498Szrj 
1276*38fd1498Szrj /* See if the characters of a number token are valid in a name (no
1277*38fd1498Szrj    '.', '+' or '-').  */
1278*38fd1498Szrj static int
name_p(cpp_reader * pfile,const cpp_string * string)1279*38fd1498Szrj name_p (cpp_reader *pfile, const cpp_string *string)
1280*38fd1498Szrj {
1281*38fd1498Szrj   unsigned int i;
1282*38fd1498Szrj 
1283*38fd1498Szrj   for (i = 0; i < string->len; i++)
1284*38fd1498Szrj     if (!is_idchar (string->text[i]))
1285*38fd1498Szrj       return 0;
1286*38fd1498Szrj 
1287*38fd1498Szrj   return 1;
1288*38fd1498Szrj }
1289*38fd1498Szrj 
1290*38fd1498Szrj /* After parsing an identifier or other sequence, produce a warning about
1291*38fd1498Szrj    sequences not in NFC/NFKC.  */
1292*38fd1498Szrj static void
warn_about_normalization(cpp_reader * pfile,const cpp_token * token,const struct normalize_state * s)1293*38fd1498Szrj warn_about_normalization (cpp_reader *pfile,
1294*38fd1498Szrj 			  const cpp_token *token,
1295*38fd1498Szrj 			  const struct normalize_state *s)
1296*38fd1498Szrj {
1297*38fd1498Szrj   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298*38fd1498Szrj       && !pfile->state.skipping)
1299*38fd1498Szrj     {
1300*38fd1498Szrj       /* Make sure that the token is printed using UCNs, even
1301*38fd1498Szrj 	 if we'd otherwise happily print UTF-8.  */
1302*38fd1498Szrj       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303*38fd1498Szrj       size_t sz;
1304*38fd1498Szrj 
1305*38fd1498Szrj       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306*38fd1498Szrj       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307*38fd1498Szrj 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308*38fd1498Szrj 			       "`%.*s' is not in NFKC", (int) sz, buf);
1309*38fd1498Szrj       else
1310*38fd1498Szrj 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311*38fd1498Szrj 			       "`%.*s' is not in NFC", (int) sz, buf);
1312*38fd1498Szrj       free (buf);
1313*38fd1498Szrj     }
1314*38fd1498Szrj }
1315*38fd1498Szrj 
1316*38fd1498Szrj /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1317*38fd1498Szrj    an identifier.  FIRST is TRUE if this starts an identifier.  */
1318*38fd1498Szrj static bool
forms_identifier_p(cpp_reader * pfile,int first,struct normalize_state * state)1319*38fd1498Szrj forms_identifier_p (cpp_reader *pfile, int first,
1320*38fd1498Szrj 		    struct normalize_state *state)
1321*38fd1498Szrj {
1322*38fd1498Szrj   cpp_buffer *buffer = pfile->buffer;
1323*38fd1498Szrj 
1324*38fd1498Szrj   if (*buffer->cur == '$')
1325*38fd1498Szrj     {
1326*38fd1498Szrj       if (!CPP_OPTION (pfile, dollars_in_ident))
1327*38fd1498Szrj 	return false;
1328*38fd1498Szrj 
1329*38fd1498Szrj       buffer->cur++;
1330*38fd1498Szrj       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1331*38fd1498Szrj 	{
1332*38fd1498Szrj 	  CPP_OPTION (pfile, warn_dollars) = 0;
1333*38fd1498Szrj 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1334*38fd1498Szrj 	}
1335*38fd1498Szrj 
1336*38fd1498Szrj       return true;
1337*38fd1498Szrj     }
1338*38fd1498Szrj 
1339*38fd1498Szrj   /* Is this a syntactically valid UCN?  */
1340*38fd1498Szrj   if (CPP_OPTION (pfile, extended_identifiers)
1341*38fd1498Szrj       && *buffer->cur == '\\'
1342*38fd1498Szrj       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1343*38fd1498Szrj     {
1344*38fd1498Szrj       cppchar_t s;
1345*38fd1498Szrj       buffer->cur += 2;
1346*38fd1498Szrj       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1347*38fd1498Szrj 			  state, &s, NULL, NULL))
1348*38fd1498Szrj 	return true;
1349*38fd1498Szrj       buffer->cur -= 2;
1350*38fd1498Szrj     }
1351*38fd1498Szrj 
1352*38fd1498Szrj   return false;
1353*38fd1498Szrj }
1354*38fd1498Szrj 
1355*38fd1498Szrj /* Helper function to issue error about improper __VA_OPT__ use.  */
1356*38fd1498Szrj static void
maybe_va_opt_error(cpp_reader * pfile)1357*38fd1498Szrj maybe_va_opt_error (cpp_reader *pfile)
1358*38fd1498Szrj {
1359*38fd1498Szrj   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1360*38fd1498Szrj     {
1361*38fd1498Szrj       /* __VA_OPT__ should not be accepted at all, but allow it in
1362*38fd1498Szrj 	 system headers.  */
1363*38fd1498Szrj       if (!cpp_in_system_header (pfile))
1364*38fd1498Szrj 	cpp_error (pfile, CPP_DL_PEDWARN,
1365*38fd1498Szrj 		   "__VA_OPT__ is not available until C++2a");
1366*38fd1498Szrj     }
1367*38fd1498Szrj   else if (!pfile->state.va_args_ok)
1368*38fd1498Szrj     {
1369*38fd1498Szrj       /* __VA_OPT__ should only appear in the replacement list of a
1370*38fd1498Szrj 	 variadic macro.  */
1371*38fd1498Szrj       cpp_error (pfile, CPP_DL_PEDWARN,
1372*38fd1498Szrj 		 "__VA_OPT__ can only appear in the expansion"
1373*38fd1498Szrj 		 " of a C++2a variadic macro");
1374*38fd1498Szrj     }
1375*38fd1498Szrj }
1376*38fd1498Szrj 
1377*38fd1498Szrj /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1378*38fd1498Szrj static cpp_hashnode *
lex_identifier_intern(cpp_reader * pfile,const uchar * base)1379*38fd1498Szrj lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1380*38fd1498Szrj {
1381*38fd1498Szrj   cpp_hashnode *result;
1382*38fd1498Szrj   const uchar *cur;
1383*38fd1498Szrj   unsigned int len;
1384*38fd1498Szrj   unsigned int hash = HT_HASHSTEP (0, *base);
1385*38fd1498Szrj 
1386*38fd1498Szrj   cur = base + 1;
1387*38fd1498Szrj   while (ISIDNUM (*cur))
1388*38fd1498Szrj     {
1389*38fd1498Szrj       hash = HT_HASHSTEP (hash, *cur);
1390*38fd1498Szrj       cur++;
1391*38fd1498Szrj     }
1392*38fd1498Szrj   len = cur - base;
1393*38fd1498Szrj   hash = HT_HASHFINISH (hash, len);
1394*38fd1498Szrj   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1395*38fd1498Szrj 					      base, len, hash, HT_ALLOC));
1396*38fd1498Szrj 
1397*38fd1498Szrj   /* Rarely, identifiers require diagnostics when lexed.  */
1398*38fd1498Szrj   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1399*38fd1498Szrj 			&& !pfile->state.skipping, 0))
1400*38fd1498Szrj     {
1401*38fd1498Szrj       /* It is allowed to poison the same identifier twice.  */
1402*38fd1498Szrj       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1403*38fd1498Szrj 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1404*38fd1498Szrj 		   NODE_NAME (result));
1405*38fd1498Szrj 
1406*38fd1498Szrj       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1407*38fd1498Szrj 	 replacement list of a variadic macro.  */
1408*38fd1498Szrj       if (result == pfile->spec_nodes.n__VA_ARGS__
1409*38fd1498Szrj 	  && !pfile->state.va_args_ok)
1410*38fd1498Szrj 	{
1411*38fd1498Szrj 	  if (CPP_OPTION (pfile, cplusplus))
1412*38fd1498Szrj 	    cpp_error (pfile, CPP_DL_PEDWARN,
1413*38fd1498Szrj 		       "__VA_ARGS__ can only appear in the expansion"
1414*38fd1498Szrj 		       " of a C++11 variadic macro");
1415*38fd1498Szrj 	  else
1416*38fd1498Szrj 	    cpp_error (pfile, CPP_DL_PEDWARN,
1417*38fd1498Szrj 		       "__VA_ARGS__ can only appear in the expansion"
1418*38fd1498Szrj 		       " of a C99 variadic macro");
1419*38fd1498Szrj 	}
1420*38fd1498Szrj 
1421*38fd1498Szrj       if (result == pfile->spec_nodes.n__VA_OPT__)
1422*38fd1498Szrj 	maybe_va_opt_error (pfile);
1423*38fd1498Szrj 
1424*38fd1498Szrj       /* For -Wc++-compat, warn about use of C++ named operators.  */
1425*38fd1498Szrj       if (result->flags & NODE_WARN_OPERATOR)
1426*38fd1498Szrj 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1427*38fd1498Szrj 		     "identifier \"%s\" is a special operator name in C++",
1428*38fd1498Szrj 		     NODE_NAME (result));
1429*38fd1498Szrj     }
1430*38fd1498Szrj 
1431*38fd1498Szrj   return result;
1432*38fd1498Szrj }
1433*38fd1498Szrj 
1434*38fd1498Szrj /* Get the cpp_hashnode of an identifier specified by NAME in
1435*38fd1498Szrj    the current cpp_reader object.  If none is found, NULL is returned.  */
1436*38fd1498Szrj cpp_hashnode *
_cpp_lex_identifier(cpp_reader * pfile,const char * name)1437*38fd1498Szrj _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1438*38fd1498Szrj {
1439*38fd1498Szrj   cpp_hashnode *result;
1440*38fd1498Szrj   result = lex_identifier_intern (pfile, (uchar *) name);
1441*38fd1498Szrj   return result;
1442*38fd1498Szrj }
1443*38fd1498Szrj 
1444*38fd1498Szrj /* Lex an identifier starting at BUFFER->CUR - 1.  */
1445*38fd1498Szrj static cpp_hashnode *
lex_identifier(cpp_reader * pfile,const uchar * base,bool starts_ucn,struct normalize_state * nst,cpp_hashnode ** spelling)1446*38fd1498Szrj lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1447*38fd1498Szrj 		struct normalize_state *nst, cpp_hashnode **spelling)
1448*38fd1498Szrj {
1449*38fd1498Szrj   cpp_hashnode *result;
1450*38fd1498Szrj   const uchar *cur;
1451*38fd1498Szrj   unsigned int len;
1452*38fd1498Szrj   unsigned int hash = HT_HASHSTEP (0, *base);
1453*38fd1498Szrj 
1454*38fd1498Szrj   cur = pfile->buffer->cur;
1455*38fd1498Szrj   if (! starts_ucn)
1456*38fd1498Szrj     {
1457*38fd1498Szrj       while (ISIDNUM (*cur))
1458*38fd1498Szrj 	{
1459*38fd1498Szrj 	  hash = HT_HASHSTEP (hash, *cur);
1460*38fd1498Szrj 	  cur++;
1461*38fd1498Szrj 	}
1462*38fd1498Szrj       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1463*38fd1498Szrj     }
1464*38fd1498Szrj   pfile->buffer->cur = cur;
1465*38fd1498Szrj   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1466*38fd1498Szrj     {
1467*38fd1498Szrj       /* Slower version for identifiers containing UCNs (or $).  */
1468*38fd1498Szrj       do {
1469*38fd1498Szrj 	while (ISIDNUM (*pfile->buffer->cur))
1470*38fd1498Szrj 	  {
1471*38fd1498Szrj 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1472*38fd1498Szrj 	    pfile->buffer->cur++;
1473*38fd1498Szrj 	  }
1474*38fd1498Szrj       } while (forms_identifier_p (pfile, false, nst));
1475*38fd1498Szrj       result = _cpp_interpret_identifier (pfile, base,
1476*38fd1498Szrj 					  pfile->buffer->cur - base);
1477*38fd1498Szrj       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1478*38fd1498Szrj     }
1479*38fd1498Szrj   else
1480*38fd1498Szrj     {
1481*38fd1498Szrj       len = cur - base;
1482*38fd1498Szrj       hash = HT_HASHFINISH (hash, len);
1483*38fd1498Szrj 
1484*38fd1498Szrj       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1485*38fd1498Szrj 						  base, len, hash, HT_ALLOC));
1486*38fd1498Szrj       *spelling = result;
1487*38fd1498Szrj     }
1488*38fd1498Szrj 
1489*38fd1498Szrj   /* Rarely, identifiers require diagnostics when lexed.  */
1490*38fd1498Szrj   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1491*38fd1498Szrj 			&& !pfile->state.skipping, 0))
1492*38fd1498Szrj     {
1493*38fd1498Szrj       /* It is allowed to poison the same identifier twice.  */
1494*38fd1498Szrj       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1495*38fd1498Szrj 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1496*38fd1498Szrj 		   NODE_NAME (result));
1497*38fd1498Szrj 
1498*38fd1498Szrj       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1499*38fd1498Szrj 	 replacement list of a variadic macro.  */
1500*38fd1498Szrj       if (result == pfile->spec_nodes.n__VA_ARGS__
1501*38fd1498Szrj 	  && !pfile->state.va_args_ok)
1502*38fd1498Szrj 	{
1503*38fd1498Szrj 	  if (CPP_OPTION (pfile, cplusplus))
1504*38fd1498Szrj 	    cpp_error (pfile, CPP_DL_PEDWARN,
1505*38fd1498Szrj 		       "__VA_ARGS__ can only appear in the expansion"
1506*38fd1498Szrj 		       " of a C++11 variadic macro");
1507*38fd1498Szrj 	  else
1508*38fd1498Szrj 	    cpp_error (pfile, CPP_DL_PEDWARN,
1509*38fd1498Szrj 		       "__VA_ARGS__ can only appear in the expansion"
1510*38fd1498Szrj 		       " of a C99 variadic macro");
1511*38fd1498Szrj 	}
1512*38fd1498Szrj 
1513*38fd1498Szrj       /* __VA_OPT__ should only appear in the replacement list of a
1514*38fd1498Szrj 	 variadic macro.  */
1515*38fd1498Szrj       if (result == pfile->spec_nodes.n__VA_OPT__)
1516*38fd1498Szrj 	maybe_va_opt_error (pfile);
1517*38fd1498Szrj 
1518*38fd1498Szrj       /* For -Wc++-compat, warn about use of C++ named operators.  */
1519*38fd1498Szrj       if (result->flags & NODE_WARN_OPERATOR)
1520*38fd1498Szrj 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1521*38fd1498Szrj 		     "identifier \"%s\" is a special operator name in C++",
1522*38fd1498Szrj 		     NODE_NAME (result));
1523*38fd1498Szrj     }
1524*38fd1498Szrj 
1525*38fd1498Szrj   return result;
1526*38fd1498Szrj }
1527*38fd1498Szrj 
1528*38fd1498Szrj /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1529*38fd1498Szrj static void
lex_number(cpp_reader * pfile,cpp_string * number,struct normalize_state * nst)1530*38fd1498Szrj lex_number (cpp_reader *pfile, cpp_string *number,
1531*38fd1498Szrj 	    struct normalize_state *nst)
1532*38fd1498Szrj {
1533*38fd1498Szrj   const uchar *cur;
1534*38fd1498Szrj   const uchar *base;
1535*38fd1498Szrj   uchar *dest;
1536*38fd1498Szrj 
1537*38fd1498Szrj   base = pfile->buffer->cur - 1;
1538*38fd1498Szrj   do
1539*38fd1498Szrj     {
1540*38fd1498Szrj       cur = pfile->buffer->cur;
1541*38fd1498Szrj 
1542*38fd1498Szrj       /* N.B. ISIDNUM does not include $.  */
1543*38fd1498Szrj       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1544*38fd1498Szrj 	     || VALID_SIGN (*cur, cur[-1]))
1545*38fd1498Szrj 	{
1546*38fd1498Szrj 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1547*38fd1498Szrj 	  cur++;
1548*38fd1498Szrj 	}
1549*38fd1498Szrj       /* A number can't end with a digit separator.  */
1550*38fd1498Szrj       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1551*38fd1498Szrj 	--cur;
1552*38fd1498Szrj 
1553*38fd1498Szrj       pfile->buffer->cur = cur;
1554*38fd1498Szrj     }
1555*38fd1498Szrj   while (forms_identifier_p (pfile, false, nst));
1556*38fd1498Szrj 
1557*38fd1498Szrj   number->len = cur - base;
1558*38fd1498Szrj   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1559*38fd1498Szrj   memcpy (dest, base, number->len);
1560*38fd1498Szrj   dest[number->len] = '\0';
1561*38fd1498Szrj   number->text = dest;
1562*38fd1498Szrj }
1563*38fd1498Szrj 
1564*38fd1498Szrj /* Create a token of type TYPE with a literal spelling.  */
1565*38fd1498Szrj static void
create_literal(cpp_reader * pfile,cpp_token * token,const uchar * base,unsigned int len,enum cpp_ttype type)1566*38fd1498Szrj create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1567*38fd1498Szrj 		unsigned int len, enum cpp_ttype type)
1568*38fd1498Szrj {
1569*38fd1498Szrj   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1570*38fd1498Szrj 
1571*38fd1498Szrj   memcpy (dest, base, len);
1572*38fd1498Szrj   dest[len] = '\0';
1573*38fd1498Szrj   token->type = type;
1574*38fd1498Szrj   token->val.str.len = len;
1575*38fd1498Szrj   token->val.str.text = dest;
1576*38fd1498Szrj }
1577*38fd1498Szrj 
1578*38fd1498Szrj /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1579*38fd1498Szrj    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1580*38fd1498Szrj 
1581*38fd1498Szrj static void
bufring_append(cpp_reader * pfile,const uchar * base,size_t len,_cpp_buff ** first_buff_p,_cpp_buff ** last_buff_p)1582*38fd1498Szrj bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1583*38fd1498Szrj 		_cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1584*38fd1498Szrj {
1585*38fd1498Szrj   _cpp_buff *first_buff = *first_buff_p;
1586*38fd1498Szrj   _cpp_buff *last_buff = *last_buff_p;
1587*38fd1498Szrj 
1588*38fd1498Szrj   if (first_buff == NULL)
1589*38fd1498Szrj     first_buff = last_buff = _cpp_get_buff (pfile, len);
1590*38fd1498Szrj   else if (len > BUFF_ROOM (last_buff))
1591*38fd1498Szrj     {
1592*38fd1498Szrj       size_t room = BUFF_ROOM (last_buff);
1593*38fd1498Szrj       memcpy (BUFF_FRONT (last_buff), base, room);
1594*38fd1498Szrj       BUFF_FRONT (last_buff) += room;
1595*38fd1498Szrj       base += room;
1596*38fd1498Szrj       len -= room;
1597*38fd1498Szrj       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1598*38fd1498Szrj     }
1599*38fd1498Szrj 
1600*38fd1498Szrj   memcpy (BUFF_FRONT (last_buff), base, len);
1601*38fd1498Szrj   BUFF_FRONT (last_buff) += len;
1602*38fd1498Szrj 
1603*38fd1498Szrj   *first_buff_p = first_buff;
1604*38fd1498Szrj   *last_buff_p = last_buff;
1605*38fd1498Szrj }
1606*38fd1498Szrj 
1607*38fd1498Szrj 
1608*38fd1498Szrj /* Returns true if a macro has been defined.
1609*38fd1498Szrj    This might not work if compile with -save-temps,
1610*38fd1498Szrj    or preprocess separately from compilation.  */
1611*38fd1498Szrj 
1612*38fd1498Szrj static bool
is_macro(cpp_reader * pfile,const uchar * base)1613*38fd1498Szrj is_macro(cpp_reader *pfile, const uchar *base)
1614*38fd1498Szrj {
1615*38fd1498Szrj   const uchar *cur = base;
1616*38fd1498Szrj   if (! ISIDST (*cur))
1617*38fd1498Szrj     return false;
1618*38fd1498Szrj   unsigned int hash = HT_HASHSTEP (0, *cur);
1619*38fd1498Szrj   ++cur;
1620*38fd1498Szrj   while (ISIDNUM (*cur))
1621*38fd1498Szrj     {
1622*38fd1498Szrj       hash = HT_HASHSTEP (hash, *cur);
1623*38fd1498Szrj       ++cur;
1624*38fd1498Szrj     }
1625*38fd1498Szrj   hash = HT_HASHFINISH (hash, cur - base);
1626*38fd1498Szrj 
1627*38fd1498Szrj   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1628*38fd1498Szrj 					base, cur - base, hash, HT_NO_INSERT));
1629*38fd1498Szrj 
1630*38fd1498Szrj   return !result ? false : (result->type == NT_MACRO);
1631*38fd1498Szrj }
1632*38fd1498Szrj 
1633*38fd1498Szrj /* Returns true if a literal suffix does not have the expected form
1634*38fd1498Szrj    and is defined as a macro.  */
1635*38fd1498Szrj 
1636*38fd1498Szrj static bool
is_macro_not_literal_suffix(cpp_reader * pfile,const uchar * base)1637*38fd1498Szrj is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1638*38fd1498Szrj {
1639*38fd1498Szrj   /* User-defined literals outside of namespace std must start with a single
1640*38fd1498Szrj      underscore, so assume anything of that form really is a UDL suffix.
1641*38fd1498Szrj      We don't need to worry about UDLs defined inside namespace std because
1642*38fd1498Szrj      their names are reserved, so cannot be used as macro names in valid
1643*38fd1498Szrj      programs.  */
1644*38fd1498Szrj   if (base[0] == '_' && base[1] != '_')
1645*38fd1498Szrj     return false;
1646*38fd1498Szrj   return is_macro (pfile, base);
1647*38fd1498Szrj }
1648*38fd1498Szrj 
1649*38fd1498Szrj /* Lexes a raw string.  The stored string contains the spelling, including
1650*38fd1498Szrj    double quotes, delimiter string, '(' and ')', any leading
1651*38fd1498Szrj    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1652*38fd1498Szrj    literal, or CPP_OTHER if it was not properly terminated.
1653*38fd1498Szrj 
1654*38fd1498Szrj    The spelling is NUL-terminated, but it is not guaranteed that this
1655*38fd1498Szrj    is the first NUL since embedded NULs are preserved.  */
1656*38fd1498Szrj 
1657*38fd1498Szrj static void
lex_raw_string(cpp_reader * pfile,cpp_token * token,const uchar * base,const uchar * cur)1658*38fd1498Szrj lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1659*38fd1498Szrj 		const uchar *cur)
1660*38fd1498Szrj {
1661*38fd1498Szrj   uchar raw_prefix[17];
1662*38fd1498Szrj   uchar temp_buffer[18];
1663*38fd1498Szrj   const uchar *orig_base;
1664*38fd1498Szrj   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1665*38fd1498Szrj   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1666*38fd1498Szrj   raw_str_phase phase = RAW_STR_PREFIX;
1667*38fd1498Szrj   enum cpp_ttype type;
1668*38fd1498Szrj   size_t total_len = 0;
1669*38fd1498Szrj   /* Index into temp_buffer during phases other than RAW_STR,
1670*38fd1498Szrj      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1671*38fd1498Szrj      be appended to temp_buffer.  */
1672*38fd1498Szrj   size_t temp_buffer_len = 0;
1673*38fd1498Szrj   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1674*38fd1498Szrj   size_t raw_prefix_start;
1675*38fd1498Szrj   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1676*38fd1498Szrj 
1677*38fd1498Szrj   type = (*base == 'L' ? CPP_WSTRING :
1678*38fd1498Szrj 	  *base == 'U' ? CPP_STRING32 :
1679*38fd1498Szrj 	  *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1680*38fd1498Szrj 	  : CPP_STRING);
1681*38fd1498Szrj 
1682*38fd1498Szrj #define BUF_APPEND(STR,LEN)					\
1683*38fd1498Szrj       do {							\
1684*38fd1498Szrj 	bufring_append (pfile, (const uchar *)(STR), (LEN),	\
1685*38fd1498Szrj 			&first_buff, &last_buff);		\
1686*38fd1498Szrj 	total_len += (LEN);					\
1687*38fd1498Szrj 	if (__builtin_expect (temp_buffer_len < 17, 0)		\
1688*38fd1498Szrj 	    && (const uchar *)(STR) != base			\
1689*38fd1498Szrj 	    && (LEN) <= 2)					\
1690*38fd1498Szrj 	  {							\
1691*38fd1498Szrj 	    memcpy (temp_buffer + temp_buffer_len,		\
1692*38fd1498Szrj 		    (const uchar *)(STR), (LEN));		\
1693*38fd1498Szrj 	    temp_buffer_len += (LEN);				\
1694*38fd1498Szrj 	  }							\
1695*38fd1498Szrj       } while (0)
1696*38fd1498Szrj 
1697*38fd1498Szrj   orig_base = base;
1698*38fd1498Szrj   ++cur;
1699*38fd1498Szrj   raw_prefix_start = cur - base;
1700*38fd1498Szrj   for (;;)
1701*38fd1498Szrj     {
1702*38fd1498Szrj       cppchar_t c;
1703*38fd1498Szrj 
1704*38fd1498Szrj       /* If we previously performed any trigraph or line splicing
1705*38fd1498Szrj 	 transformations, undo them in between the opening and closing
1706*38fd1498Szrj 	 double quote.  */
1707*38fd1498Szrj       while (note->pos < cur)
1708*38fd1498Szrj 	++note;
1709*38fd1498Szrj       for (; note->pos == cur; ++note)
1710*38fd1498Szrj 	{
1711*38fd1498Szrj 	  switch (note->type)
1712*38fd1498Szrj 	    {
1713*38fd1498Szrj 	    case '\\':
1714*38fd1498Szrj 	    case ' ':
1715*38fd1498Szrj 	      /* Restore backslash followed by newline.  */
1716*38fd1498Szrj 	      BUF_APPEND (base, cur - base);
1717*38fd1498Szrj 	      base = cur;
1718*38fd1498Szrj 	      BUF_APPEND ("\\", 1);
1719*38fd1498Szrj 	    after_backslash:
1720*38fd1498Szrj 	      if (note->type == ' ')
1721*38fd1498Szrj 		{
1722*38fd1498Szrj 		  /* GNU backslash whitespace newline extension.  FIXME
1723*38fd1498Szrj 		     could be any sequence of non-vertical space.  When we
1724*38fd1498Szrj 		     can properly restore any such sequence, we should mark
1725*38fd1498Szrj 		     this note as handled so _cpp_process_line_notes
1726*38fd1498Szrj 		     doesn't warn.  */
1727*38fd1498Szrj 		  BUF_APPEND (" ", 1);
1728*38fd1498Szrj 		}
1729*38fd1498Szrj 
1730*38fd1498Szrj 	      BUF_APPEND ("\n", 1);
1731*38fd1498Szrj 	      break;
1732*38fd1498Szrj 
1733*38fd1498Szrj 	    case 0:
1734*38fd1498Szrj 	      /* Already handled.  */
1735*38fd1498Szrj 	      break;
1736*38fd1498Szrj 
1737*38fd1498Szrj 	    default:
1738*38fd1498Szrj 	      if (_cpp_trigraph_map[note->type])
1739*38fd1498Szrj 		{
1740*38fd1498Szrj 		  /* Don't warn about this trigraph in
1741*38fd1498Szrj 		     _cpp_process_line_notes, since trigraphs show up as
1742*38fd1498Szrj 		     trigraphs in raw strings.  */
1743*38fd1498Szrj 		  uchar type = note->type;
1744*38fd1498Szrj 		  note->type = 0;
1745*38fd1498Szrj 
1746*38fd1498Szrj 		  if (!CPP_OPTION (pfile, trigraphs))
1747*38fd1498Szrj 		    /* If we didn't convert the trigraph in the first
1748*38fd1498Szrj 		       place, don't do anything now either.  */
1749*38fd1498Szrj 		    break;
1750*38fd1498Szrj 
1751*38fd1498Szrj 		  BUF_APPEND (base, cur - base);
1752*38fd1498Szrj 		  base = cur;
1753*38fd1498Szrj 		  BUF_APPEND ("??", 2);
1754*38fd1498Szrj 
1755*38fd1498Szrj 		  /* ??/ followed by newline gets two line notes, one for
1756*38fd1498Szrj 		     the trigraph and one for the backslash/newline.  */
1757*38fd1498Szrj 		  if (type == '/' && note[1].pos == cur)
1758*38fd1498Szrj 		    {
1759*38fd1498Szrj 		      if (note[1].type != '\\'
1760*38fd1498Szrj 			  && note[1].type != ' ')
1761*38fd1498Szrj 			abort ();
1762*38fd1498Szrj 		      BUF_APPEND ("/", 1);
1763*38fd1498Szrj 		      ++note;
1764*38fd1498Szrj 		      goto after_backslash;
1765*38fd1498Szrj 		    }
1766*38fd1498Szrj 		  else
1767*38fd1498Szrj 		    {
1768*38fd1498Szrj 		      /* Skip the replacement character.  */
1769*38fd1498Szrj 		      base = ++cur;
1770*38fd1498Szrj 		      BUF_APPEND (&type, 1);
1771*38fd1498Szrj 		      c = type;
1772*38fd1498Szrj 		      goto check_c;
1773*38fd1498Szrj 		    }
1774*38fd1498Szrj 		}
1775*38fd1498Szrj 	      else
1776*38fd1498Szrj 		abort ();
1777*38fd1498Szrj 	      break;
1778*38fd1498Szrj 	    }
1779*38fd1498Szrj 	}
1780*38fd1498Szrj       c = *cur++;
1781*38fd1498Szrj       if (__builtin_expect (temp_buffer_len < 17, 0))
1782*38fd1498Szrj 	temp_buffer[temp_buffer_len++] = c;
1783*38fd1498Szrj 
1784*38fd1498Szrj      check_c:
1785*38fd1498Szrj       if (phase == RAW_STR_PREFIX)
1786*38fd1498Szrj 	{
1787*38fd1498Szrj 	  while (raw_prefix_len < temp_buffer_len)
1788*38fd1498Szrj 	    {
1789*38fd1498Szrj 	      raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1790*38fd1498Szrj 	      switch (raw_prefix[raw_prefix_len])
1791*38fd1498Szrj 		{
1792*38fd1498Szrj 		case ' ': case '(': case ')': case '\\': case '\t':
1793*38fd1498Szrj 		case '\v': case '\f': case '\n': default:
1794*38fd1498Szrj 		  break;
1795*38fd1498Szrj 		/* Basic source charset except the above chars.  */
1796*38fd1498Szrj 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1797*38fd1498Szrj 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1798*38fd1498Szrj 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1799*38fd1498Szrj 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1800*38fd1498Szrj 		case 'y': case 'z':
1801*38fd1498Szrj 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1802*38fd1498Szrj 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1803*38fd1498Szrj 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1804*38fd1498Szrj 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1805*38fd1498Szrj 		case 'Y': case 'Z':
1806*38fd1498Szrj 		case '0': case '1': case '2': case '3': case '4': case '5':
1807*38fd1498Szrj 		case '6': case '7': case '8': case '9':
1808*38fd1498Szrj 		case '_': case '{': case '}': case '#': case '[': case ']':
1809*38fd1498Szrj 		case '<': case '>': case '%': case ':': case ';': case '.':
1810*38fd1498Szrj 		case '?': case '*': case '+': case '-': case '/': case '^':
1811*38fd1498Szrj 		case '&': case '|': case '~': case '!': case '=': case ',':
1812*38fd1498Szrj 		case '"': case '\'':
1813*38fd1498Szrj 		  if (raw_prefix_len < 16)
1814*38fd1498Szrj 		    {
1815*38fd1498Szrj 		      raw_prefix_len++;
1816*38fd1498Szrj 		      continue;
1817*38fd1498Szrj 		    }
1818*38fd1498Szrj 		  break;
1819*38fd1498Szrj 		}
1820*38fd1498Szrj 
1821*38fd1498Szrj 	      if (raw_prefix[raw_prefix_len] != '(')
1822*38fd1498Szrj 		{
1823*38fd1498Szrj 		  int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1824*38fd1498Szrj 		  if (raw_prefix_len == 16)
1825*38fd1498Szrj 		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1826*38fd1498Szrj 					 col, "raw string delimiter longer "
1827*38fd1498Szrj 					      "than 16 characters");
1828*38fd1498Szrj 		  else if (raw_prefix[raw_prefix_len] == '\n')
1829*38fd1498Szrj 		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1830*38fd1498Szrj 					 col, "invalid new-line in raw "
1831*38fd1498Szrj 					      "string delimiter");
1832*38fd1498Szrj 		  else
1833*38fd1498Szrj 		    cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1834*38fd1498Szrj 					 col, "invalid character '%c' in "
1835*38fd1498Szrj 					      "raw string delimiter",
1836*38fd1498Szrj 					 (int) raw_prefix[raw_prefix_len]);
1837*38fd1498Szrj 		  pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1838*38fd1498Szrj 		  create_literal (pfile, token, orig_base,
1839*38fd1498Szrj 				  raw_prefix_start - 1, CPP_OTHER);
1840*38fd1498Szrj 		  if (first_buff)
1841*38fd1498Szrj 		    _cpp_release_buff (pfile, first_buff);
1842*38fd1498Szrj 		  return;
1843*38fd1498Szrj 		}
1844*38fd1498Szrj 	      raw_prefix[raw_prefix_len] = '"';
1845*38fd1498Szrj 	      phase = RAW_STR;
1846*38fd1498Szrj 	      /* Nothing should be appended to temp_buffer during
1847*38fd1498Szrj 		 RAW_STR phase.  */
1848*38fd1498Szrj 	      temp_buffer_len = 17;
1849*38fd1498Szrj 	      break;
1850*38fd1498Szrj 	    }
1851*38fd1498Szrj 	  continue;
1852*38fd1498Szrj 	}
1853*38fd1498Szrj       else if (phase == RAW_STR_SUFFIX)
1854*38fd1498Szrj 	{
1855*38fd1498Szrj 	  while (raw_suffix_len <= raw_prefix_len
1856*38fd1498Szrj 		 && raw_suffix_len < temp_buffer_len
1857*38fd1498Szrj 		 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1858*38fd1498Szrj 	    raw_suffix_len++;
1859*38fd1498Szrj 	  if (raw_suffix_len > raw_prefix_len)
1860*38fd1498Szrj 	    break;
1861*38fd1498Szrj 	  if (raw_suffix_len == temp_buffer_len)
1862*38fd1498Szrj 	    continue;
1863*38fd1498Szrj 	  phase = RAW_STR;
1864*38fd1498Szrj 	  /* Nothing should be appended to temp_buffer during
1865*38fd1498Szrj 	     RAW_STR phase.  */
1866*38fd1498Szrj 	  temp_buffer_len = 17;
1867*38fd1498Szrj 	}
1868*38fd1498Szrj       if (c == ')')
1869*38fd1498Szrj 	{
1870*38fd1498Szrj 	  phase = RAW_STR_SUFFIX;
1871*38fd1498Szrj 	  raw_suffix_len = 0;
1872*38fd1498Szrj 	  temp_buffer_len = 0;
1873*38fd1498Szrj 	}
1874*38fd1498Szrj       else if (c == '\n')
1875*38fd1498Szrj 	{
1876*38fd1498Szrj 	  if (pfile->state.in_directive
1877*38fd1498Szrj 	      || (pfile->state.parsing_args
1878*38fd1498Szrj 		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
1879*38fd1498Szrj 	    {
1880*38fd1498Szrj 	      cur--;
1881*38fd1498Szrj 	      type = CPP_OTHER;
1882*38fd1498Szrj 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1883*38fd1498Szrj 				   "unterminated raw string");
1884*38fd1498Szrj 	      break;
1885*38fd1498Szrj 	    }
1886*38fd1498Szrj 
1887*38fd1498Szrj 	  BUF_APPEND (base, cur - base);
1888*38fd1498Szrj 
1889*38fd1498Szrj 	  if (pfile->buffer->cur < pfile->buffer->rlimit)
1890*38fd1498Szrj 	    CPP_INCREMENT_LINE (pfile, 0);
1891*38fd1498Szrj 	  pfile->buffer->need_line = true;
1892*38fd1498Szrj 
1893*38fd1498Szrj 	  pfile->buffer->cur = cur-1;
1894*38fd1498Szrj 	  _cpp_process_line_notes (pfile, false);
1895*38fd1498Szrj 	  if (!_cpp_get_fresh_line (pfile))
1896*38fd1498Szrj 	    {
1897*38fd1498Szrj 	      source_location src_loc = token->src_loc;
1898*38fd1498Szrj 	      token->type = CPP_EOF;
1899*38fd1498Szrj 	      /* Tell the compiler the line number of the EOF token.  */
1900*38fd1498Szrj 	      token->src_loc = pfile->line_table->highest_line;
1901*38fd1498Szrj 	      token->flags = BOL;
1902*38fd1498Szrj 	      if (first_buff != NULL)
1903*38fd1498Szrj 		_cpp_release_buff (pfile, first_buff);
1904*38fd1498Szrj 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1905*38fd1498Szrj 				   "unterminated raw string");
1906*38fd1498Szrj 	      return;
1907*38fd1498Szrj 	    }
1908*38fd1498Szrj 
1909*38fd1498Szrj 	  cur = base = pfile->buffer->cur;
1910*38fd1498Szrj 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
1911*38fd1498Szrj 	}
1912*38fd1498Szrj     }
1913*38fd1498Szrj 
1914*38fd1498Szrj   if (CPP_OPTION (pfile, user_literals))
1915*38fd1498Szrj     {
1916*38fd1498Szrj       /* If a string format macro, say from inttypes.h, is placed touching
1917*38fd1498Szrj 	 a string literal it could be parsed as a C++11 user-defined string
1918*38fd1498Szrj 	 literal thus breaking the program.  */
1919*38fd1498Szrj       if (is_macro_not_literal_suffix (pfile, cur))
1920*38fd1498Szrj 	{
1921*38fd1498Szrj 	  /* Raise a warning, but do not consume subsequent tokens.  */
1922*38fd1498Szrj 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1923*38fd1498Szrj 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1924*38fd1498Szrj 				   token->src_loc, 0,
1925*38fd1498Szrj 				   "invalid suffix on literal; C++11 requires "
1926*38fd1498Szrj 				   "a space between literal and string macro");
1927*38fd1498Szrj 	}
1928*38fd1498Szrj       /* Grab user defined literal suffix.  */
1929*38fd1498Szrj       else if (ISIDST (*cur))
1930*38fd1498Szrj 	{
1931*38fd1498Szrj 	  type = cpp_userdef_string_add_type (type);
1932*38fd1498Szrj 	  ++cur;
1933*38fd1498Szrj 
1934*38fd1498Szrj 	  while (ISIDNUM (*cur))
1935*38fd1498Szrj 	    ++cur;
1936*38fd1498Szrj 	}
1937*38fd1498Szrj     }
1938*38fd1498Szrj 
1939*38fd1498Szrj   pfile->buffer->cur = cur;
1940*38fd1498Szrj   if (first_buff == NULL)
1941*38fd1498Szrj     create_literal (pfile, token, base, cur - base, type);
1942*38fd1498Szrj   else
1943*38fd1498Szrj     {
1944*38fd1498Szrj       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1945*38fd1498Szrj 
1946*38fd1498Szrj       token->type = type;
1947*38fd1498Szrj       token->val.str.len = total_len + (cur - base);
1948*38fd1498Szrj       token->val.str.text = dest;
1949*38fd1498Szrj       last_buff = first_buff;
1950*38fd1498Szrj       while (last_buff != NULL)
1951*38fd1498Szrj 	{
1952*38fd1498Szrj 	  memcpy (dest, last_buff->base,
1953*38fd1498Szrj 		  BUFF_FRONT (last_buff) - last_buff->base);
1954*38fd1498Szrj 	  dest += BUFF_FRONT (last_buff) - last_buff->base;
1955*38fd1498Szrj 	  last_buff = last_buff->next;
1956*38fd1498Szrj 	}
1957*38fd1498Szrj       _cpp_release_buff (pfile, first_buff);
1958*38fd1498Szrj       memcpy (dest, base, cur - base);
1959*38fd1498Szrj       dest[cur - base] = '\0';
1960*38fd1498Szrj     }
1961*38fd1498Szrj }
1962*38fd1498Szrj 
1963*38fd1498Szrj /* Lexes a string, character constant, or angle-bracketed header file
1964*38fd1498Szrj    name.  The stored string contains the spelling, including opening
1965*38fd1498Szrj    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1966*38fd1498Szrj    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1967*38fd1498Szrj    if it was not properly terminated, or CPP_LESS for an unterminated
1968*38fd1498Szrj    header name which must be relexed as normal tokens.
1969*38fd1498Szrj 
1970*38fd1498Szrj    The spelling is NUL-terminated, but it is not guaranteed that this
1971*38fd1498Szrj    is the first NUL since embedded NULs are preserved.  */
1972*38fd1498Szrj static void
lex_string(cpp_reader * pfile,cpp_token * token,const uchar * base)1973*38fd1498Szrj lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1974*38fd1498Szrj {
1975*38fd1498Szrj   bool saw_NUL = false;
1976*38fd1498Szrj   const uchar *cur;
1977*38fd1498Szrj   cppchar_t terminator;
1978*38fd1498Szrj   enum cpp_ttype type;
1979*38fd1498Szrj 
1980*38fd1498Szrj   cur = base;
1981*38fd1498Szrj   terminator = *cur++;
1982*38fd1498Szrj   if (terminator == 'L' || terminator == 'U')
1983*38fd1498Szrj     terminator = *cur++;
1984*38fd1498Szrj   else if (terminator == 'u')
1985*38fd1498Szrj     {
1986*38fd1498Szrj       terminator = *cur++;
1987*38fd1498Szrj       if (terminator == '8')
1988*38fd1498Szrj 	terminator = *cur++;
1989*38fd1498Szrj     }
1990*38fd1498Szrj   if (terminator == 'R')
1991*38fd1498Szrj     {
1992*38fd1498Szrj       lex_raw_string (pfile, token, base, cur);
1993*38fd1498Szrj       return;
1994*38fd1498Szrj     }
1995*38fd1498Szrj   if (terminator == '"')
1996*38fd1498Szrj     type = (*base == 'L' ? CPP_WSTRING :
1997*38fd1498Szrj 	    *base == 'U' ? CPP_STRING32 :
1998*38fd1498Szrj 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1999*38fd1498Szrj 			 : CPP_STRING);
2000*38fd1498Szrj   else if (terminator == '\'')
2001*38fd1498Szrj     type = (*base == 'L' ? CPP_WCHAR :
2002*38fd1498Szrj 	    *base == 'U' ? CPP_CHAR32 :
2003*38fd1498Szrj 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2004*38fd1498Szrj 			 : CPP_CHAR);
2005*38fd1498Szrj   else
2006*38fd1498Szrj     terminator = '>', type = CPP_HEADER_NAME;
2007*38fd1498Szrj 
2008*38fd1498Szrj   for (;;)
2009*38fd1498Szrj     {
2010*38fd1498Szrj       cppchar_t c = *cur++;
2011*38fd1498Szrj 
2012*38fd1498Szrj       /* In #include-style directives, terminators are not escapable.  */
2013*38fd1498Szrj       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2014*38fd1498Szrj 	cur++;
2015*38fd1498Szrj       else if (c == terminator)
2016*38fd1498Szrj 	break;
2017*38fd1498Szrj       else if (c == '\n')
2018*38fd1498Szrj 	{
2019*38fd1498Szrj 	  cur--;
2020*38fd1498Szrj 	  /* Unmatched quotes always yield undefined behavior, but
2021*38fd1498Szrj 	     greedy lexing means that what appears to be an unterminated
2022*38fd1498Szrj 	     header name may actually be a legitimate sequence of tokens.  */
2023*38fd1498Szrj 	  if (terminator == '>')
2024*38fd1498Szrj 	    {
2025*38fd1498Szrj 	      token->type = CPP_LESS;
2026*38fd1498Szrj 	      return;
2027*38fd1498Szrj 	    }
2028*38fd1498Szrj 	  type = CPP_OTHER;
2029*38fd1498Szrj 	  break;
2030*38fd1498Szrj 	}
2031*38fd1498Szrj       else if (c == '\0')
2032*38fd1498Szrj 	saw_NUL = true;
2033*38fd1498Szrj     }
2034*38fd1498Szrj 
2035*38fd1498Szrj   if (saw_NUL && !pfile->state.skipping)
2036*38fd1498Szrj     cpp_error (pfile, CPP_DL_WARNING,
2037*38fd1498Szrj 	       "null character(s) preserved in literal");
2038*38fd1498Szrj 
2039*38fd1498Szrj   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2040*38fd1498Szrj     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2041*38fd1498Szrj 	       (int) terminator);
2042*38fd1498Szrj 
2043*38fd1498Szrj   if (CPP_OPTION (pfile, user_literals))
2044*38fd1498Szrj     {
2045*38fd1498Szrj       /* If a string format macro, say from inttypes.h, is placed touching
2046*38fd1498Szrj 	 a string literal it could be parsed as a C++11 user-defined string
2047*38fd1498Szrj 	 literal thus breaking the program.  */
2048*38fd1498Szrj       if (is_macro_not_literal_suffix (pfile, cur))
2049*38fd1498Szrj 	{
2050*38fd1498Szrj 	  /* Raise a warning, but do not consume subsequent tokens.  */
2051*38fd1498Szrj 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2052*38fd1498Szrj 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2053*38fd1498Szrj 				   token->src_loc, 0,
2054*38fd1498Szrj 				   "invalid suffix on literal; C++11 requires "
2055*38fd1498Szrj 				   "a space between literal and string macro");
2056*38fd1498Szrj 	}
2057*38fd1498Szrj       /* Grab user defined literal suffix.  */
2058*38fd1498Szrj       else if (ISIDST (*cur))
2059*38fd1498Szrj 	{
2060*38fd1498Szrj 	  type = cpp_userdef_char_add_type (type);
2061*38fd1498Szrj 	  type = cpp_userdef_string_add_type (type);
2062*38fd1498Szrj           ++cur;
2063*38fd1498Szrj 
2064*38fd1498Szrj 	  while (ISIDNUM (*cur))
2065*38fd1498Szrj 	    ++cur;
2066*38fd1498Szrj 	}
2067*38fd1498Szrj     }
2068*38fd1498Szrj   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2069*38fd1498Szrj 	   && is_macro (pfile, cur)
2070*38fd1498Szrj 	   && !pfile->state.skipping)
2071*38fd1498Szrj     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2072*38fd1498Szrj 			   token->src_loc, 0, "C++11 requires a space "
2073*38fd1498Szrj 			   "between string literal and macro");
2074*38fd1498Szrj 
2075*38fd1498Szrj   pfile->buffer->cur = cur;
2076*38fd1498Szrj   create_literal (pfile, token, base, cur - base, type);
2077*38fd1498Szrj }
2078*38fd1498Szrj 
2079*38fd1498Szrj /* Return the comment table. The client may not make any assumption
2080*38fd1498Szrj    about the ordering of the table.  */
2081*38fd1498Szrj cpp_comment_table *
cpp_get_comments(cpp_reader * pfile)2082*38fd1498Szrj cpp_get_comments (cpp_reader *pfile)
2083*38fd1498Szrj {
2084*38fd1498Szrj   return &pfile->comments;
2085*38fd1498Szrj }
2086*38fd1498Szrj 
2087*38fd1498Szrj /* Append a comment to the end of the comment table. */
2088*38fd1498Szrj static void
store_comment(cpp_reader * pfile,cpp_token * token)2089*38fd1498Szrj store_comment (cpp_reader *pfile, cpp_token *token)
2090*38fd1498Szrj {
2091*38fd1498Szrj   int len;
2092*38fd1498Szrj 
2093*38fd1498Szrj   if (pfile->comments.allocated == 0)
2094*38fd1498Szrj     {
2095*38fd1498Szrj       pfile->comments.allocated = 256;
2096*38fd1498Szrj       pfile->comments.entries = (cpp_comment *) xmalloc
2097*38fd1498Szrj 	(pfile->comments.allocated * sizeof (cpp_comment));
2098*38fd1498Szrj     }
2099*38fd1498Szrj 
2100*38fd1498Szrj   if (pfile->comments.count == pfile->comments.allocated)
2101*38fd1498Szrj     {
2102*38fd1498Szrj       pfile->comments.allocated *= 2;
2103*38fd1498Szrj       pfile->comments.entries = (cpp_comment *) xrealloc
2104*38fd1498Szrj 	(pfile->comments.entries,
2105*38fd1498Szrj 	 pfile->comments.allocated * sizeof (cpp_comment));
2106*38fd1498Szrj     }
2107*38fd1498Szrj 
2108*38fd1498Szrj   len = token->val.str.len;
2109*38fd1498Szrj 
2110*38fd1498Szrj   /* Copy comment. Note, token may not be NULL terminated. */
2111*38fd1498Szrj   pfile->comments.entries[pfile->comments.count].comment =
2112*38fd1498Szrj     (char *) xmalloc (sizeof (char) * (len + 1));
2113*38fd1498Szrj   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2114*38fd1498Szrj 	  token->val.str.text, len);
2115*38fd1498Szrj   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2116*38fd1498Szrj 
2117*38fd1498Szrj   /* Set source location. */
2118*38fd1498Szrj   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2119*38fd1498Szrj 
2120*38fd1498Szrj   /* Increment the count of entries in the comment table. */
2121*38fd1498Szrj   pfile->comments.count++;
2122*38fd1498Szrj }
2123*38fd1498Szrj 
2124*38fd1498Szrj /* The stored comment includes the comment start and any terminator.  */
2125*38fd1498Szrj static void
save_comment(cpp_reader * pfile,cpp_token * token,const unsigned char * from,cppchar_t type)2126*38fd1498Szrj save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2127*38fd1498Szrj 	      cppchar_t type)
2128*38fd1498Szrj {
2129*38fd1498Szrj   unsigned char *buffer;
2130*38fd1498Szrj   unsigned int len, clen, i;
2131*38fd1498Szrj 
2132*38fd1498Szrj   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2133*38fd1498Szrj 
2134*38fd1498Szrj   /* C++ comments probably (not definitely) have moved past a new
2135*38fd1498Szrj      line, which we don't want to save in the comment.  */
2136*38fd1498Szrj   if (is_vspace (pfile->buffer->cur[-1]))
2137*38fd1498Szrj     len--;
2138*38fd1498Szrj 
2139*38fd1498Szrj   /* If we are currently in a directive or in argument parsing, then
2140*38fd1498Szrj      we need to store all C++ comments as C comments internally, and
2141*38fd1498Szrj      so we need to allocate a little extra space in that case.
2142*38fd1498Szrj 
2143*38fd1498Szrj      Note that the only time we encounter a directive here is
2144*38fd1498Szrj      when we are saving comments in a "#define".  */
2145*38fd1498Szrj   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2146*38fd1498Szrj 	  && type == '/') ? len + 2 : len;
2147*38fd1498Szrj 
2148*38fd1498Szrj   buffer = _cpp_unaligned_alloc (pfile, clen);
2149*38fd1498Szrj 
2150*38fd1498Szrj   token->type = CPP_COMMENT;
2151*38fd1498Szrj   token->val.str.len = clen;
2152*38fd1498Szrj   token->val.str.text = buffer;
2153*38fd1498Szrj 
2154*38fd1498Szrj   buffer[0] = '/';
2155*38fd1498Szrj   memcpy (buffer + 1, from, len - 1);
2156*38fd1498Szrj 
2157*38fd1498Szrj   /* Finish conversion to a C comment, if necessary.  */
2158*38fd1498Szrj   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2159*38fd1498Szrj     {
2160*38fd1498Szrj       buffer[1] = '*';
2161*38fd1498Szrj       buffer[clen - 2] = '*';
2162*38fd1498Szrj       buffer[clen - 1] = '/';
2163*38fd1498Szrj       /* As there can be in a C++ comments illegal sequences for C comments
2164*38fd1498Szrj          we need to filter them out.  */
2165*38fd1498Szrj       for (i = 2; i < (clen - 2); i++)
2166*38fd1498Szrj         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2167*38fd1498Szrj           buffer[i] = '|';
2168*38fd1498Szrj     }
2169*38fd1498Szrj 
2170*38fd1498Szrj   /* Finally store this comment for use by clients of libcpp. */
2171*38fd1498Szrj   store_comment (pfile, token);
2172*38fd1498Szrj }
2173*38fd1498Szrj 
2174*38fd1498Szrj /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2175*38fd1498Szrj    comment.  */
2176*38fd1498Szrj 
2177*38fd1498Szrj static bool
fallthrough_comment_p(cpp_reader * pfile,const unsigned char * comment_start)2178*38fd1498Szrj fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2179*38fd1498Szrj {
2180*38fd1498Szrj   const unsigned char *from = comment_start + 1;
2181*38fd1498Szrj 
2182*38fd1498Szrj   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2183*38fd1498Szrj     {
2184*38fd1498Szrj       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2185*38fd1498Szrj 	 don't recognize any comments.  The latter only checks attributes,
2186*38fd1498Szrj 	 the former doesn't warn.  */
2187*38fd1498Szrj     case 0:
2188*38fd1498Szrj     default:
2189*38fd1498Szrj       return false;
2190*38fd1498Szrj       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2191*38fd1498Szrj 	 content it has.  */
2192*38fd1498Szrj     case 1:
2193*38fd1498Szrj       return true;
2194*38fd1498Szrj     case 2:
2195*38fd1498Szrj       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2196*38fd1498Szrj 	 .*falls?[ \t-]*thr(u|ough).* regex.  */
2197*38fd1498Szrj       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2198*38fd1498Szrj 	   from++)
2199*38fd1498Szrj 	{
2200*38fd1498Szrj 	  /* Is there anything like strpbrk with upper boundary, or
2201*38fd1498Szrj 	     memchr looking for 2 characters rather than just one?  */
2202*38fd1498Szrj 	  if (from[0] != 'f' && from[0] != 'F')
2203*38fd1498Szrj 	    continue;
2204*38fd1498Szrj 	  if (from[1] != 'a' && from[1] != 'A')
2205*38fd1498Szrj 	    continue;
2206*38fd1498Szrj 	  if (from[2] != 'l' && from[2] != 'L')
2207*38fd1498Szrj 	    continue;
2208*38fd1498Szrj 	  if (from[3] != 'l' && from[3] != 'L')
2209*38fd1498Szrj 	    continue;
2210*38fd1498Szrj 	  from += sizeof "fall" - 1;
2211*38fd1498Szrj 	  if (from[0] == 's' || from[0] == 'S')
2212*38fd1498Szrj 	    from++;
2213*38fd1498Szrj 	  while (*from == ' ' || *from == '\t' || *from == '-')
2214*38fd1498Szrj 	    from++;
2215*38fd1498Szrj 	  if (from[0] != 't' && from[0] != 'T')
2216*38fd1498Szrj 	    continue;
2217*38fd1498Szrj 	  if (from[1] != 'h' && from[1] != 'H')
2218*38fd1498Szrj 	    continue;
2219*38fd1498Szrj 	  if (from[2] != 'r' && from[2] != 'R')
2220*38fd1498Szrj 	    continue;
2221*38fd1498Szrj 	  if (from[3] == 'u' || from[3] == 'U')
2222*38fd1498Szrj 	    return true;
2223*38fd1498Szrj 	  if (from[3] != 'o' && from[3] != 'O')
2224*38fd1498Szrj 	    continue;
2225*38fd1498Szrj 	  if (from[4] != 'u' && from[4] != 'U')
2226*38fd1498Szrj 	    continue;
2227*38fd1498Szrj 	  if (from[5] != 'g' && from[5] != 'G')
2228*38fd1498Szrj 	    continue;
2229*38fd1498Szrj 	  if (from[6] != 'h' && from[6] != 'H')
2230*38fd1498Szrj 	    continue;
2231*38fd1498Szrj 	  return true;
2232*38fd1498Szrj 	}
2233*38fd1498Szrj       return false;
2234*38fd1498Szrj     case 3:
2235*38fd1498Szrj     case 4:
2236*38fd1498Szrj       break;
2237*38fd1498Szrj     }
2238*38fd1498Szrj 
2239*38fd1498Szrj   /* Whole comment contents:
2240*38fd1498Szrj      -fallthrough
2241*38fd1498Szrj      @fallthrough@
2242*38fd1498Szrj    */
2243*38fd1498Szrj   if (*from == '-' || *from == '@')
2244*38fd1498Szrj     {
2245*38fd1498Szrj       size_t len = sizeof "fallthrough" - 1;
2246*38fd1498Szrj       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2247*38fd1498Szrj 	return false;
2248*38fd1498Szrj       if (memcmp (from + 1, "fallthrough", len))
2249*38fd1498Szrj 	return false;
2250*38fd1498Szrj       if (*from == '@')
2251*38fd1498Szrj 	{
2252*38fd1498Szrj 	  if (from[len + 1] != '@')
2253*38fd1498Szrj 	    return false;
2254*38fd1498Szrj 	  len++;
2255*38fd1498Szrj 	}
2256*38fd1498Szrj       from += 1 + len;
2257*38fd1498Szrj     }
2258*38fd1498Szrj   /* Whole comment contents (regex):
2259*38fd1498Szrj      lint -fallthrough[ \t]*
2260*38fd1498Szrj    */
2261*38fd1498Szrj   else if (*from == 'l')
2262*38fd1498Szrj     {
2263*38fd1498Szrj       size_t len = sizeof "int -fallthrough" - 1;
2264*38fd1498Szrj       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2265*38fd1498Szrj 	return false;
2266*38fd1498Szrj       if (memcmp (from + 1, "int -fallthrough", len))
2267*38fd1498Szrj 	return false;
2268*38fd1498Szrj       from += 1 + len;
2269*38fd1498Szrj       while (*from == ' ' || *from == '\t')
2270*38fd1498Szrj 	from++;
2271*38fd1498Szrj     }
2272*38fd1498Szrj   /* Whole comment contents (regex):
2273*38fd1498Szrj      [ \t]*FALLTHR(U|OUGH)[ \t]*
2274*38fd1498Szrj    */
2275*38fd1498Szrj   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2276*38fd1498Szrj     {
2277*38fd1498Szrj       while (*from == ' ' || *from == '\t')
2278*38fd1498Szrj 	from++;
2279*38fd1498Szrj       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2280*38fd1498Szrj 	return false;
2281*38fd1498Szrj       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2282*38fd1498Szrj 	return false;
2283*38fd1498Szrj       from += sizeof "FALLTHR" - 1;
2284*38fd1498Szrj       if (*from == 'U')
2285*38fd1498Szrj 	from++;
2286*38fd1498Szrj       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2287*38fd1498Szrj 	return false;
2288*38fd1498Szrj       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2289*38fd1498Szrj 	return false;
2290*38fd1498Szrj       else
2291*38fd1498Szrj 	from += sizeof "OUGH" - 1;
2292*38fd1498Szrj       while (*from == ' ' || *from == '\t')
2293*38fd1498Szrj 	from++;
2294*38fd1498Szrj     }
2295*38fd1498Szrj   /* Whole comment contents (regex):
2296*38fd1498Szrj      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2297*38fd1498Szrj      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2298*38fd1498Szrj      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2299*38fd1498Szrj    */
2300*38fd1498Szrj   else
2301*38fd1498Szrj     {
2302*38fd1498Szrj       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2303*38fd1498Szrj 	from++;
2304*38fd1498Szrj       unsigned char f = *from;
2305*38fd1498Szrj       bool all_upper = false;
2306*38fd1498Szrj       if (f == 'E' || f == 'e')
2307*38fd1498Szrj 	{
2308*38fd1498Szrj 	  if ((size_t) (pfile->buffer->cur - from)
2309*38fd1498Szrj 	      < sizeof "else fallthru" - 1)
2310*38fd1498Szrj 	    return false;
2311*38fd1498Szrj 	  if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2312*38fd1498Szrj 	    all_upper = true;
2313*38fd1498Szrj 	  else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2314*38fd1498Szrj 	    return false;
2315*38fd1498Szrj 	  from += sizeof "else" - 1;
2316*38fd1498Szrj 	  if (*from == ',')
2317*38fd1498Szrj 	    from++;
2318*38fd1498Szrj 	  if (*from != ' ')
2319*38fd1498Szrj 	    return false;
2320*38fd1498Szrj 	  from++;
2321*38fd1498Szrj 	  if (all_upper && *from == 'f')
2322*38fd1498Szrj 	    return false;
2323*38fd1498Szrj 	  if (f == 'e' && *from == 'F')
2324*38fd1498Szrj 	    return false;
2325*38fd1498Szrj 	  f = *from;
2326*38fd1498Szrj 	}
2327*38fd1498Szrj       else if (f == 'I' || f == 'i')
2328*38fd1498Szrj 	{
2329*38fd1498Szrj 	  if ((size_t) (pfile->buffer->cur - from)
2330*38fd1498Szrj 	      < sizeof "intentional fallthru" - 1)
2331*38fd1498Szrj 	    return false;
2332*38fd1498Szrj 	  if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2333*38fd1498Szrj 				  sizeof "NTENTIONAL" - 1) == 0)
2334*38fd1498Szrj 	    all_upper = true;
2335*38fd1498Szrj 	  else if (memcmp (from + 1, "ntentional",
2336*38fd1498Szrj 			   sizeof "ntentional" - 1))
2337*38fd1498Szrj 	    return false;
2338*38fd1498Szrj 	  from += sizeof "intentional" - 1;
2339*38fd1498Szrj 	  if (*from == ' ')
2340*38fd1498Szrj 	    {
2341*38fd1498Szrj 	      from++;
2342*38fd1498Szrj 	      if (all_upper && *from == 'f')
2343*38fd1498Szrj 		return false;
2344*38fd1498Szrj 	    }
2345*38fd1498Szrj 	  else if (all_upper)
2346*38fd1498Szrj 	    {
2347*38fd1498Szrj 	      if (memcmp (from, "LY F", sizeof "LY F" - 1))
2348*38fd1498Szrj 		return false;
2349*38fd1498Szrj 	      from += sizeof "LY " - 1;
2350*38fd1498Szrj 	    }
2351*38fd1498Szrj 	  else
2352*38fd1498Szrj 	    {
2353*38fd1498Szrj 	      if (memcmp (from, "ly ", sizeof "ly " - 1))
2354*38fd1498Szrj 		return false;
2355*38fd1498Szrj 	      from += sizeof "ly " - 1;
2356*38fd1498Szrj 	    }
2357*38fd1498Szrj 	  if (f == 'i' && *from == 'F')
2358*38fd1498Szrj 	    return false;
2359*38fd1498Szrj 	  f = *from;
2360*38fd1498Szrj 	}
2361*38fd1498Szrj       if (f != 'F' && f != 'f')
2362*38fd1498Szrj 	return false;
2363*38fd1498Szrj       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2364*38fd1498Szrj 	return false;
2365*38fd1498Szrj       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2366*38fd1498Szrj 	all_upper = true;
2367*38fd1498Szrj       else if (all_upper)
2368*38fd1498Szrj 	return false;
2369*38fd1498Szrj       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2370*38fd1498Szrj 	return false;
2371*38fd1498Szrj       from += sizeof "fall" - 1;
2372*38fd1498Szrj       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2373*38fd1498Szrj 	from += 2;
2374*38fd1498Szrj       else if (*from == ' ' || *from == '-')
2375*38fd1498Szrj 	from++;
2376*38fd1498Szrj       else if (*from != (all_upper ? 'T' : 't'))
2377*38fd1498Szrj 	return false;
2378*38fd1498Szrj       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2379*38fd1498Szrj 	return false;
2380*38fd1498Szrj       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2381*38fd1498Szrj 	return false;
2382*38fd1498Szrj       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2383*38fd1498Szrj 	{
2384*38fd1498Szrj 	  if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2385*38fd1498Szrj 	    return false;
2386*38fd1498Szrj 	  if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2387*38fd1498Szrj 		      sizeof "hrough" - 1))
2388*38fd1498Szrj 	    return false;
2389*38fd1498Szrj 	  from += sizeof "through" - 1;
2390*38fd1498Szrj 	}
2391*38fd1498Szrj       else
2392*38fd1498Szrj 	from += sizeof "thru" - 1;
2393*38fd1498Szrj       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2394*38fd1498Szrj 	from++;
2395*38fd1498Szrj       if (*from == '-')
2396*38fd1498Szrj 	{
2397*38fd1498Szrj 	  from++;
2398*38fd1498Szrj 	  if (*comment_start == '*')
2399*38fd1498Szrj 	    {
2400*38fd1498Szrj 	      do
2401*38fd1498Szrj 		{
2402*38fd1498Szrj 		  while (*from && *from != '*'
2403*38fd1498Szrj 			 && *from != '\n' && *from != '\r')
2404*38fd1498Szrj 		    from++;
2405*38fd1498Szrj 		  if (*from != '*' || from[1] == '/')
2406*38fd1498Szrj 		    break;
2407*38fd1498Szrj 		  from++;
2408*38fd1498Szrj 		}
2409*38fd1498Szrj 	      while (1);
2410*38fd1498Szrj 	    }
2411*38fd1498Szrj 	  else
2412*38fd1498Szrj 	    while (*from && *from != '\n' && *from != '\r')
2413*38fd1498Szrj 	      from++;
2414*38fd1498Szrj 	}
2415*38fd1498Szrj     }
2416*38fd1498Szrj   /* C block comment.  */
2417*38fd1498Szrj   if (*comment_start == '*')
2418*38fd1498Szrj     {
2419*38fd1498Szrj       if (*from != '*' || from[1] != '/')
2420*38fd1498Szrj 	return false;
2421*38fd1498Szrj     }
2422*38fd1498Szrj   /* C++ line comment.  */
2423*38fd1498Szrj   else if (*from != '\n')
2424*38fd1498Szrj     return false;
2425*38fd1498Szrj 
2426*38fd1498Szrj   return true;
2427*38fd1498Szrj }
2428*38fd1498Szrj 
2429*38fd1498Szrj /* Allocate COUNT tokens for RUN.  */
2430*38fd1498Szrj void
_cpp_init_tokenrun(tokenrun * run,unsigned int count)2431*38fd1498Szrj _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2432*38fd1498Szrj {
2433*38fd1498Szrj   run->base = XNEWVEC (cpp_token, count);
2434*38fd1498Szrj   run->limit = run->base + count;
2435*38fd1498Szrj   run->next = NULL;
2436*38fd1498Szrj }
2437*38fd1498Szrj 
2438*38fd1498Szrj /* Returns the next tokenrun, or creates one if there is none.  */
2439*38fd1498Szrj static tokenrun *
next_tokenrun(tokenrun * run)2440*38fd1498Szrj next_tokenrun (tokenrun *run)
2441*38fd1498Szrj {
2442*38fd1498Szrj   if (run->next == NULL)
2443*38fd1498Szrj     {
2444*38fd1498Szrj       run->next = XNEW (tokenrun);
2445*38fd1498Szrj       run->next->prev = run;
2446*38fd1498Szrj       _cpp_init_tokenrun (run->next, 250);
2447*38fd1498Szrj     }
2448*38fd1498Szrj 
2449*38fd1498Szrj   return run->next;
2450*38fd1498Szrj }
2451*38fd1498Szrj 
2452*38fd1498Szrj /* Return the number of not yet processed token in a given
2453*38fd1498Szrj    context.  */
2454*38fd1498Szrj int
_cpp_remaining_tokens_num_in_context(cpp_context * context)2455*38fd1498Szrj _cpp_remaining_tokens_num_in_context (cpp_context *context)
2456*38fd1498Szrj {
2457*38fd1498Szrj   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2458*38fd1498Szrj     return (LAST (context).token - FIRST (context).token);
2459*38fd1498Szrj   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2460*38fd1498Szrj 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2461*38fd1498Szrj     return (LAST (context).ptoken - FIRST (context).ptoken);
2462*38fd1498Szrj   else
2463*38fd1498Szrj       abort ();
2464*38fd1498Szrj }
2465*38fd1498Szrj 
2466*38fd1498Szrj /* Returns the token present at index INDEX in a given context.  If
2467*38fd1498Szrj    INDEX is zero, the next token to be processed is returned.  */
2468*38fd1498Szrj static const cpp_token*
_cpp_token_from_context_at(cpp_context * context,int index)2469*38fd1498Szrj _cpp_token_from_context_at (cpp_context *context, int index)
2470*38fd1498Szrj {
2471*38fd1498Szrj   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2472*38fd1498Szrj     return &(FIRST (context).token[index]);
2473*38fd1498Szrj   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2474*38fd1498Szrj 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
2475*38fd1498Szrj     return FIRST (context).ptoken[index];
2476*38fd1498Szrj  else
2477*38fd1498Szrj    abort ();
2478*38fd1498Szrj }
2479*38fd1498Szrj 
2480*38fd1498Szrj /* Look ahead in the input stream.  */
2481*38fd1498Szrj const cpp_token *
cpp_peek_token(cpp_reader * pfile,int index)2482*38fd1498Szrj cpp_peek_token (cpp_reader *pfile, int index)
2483*38fd1498Szrj {
2484*38fd1498Szrj   cpp_context *context = pfile->context;
2485*38fd1498Szrj   const cpp_token *peektok;
2486*38fd1498Szrj   int count;
2487*38fd1498Szrj 
2488*38fd1498Szrj   /* First, scan through any pending cpp_context objects.  */
2489*38fd1498Szrj   while (context->prev)
2490*38fd1498Szrj     {
2491*38fd1498Szrj       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2492*38fd1498Szrj 
2493*38fd1498Szrj       if (index < (int) sz)
2494*38fd1498Szrj         return _cpp_token_from_context_at (context, index);
2495*38fd1498Szrj       index -= (int) sz;
2496*38fd1498Szrj       context = context->prev;
2497*38fd1498Szrj     }
2498*38fd1498Szrj 
2499*38fd1498Szrj   /* We will have to read some new tokens after all (and do so
2500*38fd1498Szrj      without invalidating preceding tokens).  */
2501*38fd1498Szrj   count = index;
2502*38fd1498Szrj   pfile->keep_tokens++;
2503*38fd1498Szrj 
2504*38fd1498Szrj   /* For peeked tokens temporarily disable line_change reporting,
2505*38fd1498Szrj      until the tokens are parsed for real.  */
2506*38fd1498Szrj   void (*line_change) (cpp_reader *, const cpp_token *, int)
2507*38fd1498Szrj     = pfile->cb.line_change;
2508*38fd1498Szrj   pfile->cb.line_change = NULL;
2509*38fd1498Szrj 
2510*38fd1498Szrj   do
2511*38fd1498Szrj     {
2512*38fd1498Szrj       peektok = _cpp_lex_token (pfile);
2513*38fd1498Szrj       if (peektok->type == CPP_EOF)
2514*38fd1498Szrj 	{
2515*38fd1498Szrj 	  index--;
2516*38fd1498Szrj 	  break;
2517*38fd1498Szrj 	}
2518*38fd1498Szrj     }
2519*38fd1498Szrj   while (index--);
2520*38fd1498Szrj 
2521*38fd1498Szrj   _cpp_backup_tokens_direct (pfile, count - index);
2522*38fd1498Szrj   pfile->keep_tokens--;
2523*38fd1498Szrj   pfile->cb.line_change = line_change;
2524*38fd1498Szrj 
2525*38fd1498Szrj   return peektok;
2526*38fd1498Szrj }
2527*38fd1498Szrj 
2528*38fd1498Szrj /* Allocate a single token that is invalidated at the same time as the
2529*38fd1498Szrj    rest of the tokens on the line.  Has its line and col set to the
2530*38fd1498Szrj    same as the last lexed token, so that diagnostics appear in the
2531*38fd1498Szrj    right place.  */
2532*38fd1498Szrj cpp_token *
_cpp_temp_token(cpp_reader * pfile)2533*38fd1498Szrj _cpp_temp_token (cpp_reader *pfile)
2534*38fd1498Szrj {
2535*38fd1498Szrj   cpp_token *old, *result;
2536*38fd1498Szrj   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2537*38fd1498Szrj   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2538*38fd1498Szrj 
2539*38fd1498Szrj   old = pfile->cur_token - 1;
2540*38fd1498Szrj   /* Any pre-existing lookaheads must not be clobbered.  */
2541*38fd1498Szrj   if (la)
2542*38fd1498Szrj     {
2543*38fd1498Szrj       if (sz <= la)
2544*38fd1498Szrj         {
2545*38fd1498Szrj           tokenrun *next = next_tokenrun (pfile->cur_run);
2546*38fd1498Szrj 
2547*38fd1498Szrj           if (sz < la)
2548*38fd1498Szrj             memmove (next->base + 1, next->base,
2549*38fd1498Szrj                      (la - sz) * sizeof (cpp_token));
2550*38fd1498Szrj 
2551*38fd1498Szrj           next->base[0] = pfile->cur_run->limit[-1];
2552*38fd1498Szrj         }
2553*38fd1498Szrj 
2554*38fd1498Szrj       if (sz > 1)
2555*38fd1498Szrj         memmove (pfile->cur_token + 1, pfile->cur_token,
2556*38fd1498Szrj                  MIN (la, sz - 1) * sizeof (cpp_token));
2557*38fd1498Szrj     }
2558*38fd1498Szrj 
2559*38fd1498Szrj   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2560*38fd1498Szrj     {
2561*38fd1498Szrj       pfile->cur_run = next_tokenrun (pfile->cur_run);
2562*38fd1498Szrj       pfile->cur_token = pfile->cur_run->base;
2563*38fd1498Szrj     }
2564*38fd1498Szrj 
2565*38fd1498Szrj   result = pfile->cur_token++;
2566*38fd1498Szrj   result->src_loc = old->src_loc;
2567*38fd1498Szrj   return result;
2568*38fd1498Szrj }
2569*38fd1498Szrj 
2570*38fd1498Szrj /* Lex a token into RESULT (external interface).  Takes care of issues
2571*38fd1498Szrj    like directive handling, token lookahead, multiple include
2572*38fd1498Szrj    optimization and skipping.  */
2573*38fd1498Szrj const cpp_token *
_cpp_lex_token(cpp_reader * pfile)2574*38fd1498Szrj _cpp_lex_token (cpp_reader *pfile)
2575*38fd1498Szrj {
2576*38fd1498Szrj   cpp_token *result;
2577*38fd1498Szrj 
2578*38fd1498Szrj   for (;;)
2579*38fd1498Szrj     {
2580*38fd1498Szrj       if (pfile->cur_token == pfile->cur_run->limit)
2581*38fd1498Szrj 	{
2582*38fd1498Szrj 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
2583*38fd1498Szrj 	  pfile->cur_token = pfile->cur_run->base;
2584*38fd1498Szrj 	}
2585*38fd1498Szrj       /* We assume that the current token is somewhere in the current
2586*38fd1498Szrj 	 run.  */
2587*38fd1498Szrj       if (pfile->cur_token < pfile->cur_run->base
2588*38fd1498Szrj 	  || pfile->cur_token >= pfile->cur_run->limit)
2589*38fd1498Szrj 	abort ();
2590*38fd1498Szrj 
2591*38fd1498Szrj       if (pfile->lookaheads)
2592*38fd1498Szrj 	{
2593*38fd1498Szrj 	  pfile->lookaheads--;
2594*38fd1498Szrj 	  result = pfile->cur_token++;
2595*38fd1498Szrj 	}
2596*38fd1498Szrj       else
2597*38fd1498Szrj 	result = _cpp_lex_direct (pfile);
2598*38fd1498Szrj 
2599*38fd1498Szrj       if (result->flags & BOL)
2600*38fd1498Szrj 	{
2601*38fd1498Szrj 	  /* Is this a directive.  If _cpp_handle_directive returns
2602*38fd1498Szrj 	     false, it is an assembler #.  */
2603*38fd1498Szrj 	  if (result->type == CPP_HASH
2604*38fd1498Szrj 	      /* 6.10.3 p 11: Directives in a list of macro arguments
2605*38fd1498Szrj 		 gives undefined behavior.  This implementation
2606*38fd1498Szrj 		 handles the directive as normal.  */
2607*38fd1498Szrj 	      && pfile->state.parsing_args != 1)
2608*38fd1498Szrj 	    {
2609*38fd1498Szrj 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2610*38fd1498Szrj 		{
2611*38fd1498Szrj 		  if (pfile->directive_result.type == CPP_PADDING)
2612*38fd1498Szrj 		    continue;
2613*38fd1498Szrj 		  result = &pfile->directive_result;
2614*38fd1498Szrj 		}
2615*38fd1498Szrj 	    }
2616*38fd1498Szrj 	  else if (pfile->state.in_deferred_pragma)
2617*38fd1498Szrj 	    result = &pfile->directive_result;
2618*38fd1498Szrj 
2619*38fd1498Szrj 	  if (pfile->cb.line_change && !pfile->state.skipping)
2620*38fd1498Szrj 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2621*38fd1498Szrj 	}
2622*38fd1498Szrj 
2623*38fd1498Szrj       /* We don't skip tokens in directives.  */
2624*38fd1498Szrj       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2625*38fd1498Szrj 	break;
2626*38fd1498Szrj 
2627*38fd1498Szrj       /* Outside a directive, invalidate controlling macros.  At file
2628*38fd1498Szrj 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2629*38fd1498Szrj 	 get here and MI optimization works.  */
2630*38fd1498Szrj       pfile->mi_valid = false;
2631*38fd1498Szrj 
2632*38fd1498Szrj       if (!pfile->state.skipping || result->type == CPP_EOF)
2633*38fd1498Szrj 	break;
2634*38fd1498Szrj     }
2635*38fd1498Szrj 
2636*38fd1498Szrj   return result;
2637*38fd1498Szrj }
2638*38fd1498Szrj 
2639*38fd1498Szrj /* Returns true if a fresh line has been loaded.  */
2640*38fd1498Szrj bool
_cpp_get_fresh_line(cpp_reader * pfile)2641*38fd1498Szrj _cpp_get_fresh_line (cpp_reader *pfile)
2642*38fd1498Szrj {
2643*38fd1498Szrj   int return_at_eof;
2644*38fd1498Szrj 
2645*38fd1498Szrj   /* We can't get a new line until we leave the current directive.  */
2646*38fd1498Szrj   if (pfile->state.in_directive)
2647*38fd1498Szrj     return false;
2648*38fd1498Szrj 
2649*38fd1498Szrj   for (;;)
2650*38fd1498Szrj     {
2651*38fd1498Szrj       cpp_buffer *buffer = pfile->buffer;
2652*38fd1498Szrj 
2653*38fd1498Szrj       if (!buffer->need_line)
2654*38fd1498Szrj 	return true;
2655*38fd1498Szrj 
2656*38fd1498Szrj       if (buffer->next_line < buffer->rlimit)
2657*38fd1498Szrj 	{
2658*38fd1498Szrj 	  _cpp_clean_line (pfile);
2659*38fd1498Szrj 	  return true;
2660*38fd1498Szrj 	}
2661*38fd1498Szrj 
2662*38fd1498Szrj       /* First, get out of parsing arguments state.  */
2663*38fd1498Szrj       if (pfile->state.parsing_args)
2664*38fd1498Szrj 	return false;
2665*38fd1498Szrj 
2666*38fd1498Szrj       /* End of buffer.  Non-empty files should end in a newline.  */
2667*38fd1498Szrj       if (buffer->buf != buffer->rlimit
2668*38fd1498Szrj 	  && buffer->next_line > buffer->rlimit
2669*38fd1498Szrj 	  && !buffer->from_stage3)
2670*38fd1498Szrj 	{
2671*38fd1498Szrj 	  /* Clip to buffer size.  */
2672*38fd1498Szrj 	  buffer->next_line = buffer->rlimit;
2673*38fd1498Szrj 	}
2674*38fd1498Szrj 
2675*38fd1498Szrj       return_at_eof = buffer->return_at_eof;
2676*38fd1498Szrj       _cpp_pop_buffer (pfile);
2677*38fd1498Szrj       if (pfile->buffer == NULL || return_at_eof)
2678*38fd1498Szrj 	return false;
2679*38fd1498Szrj     }
2680*38fd1498Szrj }
2681*38fd1498Szrj 
/* Set result->type to THEN_TYPE and step buffer->cur over one
   character if the next input character equals CHAR; otherwise set
   result->type to ELSE_TYPE and leave buffer->cur alone.  Expects
   `buffer' and `result' to be in scope where the macro is used.
   Every argument is parenthesized in the expansion so that an
   argument which is itself an expression (a conditional, say) cannot
   change how the comparison or the assignments are parsed.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      result->type = (ELSE_TYPE);			\
      if (*buffer->cur == (CHAR))			\
	buffer->cur++, result->type = (THEN_TYPE);	\
    }							\
  while (0)
2690*38fd1498Szrj 
2691*38fd1498Szrj /* Lex a token into pfile->cur_token, which is also incremented, to
2692*38fd1498Szrj    get diagnostics pointing to the correct location.
2693*38fd1498Szrj 
2694*38fd1498Szrj    Does not handle issues such as token lookahead, multiple-include
2695*38fd1498Szrj    optimization, directives, skipping etc.  This function is only
2696*38fd1498Szrj    suitable for use by _cpp_lex_token, and in special cases like
2697*38fd1498Szrj    lex_expansion_token which doesn't care for any of these issues.
2698*38fd1498Szrj 
2699*38fd1498Szrj    When meeting a newline, returns CPP_EOF if parsing a directive,
2700*38fd1498Szrj    otherwise returns to the start of the token buffer if permissible.
2701*38fd1498Szrj    Returns the location of the lexed token.  */
2702*38fd1498Szrj cpp_token *
_cpp_lex_direct(cpp_reader * pfile)2703*38fd1498Szrj _cpp_lex_direct (cpp_reader *pfile)
2704*38fd1498Szrj {
2705*38fd1498Szrj   cppchar_t c;
2706*38fd1498Szrj   cpp_buffer *buffer;
2707*38fd1498Szrj   const unsigned char *comment_start;
2708*38fd1498Szrj   bool fallthrough_comment = false;
2709*38fd1498Szrj   cpp_token *result = pfile->cur_token++;
2710*38fd1498Szrj 
2711*38fd1498Szrj  fresh_line:
2712*38fd1498Szrj   result->flags = 0;
2713*38fd1498Szrj   buffer = pfile->buffer;
2714*38fd1498Szrj   if (buffer->need_line)
2715*38fd1498Szrj     {
2716*38fd1498Szrj       if (pfile->state.in_deferred_pragma)
2717*38fd1498Szrj 	{
2718*38fd1498Szrj 	  result->type = CPP_PRAGMA_EOL;
2719*38fd1498Szrj 	  pfile->state.in_deferred_pragma = false;
2720*38fd1498Szrj 	  if (!pfile->state.pragma_allow_expansion)
2721*38fd1498Szrj 	    pfile->state.prevent_expansion--;
2722*38fd1498Szrj 	  return result;
2723*38fd1498Szrj 	}
2724*38fd1498Szrj       if (!_cpp_get_fresh_line (pfile))
2725*38fd1498Szrj 	{
2726*38fd1498Szrj 	  result->type = CPP_EOF;
2727*38fd1498Szrj 	  if (!pfile->state.in_directive)
2728*38fd1498Szrj 	    {
2729*38fd1498Szrj 	      /* Tell the compiler the line number of the EOF token.  */
2730*38fd1498Szrj 	      result->src_loc = pfile->line_table->highest_line;
2731*38fd1498Szrj 	      result->flags = BOL;
2732*38fd1498Szrj 	    }
2733*38fd1498Szrj 	  return result;
2734*38fd1498Szrj 	}
2735*38fd1498Szrj       if (buffer != pfile->buffer)
2736*38fd1498Szrj 	fallthrough_comment = false;
2737*38fd1498Szrj       if (!pfile->keep_tokens)
2738*38fd1498Szrj 	{
2739*38fd1498Szrj 	  pfile->cur_run = &pfile->base_run;
2740*38fd1498Szrj 	  result = pfile->base_run.base;
2741*38fd1498Szrj 	  pfile->cur_token = result + 1;
2742*38fd1498Szrj 	}
2743*38fd1498Szrj       result->flags = BOL;
2744*38fd1498Szrj       if (pfile->state.parsing_args == 2)
2745*38fd1498Szrj 	result->flags |= PREV_WHITE;
2746*38fd1498Szrj     }
2747*38fd1498Szrj   buffer = pfile->buffer;
2748*38fd1498Szrj  update_tokens_line:
2749*38fd1498Szrj   result->src_loc = pfile->line_table->highest_line;
2750*38fd1498Szrj 
2751*38fd1498Szrj  skipped_white:
2752*38fd1498Szrj   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2753*38fd1498Szrj       && !pfile->overlaid_buffer)
2754*38fd1498Szrj     {
2755*38fd1498Szrj       _cpp_process_line_notes (pfile, false);
2756*38fd1498Szrj       result->src_loc = pfile->line_table->highest_line;
2757*38fd1498Szrj     }
2758*38fd1498Szrj   c = *buffer->cur++;
2759*38fd1498Szrj 
2760*38fd1498Szrj   if (pfile->forced_token_location_p)
2761*38fd1498Szrj     result->src_loc = *pfile->forced_token_location_p;
2762*38fd1498Szrj   else
2763*38fd1498Szrj     result->src_loc = linemap_position_for_column (pfile->line_table,
2764*38fd1498Szrj 					  CPP_BUF_COLUMN (buffer, buffer->cur));
2765*38fd1498Szrj 
2766*38fd1498Szrj   switch (c)
2767*38fd1498Szrj     {
2768*38fd1498Szrj     case ' ': case '\t': case '\f': case '\v': case '\0':
2769*38fd1498Szrj       result->flags |= PREV_WHITE;
2770*38fd1498Szrj       skip_whitespace (pfile, c);
2771*38fd1498Szrj       goto skipped_white;
2772*38fd1498Szrj 
2773*38fd1498Szrj     case '\n':
2774*38fd1498Szrj       if (buffer->cur < buffer->rlimit)
2775*38fd1498Szrj 	CPP_INCREMENT_LINE (pfile, 0);
2776*38fd1498Szrj       buffer->need_line = true;
2777*38fd1498Szrj       goto fresh_line;
2778*38fd1498Szrj 
2779*38fd1498Szrj     case '0': case '1': case '2': case '3': case '4':
2780*38fd1498Szrj     case '5': case '6': case '7': case '8': case '9':
2781*38fd1498Szrj       {
2782*38fd1498Szrj 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2783*38fd1498Szrj 	result->type = CPP_NUMBER;
2784*38fd1498Szrj 	lex_number (pfile, &result->val.str, &nst);
2785*38fd1498Szrj 	warn_about_normalization (pfile, result, &nst);
2786*38fd1498Szrj 	break;
2787*38fd1498Szrj       }
2788*38fd1498Szrj 
2789*38fd1498Szrj     case 'L':
2790*38fd1498Szrj     case 'u':
2791*38fd1498Szrj     case 'U':
2792*38fd1498Szrj     case 'R':
2793*38fd1498Szrj       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2794*38fd1498Szrj 	 wide strings or raw strings.  */
2795*38fd1498Szrj       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2796*38fd1498Szrj 	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2797*38fd1498Szrj 	{
2798*38fd1498Szrj 	  if ((*buffer->cur == '\'' && c != 'R')
2799*38fd1498Szrj 	      || *buffer->cur == '"'
2800*38fd1498Szrj 	      || (*buffer->cur == 'R'
2801*38fd1498Szrj 		  && c != 'R'
2802*38fd1498Szrj 		  && buffer->cur[1] == '"'
2803*38fd1498Szrj 		  && CPP_OPTION (pfile, rliterals))
2804*38fd1498Szrj 	      || (*buffer->cur == '8'
2805*38fd1498Szrj 		  && c == 'u'
2806*38fd1498Szrj 		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
2807*38fd1498Szrj 				&& CPP_OPTION (pfile, utf8_char_literals)))
2808*38fd1498Szrj 		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2809*38fd1498Szrj 			  && CPP_OPTION (pfile, rliterals)))))
2810*38fd1498Szrj 	    {
2811*38fd1498Szrj 	      lex_string (pfile, result, buffer->cur - 1);
2812*38fd1498Szrj 	      break;
2813*38fd1498Szrj 	    }
2814*38fd1498Szrj 	}
2815*38fd1498Szrj       /* Fall through.  */
2816*38fd1498Szrj 
2817*38fd1498Szrj     case '_':
2818*38fd1498Szrj     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2819*38fd1498Szrj     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2820*38fd1498Szrj     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2821*38fd1498Szrj     case 's': case 't':           case 'v': case 'w': case 'x':
2822*38fd1498Szrj     case 'y': case 'z':
2823*38fd1498Szrj     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2824*38fd1498Szrj     case 'G': case 'H': case 'I': case 'J': case 'K':
2825*38fd1498Szrj     case 'M': case 'N': case 'O': case 'P': case 'Q':
2826*38fd1498Szrj     case 'S': case 'T':           case 'V': case 'W': case 'X':
2827*38fd1498Szrj     case 'Y': case 'Z':
2828*38fd1498Szrj       result->type = CPP_NAME;
2829*38fd1498Szrj       {
2830*38fd1498Szrj 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2831*38fd1498Szrj 	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2832*38fd1498Szrj 						&nst,
2833*38fd1498Szrj 						&result->val.node.spelling);
2834*38fd1498Szrj 	warn_about_normalization (pfile, result, &nst);
2835*38fd1498Szrj       }
2836*38fd1498Szrj 
2837*38fd1498Szrj       /* Convert named operators to their proper types.  */
2838*38fd1498Szrj       if (result->val.node.node->flags & NODE_OPERATOR)
2839*38fd1498Szrj 	{
2840*38fd1498Szrj 	  result->flags |= NAMED_OP;
2841*38fd1498Szrj 	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2842*38fd1498Szrj 	}
2843*38fd1498Szrj 
2844*38fd1498Szrj       /* Signal FALLTHROUGH comment followed by another token.  */
2845*38fd1498Szrj       if (fallthrough_comment)
2846*38fd1498Szrj 	result->flags |= PREV_FALLTHROUGH;
2847*38fd1498Szrj       break;
2848*38fd1498Szrj 
2849*38fd1498Szrj     case '\'':
2850*38fd1498Szrj     case '"':
2851*38fd1498Szrj       lex_string (pfile, result, buffer->cur - 1);
2852*38fd1498Szrj       break;
2853*38fd1498Szrj 
2854*38fd1498Szrj     case '/':
2855*38fd1498Szrj       /* A potential block or line comment.  */
2856*38fd1498Szrj       comment_start = buffer->cur;
2857*38fd1498Szrj       c = *buffer->cur;
2858*38fd1498Szrj 
2859*38fd1498Szrj       if (c == '*')
2860*38fd1498Szrj 	{
2861*38fd1498Szrj 	  if (_cpp_skip_block_comment (pfile))
2862*38fd1498Szrj 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2863*38fd1498Szrj 	}
2864*38fd1498Szrj       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2865*38fd1498Szrj 	{
2866*38fd1498Szrj 	  /* Don't warn for system headers.  */
2867*38fd1498Szrj 	  if (cpp_in_system_header (pfile))
2868*38fd1498Szrj 	    ;
2869*38fd1498Szrj 	  /* Warn about comments if pedantically GNUC89, and not
2870*38fd1498Szrj 	     in system headers.  */
2871*38fd1498Szrj 	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2872*38fd1498Szrj 		   && CPP_PEDANTIC (pfile)
2873*38fd1498Szrj 		   && ! buffer->warned_cplusplus_comments)
2874*38fd1498Szrj 	    {
2875*38fd1498Szrj 	      cpp_error (pfile, CPP_DL_PEDWARN,
2876*38fd1498Szrj 			 "C++ style comments are not allowed in ISO C90");
2877*38fd1498Szrj 	      cpp_error (pfile, CPP_DL_PEDWARN,
2878*38fd1498Szrj 			 "(this will be reported only once per input file)");
2879*38fd1498Szrj 	      buffer->warned_cplusplus_comments = 1;
2880*38fd1498Szrj 	    }
2881*38fd1498Szrj 	  /* Or if specifically desired via -Wc90-c99-compat.  */
2882*38fd1498Szrj 	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2883*38fd1498Szrj 		   && ! CPP_OPTION (pfile, cplusplus)
2884*38fd1498Szrj 		   && ! buffer->warned_cplusplus_comments)
2885*38fd1498Szrj 	    {
2886*38fd1498Szrj 	      cpp_error (pfile, CPP_DL_WARNING,
2887*38fd1498Szrj 			 "C++ style comments are incompatible with C90");
2888*38fd1498Szrj 	      cpp_error (pfile, CPP_DL_WARNING,
2889*38fd1498Szrj 			 "(this will be reported only once per input file)");
2890*38fd1498Szrj 	      buffer->warned_cplusplus_comments = 1;
2891*38fd1498Szrj 	    }
2892*38fd1498Szrj 	  /* In C89/C94, C++ style comments are forbidden.  */
2893*38fd1498Szrj 	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2894*38fd1498Szrj 		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
2895*38fd1498Szrj 	    {
2896*38fd1498Szrj 	      /* But don't be confused about valid code such as
2897*38fd1498Szrj 	         - // immediately followed by *,
2898*38fd1498Szrj 		 - // in a preprocessing directive,
2899*38fd1498Szrj 		 - // in an #if 0 block.  */
2900*38fd1498Szrj 	      if (buffer->cur[1] == '*'
2901*38fd1498Szrj 		  || pfile->state.in_directive
2902*38fd1498Szrj 		  || pfile->state.skipping)
2903*38fd1498Szrj 		{
2904*38fd1498Szrj 		  result->type = CPP_DIV;
2905*38fd1498Szrj 		  break;
2906*38fd1498Szrj 		}
2907*38fd1498Szrj 	      else if (! buffer->warned_cplusplus_comments)
2908*38fd1498Szrj 		{
2909*38fd1498Szrj 		  cpp_error (pfile, CPP_DL_ERROR,
2910*38fd1498Szrj 			     "C++ style comments are not allowed in ISO C90");
2911*38fd1498Szrj 		  cpp_error (pfile, CPP_DL_ERROR,
2912*38fd1498Szrj 			     "(this will be reported only once per input "
2913*38fd1498Szrj 			     "file)");
2914*38fd1498Szrj 		  buffer->warned_cplusplus_comments = 1;
2915*38fd1498Szrj 		}
2916*38fd1498Szrj 	    }
2917*38fd1498Szrj 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2918*38fd1498Szrj 	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2919*38fd1498Szrj 	}
2920*38fd1498Szrj       else if (c == '=')
2921*38fd1498Szrj 	{
2922*38fd1498Szrj 	  buffer->cur++;
2923*38fd1498Szrj 	  result->type = CPP_DIV_EQ;
2924*38fd1498Szrj 	  break;
2925*38fd1498Szrj 	}
2926*38fd1498Szrj       else
2927*38fd1498Szrj 	{
2928*38fd1498Szrj 	  result->type = CPP_DIV;
2929*38fd1498Szrj 	  break;
2930*38fd1498Szrj 	}
2931*38fd1498Szrj 
2932*38fd1498Szrj       if (fallthrough_comment_p (pfile, comment_start))
2933*38fd1498Szrj 	fallthrough_comment = true;
2934*38fd1498Szrj 
2935*38fd1498Szrj       if (pfile->cb.comment)
2936*38fd1498Szrj 	{
2937*38fd1498Szrj 	  size_t len = pfile->buffer->cur - comment_start;
2938*38fd1498Szrj 	  pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
2939*38fd1498Szrj 			     len + 1);
2940*38fd1498Szrj 	}
2941*38fd1498Szrj 
2942*38fd1498Szrj       if (!pfile->state.save_comments)
2943*38fd1498Szrj 	{
2944*38fd1498Szrj 	  result->flags |= PREV_WHITE;
2945*38fd1498Szrj 	  goto update_tokens_line;
2946*38fd1498Szrj 	}
2947*38fd1498Szrj 
2948*38fd1498Szrj       if (fallthrough_comment)
2949*38fd1498Szrj 	result->flags |= PREV_FALLTHROUGH;
2950*38fd1498Szrj 
2951*38fd1498Szrj       /* Save the comment as a token in its own right.  */
2952*38fd1498Szrj       save_comment (pfile, result, comment_start, c);
2953*38fd1498Szrj       break;
2954*38fd1498Szrj 
2955*38fd1498Szrj     case '<':
2956*38fd1498Szrj       if (pfile->state.angled_headers)
2957*38fd1498Szrj 	{
2958*38fd1498Szrj 	  lex_string (pfile, result, buffer->cur - 1);
2959*38fd1498Szrj 	  if (result->type != CPP_LESS)
2960*38fd1498Szrj 	    break;
2961*38fd1498Szrj 	}
2962*38fd1498Szrj 
2963*38fd1498Szrj       result->type = CPP_LESS;
2964*38fd1498Szrj       if (*buffer->cur == '=')
2965*38fd1498Szrj 	buffer->cur++, result->type = CPP_LESS_EQ;
2966*38fd1498Szrj       else if (*buffer->cur == '<')
2967*38fd1498Szrj 	{
2968*38fd1498Szrj 	  buffer->cur++;
2969*38fd1498Szrj 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2970*38fd1498Szrj 	}
2971*38fd1498Szrj       else if (CPP_OPTION (pfile, digraphs))
2972*38fd1498Szrj 	{
2973*38fd1498Szrj 	  if (*buffer->cur == ':')
2974*38fd1498Szrj 	    {
2975*38fd1498Szrj 	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2976*38fd1498Szrj 		 three characters are <:: and the subsequent character
2977*38fd1498Szrj 		 is neither : nor >, the < is treated as a preprocessor
2978*38fd1498Szrj 		 token by itself".  */
2979*38fd1498Szrj 	      if (CPP_OPTION (pfile, cplusplus)
2980*38fd1498Szrj 		  && CPP_OPTION (pfile, lang) != CLK_CXX98
2981*38fd1498Szrj 		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2982*38fd1498Szrj 		  && buffer->cur[1] == ':'
2983*38fd1498Szrj 		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2984*38fd1498Szrj 		break;
2985*38fd1498Szrj 
2986*38fd1498Szrj 	      buffer->cur++;
2987*38fd1498Szrj 	      result->flags |= DIGRAPH;
2988*38fd1498Szrj 	      result->type = CPP_OPEN_SQUARE;
2989*38fd1498Szrj 	    }
2990*38fd1498Szrj 	  else if (*buffer->cur == '%')
2991*38fd1498Szrj 	    {
2992*38fd1498Szrj 	      buffer->cur++;
2993*38fd1498Szrj 	      result->flags |= DIGRAPH;
2994*38fd1498Szrj 	      result->type = CPP_OPEN_BRACE;
2995*38fd1498Szrj 	    }
2996*38fd1498Szrj 	}
2997*38fd1498Szrj       break;
2998*38fd1498Szrj 
2999*38fd1498Szrj     case '>':
3000*38fd1498Szrj       result->type = CPP_GREATER;
3001*38fd1498Szrj       if (*buffer->cur == '=')
3002*38fd1498Szrj 	buffer->cur++, result->type = CPP_GREATER_EQ;
3003*38fd1498Szrj       else if (*buffer->cur == '>')
3004*38fd1498Szrj 	{
3005*38fd1498Szrj 	  buffer->cur++;
3006*38fd1498Szrj 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3007*38fd1498Szrj 	}
3008*38fd1498Szrj       break;
3009*38fd1498Szrj 
3010*38fd1498Szrj     case '%':
3011*38fd1498Szrj       result->type = CPP_MOD;
3012*38fd1498Szrj       if (*buffer->cur == '=')
3013*38fd1498Szrj 	buffer->cur++, result->type = CPP_MOD_EQ;
3014*38fd1498Szrj       else if (CPP_OPTION (pfile, digraphs))
3015*38fd1498Szrj 	{
3016*38fd1498Szrj 	  if (*buffer->cur == ':')
3017*38fd1498Szrj 	    {
3018*38fd1498Szrj 	      buffer->cur++;
3019*38fd1498Szrj 	      result->flags |= DIGRAPH;
3020*38fd1498Szrj 	      result->type = CPP_HASH;
3021*38fd1498Szrj 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
3022*38fd1498Szrj 		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3023*38fd1498Szrj 	    }
3024*38fd1498Szrj 	  else if (*buffer->cur == '>')
3025*38fd1498Szrj 	    {
3026*38fd1498Szrj 	      buffer->cur++;
3027*38fd1498Szrj 	      result->flags |= DIGRAPH;
3028*38fd1498Szrj 	      result->type = CPP_CLOSE_BRACE;
3029*38fd1498Szrj 	    }
3030*38fd1498Szrj 	}
3031*38fd1498Szrj       break;
3032*38fd1498Szrj 
3033*38fd1498Szrj     case '.':
3034*38fd1498Szrj       result->type = CPP_DOT;
3035*38fd1498Szrj       if (ISDIGIT (*buffer->cur))
3036*38fd1498Szrj 	{
3037*38fd1498Szrj 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3038*38fd1498Szrj 	  result->type = CPP_NUMBER;
3039*38fd1498Szrj 	  lex_number (pfile, &result->val.str, &nst);
3040*38fd1498Szrj 	  warn_about_normalization (pfile, result, &nst);
3041*38fd1498Szrj 	}
3042*38fd1498Szrj       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3043*38fd1498Szrj 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
3044*38fd1498Szrj       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3045*38fd1498Szrj 	buffer->cur++, result->type = CPP_DOT_STAR;
3046*38fd1498Szrj       break;
3047*38fd1498Szrj 
3048*38fd1498Szrj     case '+':
3049*38fd1498Szrj       result->type = CPP_PLUS;
3050*38fd1498Szrj       if (*buffer->cur == '+')
3051*38fd1498Szrj 	buffer->cur++, result->type = CPP_PLUS_PLUS;
3052*38fd1498Szrj       else if (*buffer->cur == '=')
3053*38fd1498Szrj 	buffer->cur++, result->type = CPP_PLUS_EQ;
3054*38fd1498Szrj       break;
3055*38fd1498Szrj 
3056*38fd1498Szrj     case '-':
3057*38fd1498Szrj       result->type = CPP_MINUS;
3058*38fd1498Szrj       if (*buffer->cur == '>')
3059*38fd1498Szrj 	{
3060*38fd1498Szrj 	  buffer->cur++;
3061*38fd1498Szrj 	  result->type = CPP_DEREF;
3062*38fd1498Szrj 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3063*38fd1498Szrj 	    buffer->cur++, result->type = CPP_DEREF_STAR;
3064*38fd1498Szrj 	}
3065*38fd1498Szrj       else if (*buffer->cur == '-')
3066*38fd1498Szrj 	buffer->cur++, result->type = CPP_MINUS_MINUS;
3067*38fd1498Szrj       else if (*buffer->cur == '=')
3068*38fd1498Szrj 	buffer->cur++, result->type = CPP_MINUS_EQ;
3069*38fd1498Szrj       break;
3070*38fd1498Szrj 
3071*38fd1498Szrj     case '&':
3072*38fd1498Szrj       result->type = CPP_AND;
3073*38fd1498Szrj       if (*buffer->cur == '&')
3074*38fd1498Szrj 	buffer->cur++, result->type = CPP_AND_AND;
3075*38fd1498Szrj       else if (*buffer->cur == '=')
3076*38fd1498Szrj 	buffer->cur++, result->type = CPP_AND_EQ;
3077*38fd1498Szrj       break;
3078*38fd1498Szrj 
3079*38fd1498Szrj     case '|':
3080*38fd1498Szrj       result->type = CPP_OR;
3081*38fd1498Szrj       if (*buffer->cur == '|')
3082*38fd1498Szrj 	buffer->cur++, result->type = CPP_OR_OR;
3083*38fd1498Szrj       else if (*buffer->cur == '=')
3084*38fd1498Szrj 	buffer->cur++, result->type = CPP_OR_EQ;
3085*38fd1498Szrj       break;
3086*38fd1498Szrj 
3087*38fd1498Szrj     case ':':
3088*38fd1498Szrj       result->type = CPP_COLON;
3089*38fd1498Szrj       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
3090*38fd1498Szrj 	buffer->cur++, result->type = CPP_SCOPE;
3091*38fd1498Szrj       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3092*38fd1498Szrj 	{
3093*38fd1498Szrj 	  buffer->cur++;
3094*38fd1498Szrj 	  result->flags |= DIGRAPH;
3095*38fd1498Szrj 	  result->type = CPP_CLOSE_SQUARE;
3096*38fd1498Szrj 	}
3097*38fd1498Szrj       break;
3098*38fd1498Szrj 
3099*38fd1498Szrj     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3100*38fd1498Szrj     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3101*38fd1498Szrj     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3102*38fd1498Szrj     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3103*38fd1498Szrj     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3104*38fd1498Szrj 
3105*38fd1498Szrj     case '?': result->type = CPP_QUERY; break;
3106*38fd1498Szrj     case '~': result->type = CPP_COMPL; break;
3107*38fd1498Szrj     case ',': result->type = CPP_COMMA; break;
3108*38fd1498Szrj     case '(': result->type = CPP_OPEN_PAREN; break;
3109*38fd1498Szrj     case ')': result->type = CPP_CLOSE_PAREN; break;
3110*38fd1498Szrj     case '[': result->type = CPP_OPEN_SQUARE; break;
3111*38fd1498Szrj     case ']': result->type = CPP_CLOSE_SQUARE; break;
3112*38fd1498Szrj     case '{': result->type = CPP_OPEN_BRACE; break;
3113*38fd1498Szrj     case '}': result->type = CPP_CLOSE_BRACE; break;
3114*38fd1498Szrj     case ';': result->type = CPP_SEMICOLON; break;
3115*38fd1498Szrj 
3116*38fd1498Szrj       /* @ is a punctuator in Objective-C.  */
3117*38fd1498Szrj     case '@': result->type = CPP_ATSIGN; break;
3118*38fd1498Szrj 
3119*38fd1498Szrj     case '$':
3120*38fd1498Szrj     case '\\':
3121*38fd1498Szrj       {
3122*38fd1498Szrj 	const uchar *base = --buffer->cur;
3123*38fd1498Szrj 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3124*38fd1498Szrj 
3125*38fd1498Szrj 	if (forms_identifier_p (pfile, true, &nst))
3126*38fd1498Szrj 	  {
3127*38fd1498Szrj 	    result->type = CPP_NAME;
3128*38fd1498Szrj 	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
3129*38fd1498Szrj 						    &result->val.node.spelling);
3130*38fd1498Szrj 	    warn_about_normalization (pfile, result, &nst);
3131*38fd1498Szrj 	    break;
3132*38fd1498Szrj 	  }
3133*38fd1498Szrj 	buffer->cur++;
3134*38fd1498Szrj       }
3135*38fd1498Szrj       /* FALLTHRU */
3136*38fd1498Szrj 
3137*38fd1498Szrj     default:
3138*38fd1498Szrj       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
3139*38fd1498Szrj       break;
3140*38fd1498Szrj     }
3141*38fd1498Szrj 
3142*38fd1498Szrj   /* Potentially convert the location of the token to a range.  */
3143*38fd1498Szrj   if (result->src_loc >= RESERVED_LOCATION_COUNT
3144*38fd1498Szrj       && result->type != CPP_EOF)
3145*38fd1498Szrj     {
3146*38fd1498Szrj       /* Ensure that any line notes are processed, so that we have the
3147*38fd1498Szrj 	 correct physical line/column for the end-point of the token even
3148*38fd1498Szrj 	 when a logical line is split via one or more backslashes.  */
3149*38fd1498Szrj       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3150*38fd1498Szrj 	  && !pfile->overlaid_buffer)
3151*38fd1498Szrj 	_cpp_process_line_notes (pfile, false);
3152*38fd1498Szrj 
3153*38fd1498Szrj       source_range tok_range;
3154*38fd1498Szrj       tok_range.m_start = result->src_loc;
3155*38fd1498Szrj       tok_range.m_finish
3156*38fd1498Szrj 	= linemap_position_for_column (pfile->line_table,
3157*38fd1498Szrj 				       CPP_BUF_COLUMN (buffer, buffer->cur));
3158*38fd1498Szrj 
3159*38fd1498Szrj       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
3160*38fd1498Szrj 					       result->src_loc,
3161*38fd1498Szrj 					       tok_range, NULL);
3162*38fd1498Szrj     }
3163*38fd1498Szrj 
3164*38fd1498Szrj   return result;
3165*38fd1498Szrj }
3166*38fd1498Szrj 
3167*38fd1498Szrj /* An upper bound on the number of bytes needed to spell TOKEN.
3168*38fd1498Szrj    Does not include preceding whitespace.  */
3169*38fd1498Szrj unsigned int
cpp_token_len(const cpp_token * token)3170*38fd1498Szrj cpp_token_len (const cpp_token *token)
3171*38fd1498Szrj {
3172*38fd1498Szrj   unsigned int len;
3173*38fd1498Szrj 
3174*38fd1498Szrj   switch (TOKEN_SPELL (token))
3175*38fd1498Szrj     {
3176*38fd1498Szrj     default:		len = 6;				break;
3177*38fd1498Szrj     case SPELL_LITERAL:	len = token->val.str.len;		break;
3178*38fd1498Szrj     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
3179*38fd1498Szrj     }
3180*38fd1498Szrj 
3181*38fd1498Szrj   return len;
3182*38fd1498Szrj }
3183*38fd1498Szrj 
/* Decode the single UTF-8 encoded character that starts at NAME and
   write it to BUFFER as a universal character name of the form
   \UXXXXXXXX: a backslash, 'U', and eight lowercase hexadecimal
   digits.  Exactly 10 bytes are written to BUFFER and no terminating
   NUL is added.

   Returns the number of bytes read out of NAME, i.e. the length of the
   UTF-8 sequence as announced by its lead byte.  The result is 0 for a
   byte without the top bit set, so callers should only pass non-ASCII
   lead bytes.  Aborts if a continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence: it equals the number of
     leading one bits in the first byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* The payload bits of the lead byte are those below the length
     marker.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      /* Each continuation byte contributes its low six bits.  */
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like
	 10xxxxxx.  */
      if ((*name & ~0x3F) != 0x80)
	abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  /* Emit the eight hex digits, most significant nibble first.  */
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}
3217*38fd1498Szrj 
3218*38fd1498Szrj /* Given a token TYPE corresponding to a digraph, return a pointer to
3219*38fd1498Szrj    the spelling of the digraph.  */
3220*38fd1498Szrj static const unsigned char *
cpp_digraph2name(enum cpp_ttype type)3221*38fd1498Szrj cpp_digraph2name (enum cpp_ttype type)
3222*38fd1498Szrj {
3223*38fd1498Szrj   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3224*38fd1498Szrj }
3225*38fd1498Szrj 
3226*38fd1498Szrj /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3227*38fd1498Szrj    The buffer must already contain the enough space to hold the
3228*38fd1498Szrj    token's spelling.  Returns a pointer to the character after the
3229*38fd1498Szrj    last character written.  */
3230*38fd1498Szrj unsigned char *
_cpp_spell_ident_ucns(unsigned char * buffer,cpp_hashnode * ident)3231*38fd1498Szrj _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3232*38fd1498Szrj {
3233*38fd1498Szrj   size_t i;
3234*38fd1498Szrj   const unsigned char *name = NODE_NAME (ident);
3235*38fd1498Szrj 
3236*38fd1498Szrj   for (i = 0; i < NODE_LEN (ident); i++)
3237*38fd1498Szrj     if (name[i] & ~0x7F)
3238*38fd1498Szrj       {
3239*38fd1498Szrj 	i += utf8_to_ucn (buffer, name + i) - 1;
3240*38fd1498Szrj 	buffer += 10;
3241*38fd1498Szrj       }
3242*38fd1498Szrj     else
3243*38fd1498Szrj       *buffer++ = name[i];
3244*38fd1498Szrj 
3245*38fd1498Szrj   return buffer;
3246*38fd1498Szrj }
3247*38fd1498Szrj 
3248*38fd1498Szrj /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3249*38fd1498Szrj    already contain the enough space to hold the token's spelling.
3250*38fd1498Szrj    Returns a pointer to the character after the last character written.
3251*38fd1498Szrj    FORSTRING is true if this is to be the spelling after translation
3252*38fd1498Szrj    phase 1 (with the original spelling of extended identifiers), false
3253*38fd1498Szrj    if extended identifiers should always be written using UCNs (there is
3254*38fd1498Szrj    no option for always writing them in the internal UTF-8 form).
3255*38fd1498Szrj    FIXME: Would be nice if we didn't need the PFILE argument.  */
3256*38fd1498Szrj unsigned char *
cpp_spell_token(cpp_reader * pfile,const cpp_token * token,unsigned char * buffer,bool forstring)3257*38fd1498Szrj cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3258*38fd1498Szrj 		 unsigned char *buffer, bool forstring)
3259*38fd1498Szrj {
3260*38fd1498Szrj   switch (TOKEN_SPELL (token))
3261*38fd1498Szrj     {
3262*38fd1498Szrj     case SPELL_OPERATOR:
3263*38fd1498Szrj       {
3264*38fd1498Szrj 	const unsigned char *spelling;
3265*38fd1498Szrj 	unsigned char c;
3266*38fd1498Szrj 
3267*38fd1498Szrj 	if (token->flags & DIGRAPH)
3268*38fd1498Szrj 	  spelling = cpp_digraph2name (token->type);
3269*38fd1498Szrj 	else if (token->flags & NAMED_OP)
3270*38fd1498Szrj 	  goto spell_ident;
3271*38fd1498Szrj 	else
3272*38fd1498Szrj 	  spelling = TOKEN_NAME (token);
3273*38fd1498Szrj 
3274*38fd1498Szrj 	while ((c = *spelling++) != '\0')
3275*38fd1498Szrj 	  *buffer++ = c;
3276*38fd1498Szrj       }
3277*38fd1498Szrj       break;
3278*38fd1498Szrj 
3279*38fd1498Szrj     spell_ident:
3280*38fd1498Szrj     case SPELL_IDENT:
3281*38fd1498Szrj       if (forstring)
3282*38fd1498Szrj 	{
3283*38fd1498Szrj 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
3284*38fd1498Szrj 		  NODE_LEN (token->val.node.spelling));
3285*38fd1498Szrj 	  buffer += NODE_LEN (token->val.node.spelling);
3286*38fd1498Szrj 	}
3287*38fd1498Szrj       else
3288*38fd1498Szrj 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3289*38fd1498Szrj       break;
3290*38fd1498Szrj 
3291*38fd1498Szrj     case SPELL_LITERAL:
3292*38fd1498Szrj       memcpy (buffer, token->val.str.text, token->val.str.len);
3293*38fd1498Szrj       buffer += token->val.str.len;
3294*38fd1498Szrj       break;
3295*38fd1498Szrj 
3296*38fd1498Szrj     case SPELL_NONE:
3297*38fd1498Szrj       cpp_error (pfile, CPP_DL_ICE,
3298*38fd1498Szrj 		 "unspellable token %s", TOKEN_NAME (token));
3299*38fd1498Szrj       break;
3300*38fd1498Szrj     }
3301*38fd1498Szrj 
3302*38fd1498Szrj   return buffer;
3303*38fd1498Szrj }
3304*38fd1498Szrj 
3305*38fd1498Szrj /* Returns TOKEN spelt as a null-terminated string.  The string is
3306*38fd1498Szrj    freed when the reader is destroyed.  Useful for diagnostics.  */
3307*38fd1498Szrj unsigned char *
cpp_token_as_text(cpp_reader * pfile,const cpp_token * token)3308*38fd1498Szrj cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3309*38fd1498Szrj {
3310*38fd1498Szrj   unsigned int len = cpp_token_len (token) + 1;
3311*38fd1498Szrj   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3312*38fd1498Szrj 
3313*38fd1498Szrj   end = cpp_spell_token (pfile, token, start, false);
3314*38fd1498Szrj   end[0] = '\0';
3315*38fd1498Szrj 
3316*38fd1498Szrj   return start;
3317*38fd1498Szrj }
3318*38fd1498Szrj 
3319*38fd1498Szrj /* Returns a pointer to a string which spells the token defined by
3320*38fd1498Szrj    TYPE and FLAGS.  Used by C front ends, which really should move to
3321*38fd1498Szrj    using cpp_token_as_text.  */
3322*38fd1498Szrj const char *
cpp_type2name(enum cpp_ttype type,unsigned char flags)3323*38fd1498Szrj cpp_type2name (enum cpp_ttype type, unsigned char flags)
3324*38fd1498Szrj {
3325*38fd1498Szrj   if (flags & DIGRAPH)
3326*38fd1498Szrj     return (const char *) cpp_digraph2name (type);
3327*38fd1498Szrj   else if (flags & NAMED_OP)
3328*38fd1498Szrj     return cpp_named_operator2name (type);
3329*38fd1498Szrj 
3330*38fd1498Szrj   return (const char *) token_spellings[type].name;
3331*38fd1498Szrj }
3332*38fd1498Szrj 
3333*38fd1498Szrj /* Writes the spelling of token to FP, without any preceding space.
3334*38fd1498Szrj    Separated from cpp_spell_token for efficiency - to avoid stdio
3335*38fd1498Szrj    double-buffering.  */
3336*38fd1498Szrj void
cpp_output_token(const cpp_token * token,FILE * fp)3337*38fd1498Szrj cpp_output_token (const cpp_token *token, FILE *fp)
3338*38fd1498Szrj {
3339*38fd1498Szrj   switch (TOKEN_SPELL (token))
3340*38fd1498Szrj     {
3341*38fd1498Szrj     case SPELL_OPERATOR:
3342*38fd1498Szrj       {
3343*38fd1498Szrj 	const unsigned char *spelling;
3344*38fd1498Szrj 	int c;
3345*38fd1498Szrj 
3346*38fd1498Szrj 	if (token->flags & DIGRAPH)
3347*38fd1498Szrj 	  spelling = cpp_digraph2name (token->type);
3348*38fd1498Szrj 	else if (token->flags & NAMED_OP)
3349*38fd1498Szrj 	  goto spell_ident;
3350*38fd1498Szrj 	else
3351*38fd1498Szrj 	  spelling = TOKEN_NAME (token);
3352*38fd1498Szrj 
3353*38fd1498Szrj 	c = *spelling;
3354*38fd1498Szrj 	do
3355*38fd1498Szrj 	  putc (c, fp);
3356*38fd1498Szrj 	while ((c = *++spelling) != '\0');
3357*38fd1498Szrj       }
3358*38fd1498Szrj       break;
3359*38fd1498Szrj 
3360*38fd1498Szrj     spell_ident:
3361*38fd1498Szrj     case SPELL_IDENT:
3362*38fd1498Szrj       {
3363*38fd1498Szrj 	size_t i;
3364*38fd1498Szrj 	const unsigned char * name = NODE_NAME (token->val.node.node);
3365*38fd1498Szrj 
3366*38fd1498Szrj 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3367*38fd1498Szrj 	  if (name[i] & ~0x7F)
3368*38fd1498Szrj 	    {
3369*38fd1498Szrj 	      unsigned char buffer[10];
3370*38fd1498Szrj 	      i += utf8_to_ucn (buffer, name + i) - 1;
3371*38fd1498Szrj 	      fwrite (buffer, 1, 10, fp);
3372*38fd1498Szrj 	    }
3373*38fd1498Szrj 	  else
3374*38fd1498Szrj 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
3375*38fd1498Szrj       }
3376*38fd1498Szrj       break;
3377*38fd1498Szrj 
3378*38fd1498Szrj     case SPELL_LITERAL:
3379*38fd1498Szrj       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3380*38fd1498Szrj       break;
3381*38fd1498Szrj 
3382*38fd1498Szrj     case SPELL_NONE:
3383*38fd1498Szrj       /* An error, most probably.  */
3384*38fd1498Szrj       break;
3385*38fd1498Szrj     }
3386*38fd1498Szrj }
3387*38fd1498Szrj 
3388*38fd1498Szrj /* Compare two tokens.  */
3389*38fd1498Szrj int
_cpp_equiv_tokens(const cpp_token * a,const cpp_token * b)3390*38fd1498Szrj _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3391*38fd1498Szrj {
3392*38fd1498Szrj   if (a->type == b->type && a->flags == b->flags)
3393*38fd1498Szrj     switch (TOKEN_SPELL (a))
3394*38fd1498Szrj       {
3395*38fd1498Szrj       default:			/* Keep compiler happy.  */
3396*38fd1498Szrj       case SPELL_OPERATOR:
3397*38fd1498Szrj 	/* token_no is used to track where multiple consecutive ##
3398*38fd1498Szrj 	   tokens were originally located.  */
3399*38fd1498Szrj 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3400*38fd1498Szrj       case SPELL_NONE:
3401*38fd1498Szrj 	return (a->type != CPP_MACRO_ARG
3402*38fd1498Szrj 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3403*38fd1498Szrj 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3404*38fd1498Szrj       case SPELL_IDENT:
3405*38fd1498Szrj 	return (a->val.node.node == b->val.node.node
3406*38fd1498Szrj 		&& a->val.node.spelling == b->val.node.spelling);
3407*38fd1498Szrj       case SPELL_LITERAL:
3408*38fd1498Szrj 	return (a->val.str.len == b->val.str.len
3409*38fd1498Szrj 		&& !memcmp (a->val.str.text, b->val.str.text,
3410*38fd1498Szrj 			    a->val.str.len));
3411*38fd1498Szrj       }
3412*38fd1498Szrj 
3413*38fd1498Szrj   return 0;
3414*38fd1498Szrj }
3415*38fd1498Szrj 
3416*38fd1498Szrj /* Returns nonzero if a space should be inserted to avoid an
3417*38fd1498Szrj    accidental token paste for output.  For simplicity, it is
3418*38fd1498Szrj    conservative, and occasionally advises a space where one is not
3419*38fd1498Szrj    needed, e.g. "." and ".2".  */
3420*38fd1498Szrj int
cpp_avoid_paste(cpp_reader * pfile,const cpp_token * token1,const cpp_token * token2)3421*38fd1498Szrj cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3422*38fd1498Szrj 		 const cpp_token *token2)
3423*38fd1498Szrj {
3424*38fd1498Szrj   enum cpp_ttype a = token1->type, b = token2->type;
3425*38fd1498Szrj   cppchar_t c;
3426*38fd1498Szrj 
3427*38fd1498Szrj   if (token1->flags & NAMED_OP)
3428*38fd1498Szrj     a = CPP_NAME;
3429*38fd1498Szrj   if (token2->flags & NAMED_OP)
3430*38fd1498Szrj     b = CPP_NAME;
3431*38fd1498Szrj 
3432*38fd1498Szrj   c = EOF;
3433*38fd1498Szrj   if (token2->flags & DIGRAPH)
3434*38fd1498Szrj     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3435*38fd1498Szrj   else if (token_spellings[b].category == SPELL_OPERATOR)
3436*38fd1498Szrj     c = token_spellings[b].name[0];
3437*38fd1498Szrj 
3438*38fd1498Szrj   /* Quickly get everything that can paste with an '='.  */
3439*38fd1498Szrj   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3440*38fd1498Szrj     return 1;
3441*38fd1498Szrj 
3442*38fd1498Szrj   switch (a)
3443*38fd1498Szrj     {
3444*38fd1498Szrj     case CPP_GREATER:	return c == '>';
3445*38fd1498Szrj     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
3446*38fd1498Szrj     case CPP_PLUS:	return c == '+';
3447*38fd1498Szrj     case CPP_MINUS:	return c == '-' || c == '>';
3448*38fd1498Szrj     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
3449*38fd1498Szrj     case CPP_MOD:	return c == ':' || c == '>';
3450*38fd1498Szrj     case CPP_AND:	return c == '&';
3451*38fd1498Szrj     case CPP_OR:	return c == '|';
3452*38fd1498Szrj     case CPP_COLON:	return c == ':' || c == '>';
3453*38fd1498Szrj     case CPP_DEREF:	return c == '*';
3454*38fd1498Szrj     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
3455*38fd1498Szrj     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
3456*38fd1498Szrj     case CPP_NAME:	return ((b == CPP_NUMBER
3457*38fd1498Szrj 				 && name_p (pfile, &token2->val.str))
3458*38fd1498Szrj 				|| b == CPP_NAME
3459*38fd1498Szrj 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
3460*38fd1498Szrj     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
3461*38fd1498Szrj 				|| c == '.' || c == '+' || c == '-');
3462*38fd1498Szrj 				      /* UCNs */
3463*38fd1498Szrj     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
3464*38fd1498Szrj 				 && b == CPP_NAME)
3465*38fd1498Szrj 				|| (CPP_OPTION (pfile, objc)
3466*38fd1498Szrj 				    && token1->val.str.text[0] == '@'
3467*38fd1498Szrj 				    && (b == CPP_NAME || b == CPP_STRING)));
3468*38fd1498Szrj     case CPP_STRING:
3469*38fd1498Szrj     case CPP_WSTRING:
3470*38fd1498Szrj     case CPP_UTF8STRING:
3471*38fd1498Szrj     case CPP_STRING16:
3472*38fd1498Szrj     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
3473*38fd1498Szrj 				&& (b == CPP_NAME
3474*38fd1498Szrj 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
3475*38fd1498Szrj 					&& ISIDST (token2->val.str.text[0]))));
3476*38fd1498Szrj 
3477*38fd1498Szrj     default:		break;
3478*38fd1498Szrj     }
3479*38fd1498Szrj 
3480*38fd1498Szrj   return 0;
3481*38fd1498Szrj }
3482*38fd1498Szrj 
3483*38fd1498Szrj /* Output all the remaining tokens on the current line, and a newline
3484*38fd1498Szrj    character, to FP.  Leading whitespace is removed.  If there are
3485*38fd1498Szrj    macros, special token padding is not performed.  */
3486*38fd1498Szrj void
cpp_output_line(cpp_reader * pfile,FILE * fp)3487*38fd1498Szrj cpp_output_line (cpp_reader *pfile, FILE *fp)
3488*38fd1498Szrj {
3489*38fd1498Szrj   const cpp_token *token;
3490*38fd1498Szrj 
3491*38fd1498Szrj   token = cpp_get_token (pfile);
3492*38fd1498Szrj   while (token->type != CPP_EOF)
3493*38fd1498Szrj     {
3494*38fd1498Szrj       cpp_output_token (token, fp);
3495*38fd1498Szrj       token = cpp_get_token (pfile);
3496*38fd1498Szrj       if (token->flags & PREV_WHITE)
3497*38fd1498Szrj 	putc (' ', fp);
3498*38fd1498Szrj     }
3499*38fd1498Szrj 
3500*38fd1498Szrj   putc ('\n', fp);
3501*38fd1498Szrj }
3502*38fd1498Szrj 
3503*38fd1498Szrj /* Return a string representation of all the remaining tokens on the
3504*38fd1498Szrj    current line.  The result is allocated using xmalloc and must be
3505*38fd1498Szrj    freed by the caller.  */
3506*38fd1498Szrj unsigned char *
cpp_output_line_to_string(cpp_reader * pfile,const unsigned char * dir_name)3507*38fd1498Szrj cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3508*38fd1498Szrj {
3509*38fd1498Szrj   const cpp_token *token;
3510*38fd1498Szrj   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3511*38fd1498Szrj   unsigned int alloced = 120 + out;
3512*38fd1498Szrj   unsigned char *result = (unsigned char *) xmalloc (alloced);
3513*38fd1498Szrj 
3514*38fd1498Szrj   /* If DIR_NAME is empty, there are no initial contents.  */
3515*38fd1498Szrj   if (dir_name)
3516*38fd1498Szrj     {
3517*38fd1498Szrj       sprintf ((char *) result, "#%s ", dir_name);
3518*38fd1498Szrj       out += 2;
3519*38fd1498Szrj     }
3520*38fd1498Szrj 
3521*38fd1498Szrj   token = cpp_get_token (pfile);
3522*38fd1498Szrj   while (token->type != CPP_EOF)
3523*38fd1498Szrj     {
3524*38fd1498Szrj       unsigned char *last;
3525*38fd1498Szrj       /* Include room for a possible space and the terminating nul.  */
3526*38fd1498Szrj       unsigned int len = cpp_token_len (token) + 2;
3527*38fd1498Szrj 
3528*38fd1498Szrj       if (out + len > alloced)
3529*38fd1498Szrj 	{
3530*38fd1498Szrj 	  alloced *= 2;
3531*38fd1498Szrj 	  if (out + len > alloced)
3532*38fd1498Szrj 	    alloced = out + len;
3533*38fd1498Szrj 	  result = (unsigned char *) xrealloc (result, alloced);
3534*38fd1498Szrj 	}
3535*38fd1498Szrj 
3536*38fd1498Szrj       last = cpp_spell_token (pfile, token, &result[out], 0);
3537*38fd1498Szrj       out = last - result;
3538*38fd1498Szrj 
3539*38fd1498Szrj       token = cpp_get_token (pfile);
3540*38fd1498Szrj       if (token->flags & PREV_WHITE)
3541*38fd1498Szrj 	result[out++] = ' ';
3542*38fd1498Szrj     }
3543*38fd1498Szrj 
3544*38fd1498Szrj   result[out] = '\0';
3545*38fd1498Szrj   return result;
3546*38fd1498Szrj }
3547*38fd1498Szrj 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that
   arbitrary expressions may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3562*38fd1498Szrj 
3563*38fd1498Szrj /* Create a new allocation buffer.  Place the control block at the end
3564*38fd1498Szrj    of the buffer, so that buffer overflows will cause immediate chaos.  */
3565*38fd1498Szrj static _cpp_buff *
new_buff(size_t len)3566*38fd1498Szrj new_buff (size_t len)
3567*38fd1498Szrj {
3568*38fd1498Szrj   _cpp_buff *result;
3569*38fd1498Szrj   unsigned char *base;
3570*38fd1498Szrj 
3571*38fd1498Szrj   if (len < MIN_BUFF_SIZE)
3572*38fd1498Szrj     len = MIN_BUFF_SIZE;
3573*38fd1498Szrj   len = CPP_ALIGN (len);
3574*38fd1498Szrj 
3575*38fd1498Szrj #ifdef ENABLE_VALGRIND_ANNOTATIONS
3576*38fd1498Szrj   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3577*38fd1498Szrj      struct first.  */
3578*38fd1498Szrj   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3579*38fd1498Szrj   base = XNEWVEC (unsigned char, len + slen);
3580*38fd1498Szrj   result = (_cpp_buff *) base;
3581*38fd1498Szrj   base += slen;
3582*38fd1498Szrj #else
3583*38fd1498Szrj   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3584*38fd1498Szrj   result = (_cpp_buff *) (base + len);
3585*38fd1498Szrj #endif
3586*38fd1498Szrj   result->base = base;
3587*38fd1498Szrj   result->cur = base;
3588*38fd1498Szrj   result->limit = base + len;
3589*38fd1498Szrj   result->next = NULL;
3590*38fd1498Szrj   return result;
3591*38fd1498Szrj }
3592*38fd1498Szrj 
3593*38fd1498Szrj /* Place a chain of unwanted allocation buffers on the free list.  */
3594*38fd1498Szrj void
_cpp_release_buff(cpp_reader * pfile,_cpp_buff * buff)3595*38fd1498Szrj _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3596*38fd1498Szrj {
3597*38fd1498Szrj   _cpp_buff *end = buff;
3598*38fd1498Szrj 
3599*38fd1498Szrj   while (end->next)
3600*38fd1498Szrj     end = end->next;
3601*38fd1498Szrj   end->next = pfile->free_buffs;
3602*38fd1498Szrj   pfile->free_buffs = buff;
3603*38fd1498Szrj }
3604*38fd1498Szrj 
3605*38fd1498Szrj /* Return a free buffer of size at least MIN_SIZE.  */
3606*38fd1498Szrj _cpp_buff *
_cpp_get_buff(cpp_reader * pfile,size_t min_size)3607*38fd1498Szrj _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3608*38fd1498Szrj {
3609*38fd1498Szrj   _cpp_buff *result, **p;
3610*38fd1498Szrj 
3611*38fd1498Szrj   for (p = &pfile->free_buffs;; p = &(*p)->next)
3612*38fd1498Szrj     {
3613*38fd1498Szrj       size_t size;
3614*38fd1498Szrj 
3615*38fd1498Szrj       if (*p == NULL)
3616*38fd1498Szrj 	return new_buff (min_size);
3617*38fd1498Szrj       result = *p;
3618*38fd1498Szrj       size = result->limit - result->base;
3619*38fd1498Szrj       /* Return a buffer that's big enough, but don't waste one that's
3620*38fd1498Szrj          way too big.  */
3621*38fd1498Szrj       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3622*38fd1498Szrj 	break;
3623*38fd1498Szrj     }
3624*38fd1498Szrj 
3625*38fd1498Szrj   *p = result->next;
3626*38fd1498Szrj   result->next = NULL;
3627*38fd1498Szrj   result->cur = result->base;
3628*38fd1498Szrj   return result;
3629*38fd1498Szrj }
3630*38fd1498Szrj 
3631*38fd1498Szrj /* Creates a new buffer with enough space to hold the uncommitted
3632*38fd1498Szrj    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3633*38fd1498Szrj    the excess bytes to the new buffer.  Chains the new buffer after
3634*38fd1498Szrj    BUFF, and returns the new buffer.  */
3635*38fd1498Szrj _cpp_buff *
_cpp_append_extend_buff(cpp_reader * pfile,_cpp_buff * buff,size_t min_extra)3636*38fd1498Szrj _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3637*38fd1498Szrj {
3638*38fd1498Szrj   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3639*38fd1498Szrj   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3640*38fd1498Szrj 
3641*38fd1498Szrj   buff->next = new_buff;
3642*38fd1498Szrj   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3643*38fd1498Szrj   return new_buff;
3644*38fd1498Szrj }
3645*38fd1498Szrj 
3646*38fd1498Szrj /* Creates a new buffer with enough space to hold the uncommitted
3647*38fd1498Szrj    remaining bytes of the buffer pointed to by BUFF, and at least
3648*38fd1498Szrj    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3649*38fd1498Szrj    Chains the new buffer before the buffer pointed to by BUFF, and
3650*38fd1498Szrj    updates the pointer to point to the new buffer.  */
3651*38fd1498Szrj void
_cpp_extend_buff(cpp_reader * pfile,_cpp_buff ** pbuff,size_t min_extra)3652*38fd1498Szrj _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3653*38fd1498Szrj {
3654*38fd1498Szrj   _cpp_buff *new_buff, *old_buff = *pbuff;
3655*38fd1498Szrj   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3656*38fd1498Szrj 
3657*38fd1498Szrj   new_buff = _cpp_get_buff (pfile, size);
3658*38fd1498Szrj   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3659*38fd1498Szrj   new_buff->next = old_buff;
3660*38fd1498Szrj   *pbuff = new_buff;
3661*38fd1498Szrj }
3662*38fd1498Szrj 
3663*38fd1498Szrj /* Free a chain of buffers starting at BUFF.  */
3664*38fd1498Szrj void
_cpp_free_buff(_cpp_buff * buff)3665*38fd1498Szrj _cpp_free_buff (_cpp_buff *buff)
3666*38fd1498Szrj {
3667*38fd1498Szrj   _cpp_buff *next;
3668*38fd1498Szrj 
3669*38fd1498Szrj   for (; buff; buff = next)
3670*38fd1498Szrj     {
3671*38fd1498Szrj       next = buff->next;
3672*38fd1498Szrj #ifdef ENABLE_VALGRIND_ANNOTATIONS
3673*38fd1498Szrj       free (buff);
3674*38fd1498Szrj #else
3675*38fd1498Szrj       free (buff->base);
3676*38fd1498Szrj #endif
3677*38fd1498Szrj     }
3678*38fd1498Szrj }
3679*38fd1498Szrj 
3680*38fd1498Szrj /* Allocate permanent, unaligned storage of length LEN.  */
3681*38fd1498Szrj unsigned char *
_cpp_unaligned_alloc(cpp_reader * pfile,size_t len)3682*38fd1498Szrj _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3683*38fd1498Szrj {
3684*38fd1498Szrj   _cpp_buff *buff = pfile->u_buff;
3685*38fd1498Szrj   unsigned char *result = buff->cur;
3686*38fd1498Szrj 
3687*38fd1498Szrj   if (len > (size_t) (buff->limit - result))
3688*38fd1498Szrj     {
3689*38fd1498Szrj       buff = _cpp_get_buff (pfile, len);
3690*38fd1498Szrj       buff->next = pfile->u_buff;
3691*38fd1498Szrj       pfile->u_buff = buff;
3692*38fd1498Szrj       result = buff->cur;
3693*38fd1498Szrj     }
3694*38fd1498Szrj 
3695*38fd1498Szrj   buff->cur = result + len;
3696*38fd1498Szrj   return result;
3697*38fd1498Szrj }
3698*38fd1498Szrj 
3699*38fd1498Szrj /* Allocate permanent, unaligned storage of length LEN from a_buff.
3700*38fd1498Szrj    That buffer is used for growing allocations when saving macro
3701*38fd1498Szrj    replacement lists in a #define, and when parsing an answer to an
3702*38fd1498Szrj    assertion in #assert, #unassert or #if (and therefore possibly
3703*38fd1498Szrj    whilst expanding macros).  It therefore must not be used by any
3704*38fd1498Szrj    code that they might call: specifically the lexer and the guts of
3705*38fd1498Szrj    the macro expander.
3706*38fd1498Szrj 
3707*38fd1498Szrj    All existing other uses clearly fit this restriction: storing
3708*38fd1498Szrj    registered pragmas during initialization.  */
3709*38fd1498Szrj unsigned char *
_cpp_aligned_alloc(cpp_reader * pfile,size_t len)3710*38fd1498Szrj _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3711*38fd1498Szrj {
3712*38fd1498Szrj   _cpp_buff *buff = pfile->a_buff;
3713*38fd1498Szrj   unsigned char *result = buff->cur;
3714*38fd1498Szrj 
3715*38fd1498Szrj   if (len > (size_t) (buff->limit - result))
3716*38fd1498Szrj     {
3717*38fd1498Szrj       buff = _cpp_get_buff (pfile, len);
3718*38fd1498Szrj       buff->next = pfile->a_buff;
3719*38fd1498Szrj       pfile->a_buff = buff;
3720*38fd1498Szrj       result = buff->cur;
3721*38fd1498Szrj     }
3722*38fd1498Szrj 
3723*38fd1498Szrj   buff->cur = result + len;
3724*38fd1498Szrj   return result;
3725*38fd1498Szrj }
3726*38fd1498Szrj 
3727*38fd1498Szrj /* Say which field of TOK is in use.  */
3728*38fd1498Szrj 
3729*38fd1498Szrj enum cpp_token_fld_kind
cpp_token_val_index(const cpp_token * tok)3730*38fd1498Szrj cpp_token_val_index (const cpp_token *tok)
3731*38fd1498Szrj {
3732*38fd1498Szrj   switch (TOKEN_SPELL (tok))
3733*38fd1498Szrj     {
3734*38fd1498Szrj     case SPELL_IDENT:
3735*38fd1498Szrj       return CPP_TOKEN_FLD_NODE;
3736*38fd1498Szrj     case SPELL_LITERAL:
3737*38fd1498Szrj       return CPP_TOKEN_FLD_STR;
3738*38fd1498Szrj     case SPELL_OPERATOR:
3739*38fd1498Szrj       if (tok->type == CPP_PASTE)
3740*38fd1498Szrj 	return CPP_TOKEN_FLD_TOKEN_NO;
3741*38fd1498Szrj       else
3742*38fd1498Szrj 	return CPP_TOKEN_FLD_NONE;
3743*38fd1498Szrj     case SPELL_NONE:
3744*38fd1498Szrj       if (tok->type == CPP_MACRO_ARG)
3745*38fd1498Szrj 	return CPP_TOKEN_FLD_ARG_NO;
3746*38fd1498Szrj       else if (tok->type == CPP_PADDING)
3747*38fd1498Szrj 	return CPP_TOKEN_FLD_SOURCE;
3748*38fd1498Szrj       else if (tok->type == CPP_PRAGMA)
3749*38fd1498Szrj 	return CPP_TOKEN_FLD_PRAGMA;
3750*38fd1498Szrj       /* fall through */
3751*38fd1498Szrj     default:
3752*38fd1498Szrj       return CPP_TOKEN_FLD_NONE;
3753*38fd1498Szrj     }
3754*38fd1498Szrj }
3755*38fd1498Szrj 
3756*38fd1498Szrj /* All tokens lexed in R after calling this function will be forced to have
3757*38fd1498Szrj    their source_location the same as the location referenced by P, until
3758*38fd1498Szrj    cpp_stop_forcing_token_locations is called for R.  */
3759*38fd1498Szrj 
3760*38fd1498Szrj void
cpp_force_token_locations(cpp_reader * r,source_location * p)3761*38fd1498Szrj cpp_force_token_locations (cpp_reader *r, source_location *p)
3762*38fd1498Szrj {
3763*38fd1498Szrj   r->forced_token_location_p = p;
3764*38fd1498Szrj }
3765*38fd1498Szrj 
3766*38fd1498Szrj /* Go back to assigning locations naturally for lexed tokens.  */
3767*38fd1498Szrj 
3768*38fd1498Szrj void
cpp_stop_forcing_token_locations(cpp_reader * r)3769*38fd1498Szrj cpp_stop_forcing_token_locations (cpp_reader *r)
3770*38fd1498Szrj {
3771*38fd1498Szrj   r->forced_token_location_p = NULL;
3772*38fd1498Szrj }
3773