1 /* Data and functions related to line maps and input files.
2    Copyright (C) 2004-2021 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "diagnostic-core.h"
26 #include "selftest.h"
27 #include "cpplib.h"
28 
29 #ifndef HAVE_ICONV
30 #define HAVE_ICONV 0
31 #endif
32 
/* A cache holding the content of one source file.  It is used by
   get_next_line and read_line_num to hand out individual lines of
   that file without re-reading it from disk each time.  */
class fcache
{
public:
  /* The boundaries of one line inside the cached file data.  */
  class line_info
  {
  public:
    /* The line number.  It starts from 1.  */
    size_t line_num;

    /* The position (byte count) of the beginning of the line,
       relative to the file data pointer.  This starts at zero.  */
    size_t start_pos;

    /* The position (byte count) of the end of the line.  This
       normally points to the '\n' character, or to one byte after the
       last byte of the file, if the file doesn't contain a '\n'
       character.  */
    size_t end_pos;

    /* Build a record for line L spanning bytes [S, E).  */
    line_info (size_t l, size_t s, size_t e)
      : line_num (l), start_pos (s), end_pos (e)
    {}

    /* Build an all-zero record.  Zero is not a valid line number, since
       line numbers start from 1.  */
    line_info ()
      :line_num (0), start_pos (0), end_pos (0)
    {}
  };

  /* The number of times this file has been accessed.  This is used
     to designate which file cache to evict from the cache
     array.  */
  unsigned use_count;

  /* The file_path is the key for identifying a particular file in
     the cache.  A NULL file_path marks an unused slot.
     For libcpp-using code, the underlying buffer for this field is
     owned by the corresponding _cpp_file within the cpp_reader.  */
  const char *file_path;

  /* The stream the file content is read from, or NULL if no file is
     currently open for this slot.  */
  FILE *fp;

  /* This points to the content of the file that we've read so
     far.  */
  char *data;

  /* The size (capacity, in bytes) of the DATA array above.  */
  size_t size;

  /* The number of bytes read from the underlying file so far.  This
     must be less than or equal to SIZE above.  */
  size_t nb_read;

  /* The index, within DATA, of the beginning of the current line.  */
  size_t line_start_idx;

  /* The number of the previous line read.  This starts at 1.  Zero
     means we've read no line so far.  */
  size_t line_num;

  /* This is the total number of lines of the current file.  At the
     moment, we try to get this information from the line map
     subsystem.  Note that this is just a hint.  When using the C++
     front-end, this hint is correct because the input file is then
     completely tokenized before parsing starts; so the line map knows
     the number of lines before compilation really starts.  For e.g,
     the C front-end, it can happen that we start emitting diagnostics
     before the line map has seen the end of the file.  */
  size_t total_lines;

  /* Could this file be missing a trailing newline on its final line?
     Initially true (to cope with empty files), set to true/false
     as each line is read.  */
  bool missing_trailing_newline;

  /* This is a record of the beginning and end of the lines we've seen
     while reading the file.  This is useful to avoid walking the data
     from the beginning when we are asked to read a line that is
     before LINE_START_IDX above.  Note that the maximum size of this
     record is fcache_line_record_size, so that the memory consumption
     doesn't explode.  We thus scale total_lines down to
     fcache_line_record_size.  */
  vec<line_info, va_heap> line_record;

  fcache ();
  ~fcache ();
};
122 
/* Current position in real source file.  */

location_t input_location = UNKNOWN_LOCATION;

/* The set of line maps that the functions in this file resolve and
   expand locations against.  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;

/* The table of per-file caches.  It is allocated on first use by
   diagnostic_file_cache_init and released by
   diagnostic_file_cache_fini.  */
static fcache *fcache_tab;
/* The number of slots in fcache_tab.  */
static const size_t fcache_tab_size = 16;
/* The initial size, in bytes, of an fcache's data buffer; maybe_grow
   doubles the buffer from there.  */
static const size_t fcache_buffer_size = 4 * 1024;
/* The maximum number of entries recorded in an fcache's
   line_record.  */
static const size_t fcache_line_record_size = 100;
140 
141 /* Expand the source location LOC into a human readable location.  If
142    LOC resolves to a builtin location, the file name of the readable
143    location is set to the string "<built-in>". If EXPANSION_POINT_P is
144    TRUE and LOC is virtual, then it is resolved to the expansion
145    point of the involved macro.  Otherwise, it is resolved to the
146    spelling location of the token.
147 
148    When resolving to the spelling location of the token, if the
149    resulting location is for a built-in location (that is, it has no
150    associated line/column) in the context of a macro expansion, the
151    returned location is the first one (while unwinding the macro
152    location towards its expansion point) that is in real source
153    code.
154 
155    ASPECT controls which part of the location to use.  */
156 
157 static expanded_location
expand_location_1(location_t loc,bool expansion_point_p,enum location_aspect aspect)158 expand_location_1 (location_t loc,
159 		   bool expansion_point_p,
160 		   enum location_aspect aspect)
161 {
162   expanded_location xloc;
163   const line_map_ordinary *map;
164   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
165   tree block = NULL;
166 
167   if (IS_ADHOC_LOC (loc))
168     {
169       block = LOCATION_BLOCK (loc);
170       loc = LOCATION_LOCUS (loc);
171     }
172 
173   memset (&xloc, 0, sizeof (xloc));
174 
175   if (loc >= RESERVED_LOCATION_COUNT)
176     {
177       if (!expansion_point_p)
178 	{
179 	  /* We want to resolve LOC to its spelling location.
180 
181 	     But if that spelling location is a reserved location that
182 	     appears in the context of a macro expansion (like for a
183 	     location for a built-in token), let's consider the first
184 	     location (toward the expansion point) that is not reserved;
185 	     that is, the first location that is in real source code.  */
186 	  loc = linemap_unwind_to_first_non_reserved_loc (line_table,
187 							  loc, NULL);
188 	  lrk = LRK_SPELLING_LOCATION;
189 	}
190       loc = linemap_resolve_location (line_table, loc, lrk, &map);
191 
192       /* loc is now either in an ordinary map, or is a reserved location.
193 	 If it is a compound location, the caret is in a spelling location,
194 	 but the start/finish might still be a virtual location.
195 	 Depending of what the caller asked for, we may need to recurse
196 	 one level in order to resolve any virtual locations in the
197 	 end-points.  */
198       switch (aspect)
199 	{
200 	default:
201 	  gcc_unreachable ();
202 	  /* Fall through.  */
203 	case LOCATION_ASPECT_CARET:
204 	  break;
205 	case LOCATION_ASPECT_START:
206 	  {
207 	    location_t start = get_start (loc);
208 	    if (start != loc)
209 	      return expand_location_1 (start, expansion_point_p, aspect);
210 	  }
211 	  break;
212 	case LOCATION_ASPECT_FINISH:
213 	  {
214 	    location_t finish = get_finish (loc);
215 	    if (finish != loc)
216 	      return expand_location_1 (finish, expansion_point_p, aspect);
217 	  }
218 	  break;
219 	}
220       xloc = linemap_expand_location (line_table, map, loc);
221     }
222 
223   xloc.data = block;
224   if (loc <= BUILTINS_LOCATION)
225     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
226 
227   return xloc;
228 }
229 
230 /* Initialize the set of cache used for files accessed by caret
231    diagnostic.  */
232 
233 static void
diagnostic_file_cache_init(void)234 diagnostic_file_cache_init (void)
235 {
236   if (fcache_tab == NULL)
237     fcache_tab = new fcache[fcache_tab_size];
238 }
239 
240 /* Free the resources used by the set of cache used for files accessed
241    by caret diagnostic.  */
242 
243 void
diagnostic_file_cache_fini(void)244 diagnostic_file_cache_fini (void)
245 {
246   if (fcache_tab)
247     {
248       delete [] (fcache_tab);
249       fcache_tab = NULL;
250     }
251 }
252 
253 /* Return the total lines number that have been read so far by the
254    line map (in the preprocessor) so far.  For languages like C++ that
255    entirely preprocess the input file before starting to parse, this
256    equals the actual number of lines of the file.  */
257 
258 static size_t
total_lines_num(const char * file_path)259 total_lines_num (const char *file_path)
260 {
261   size_t r = 0;
262   location_t l = 0;
263   if (linemap_get_file_highest_location (line_table, file_path, &l))
264     {
265       gcc_assert (l >= RESERVED_LOCATION_COUNT);
266       expanded_location xloc = expand_location (l);
267       r = xloc.line;
268     }
269   return r;
270 }
271 
272 /* Lookup the cache used for the content of a given file accessed by
273    caret diagnostic.  Return the found cached file, or NULL if no
274    cached file was found.  */
275 
276 static fcache*
lookup_file_in_cache_tab(const char * file_path)277 lookup_file_in_cache_tab (const char *file_path)
278 {
279   if (file_path == NULL)
280     return NULL;
281 
282   diagnostic_file_cache_init ();
283 
284   /* This will contain the found cached file.  */
285   fcache *r = NULL;
286   for (unsigned i = 0; i < fcache_tab_size; ++i)
287     {
288       fcache *c = &fcache_tab[i];
289       if (c->file_path && !strcmp (c->file_path, file_path))
290 	{
291 	  ++c->use_count;
292 	  r = c;
293 	}
294     }
295 
296   if (r)
297     ++r->use_count;
298 
299   return r;
300 }
301 
302 /* Purge any mention of FILENAME from the cache of files used for
303    printing source code.  For use in selftests when working
304    with tempfiles.  */
305 
306 void
diagnostics_file_cache_forcibly_evict_file(const char * file_path)307 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
308 {
309   gcc_assert (file_path);
310 
311   fcache *r = lookup_file_in_cache_tab (file_path);
312   if (!r)
313     /* Not found.  */
314     return;
315 
316   r->file_path = NULL;
317   if (r->fp)
318     fclose (r->fp);
319   r->fp = NULL;
320   r->nb_read = 0;
321   r->line_start_idx = 0;
322   r->line_num = 0;
323   r->line_record.truncate (0);
324   r->use_count = 0;
325   r->total_lines = 0;
326   r->missing_trailing_newline = true;
327 }
328 
329 /* Return the file cache that has been less used, recently, or the
330    first empty one.  If HIGHEST_USE_COUNT is non-null,
331    *HIGHEST_USE_COUNT is set to the highest use count of the entries
332    in the cache table.  */
333 
334 static fcache*
evicted_cache_tab_entry(unsigned * highest_use_count)335 evicted_cache_tab_entry (unsigned *highest_use_count)
336 {
337   diagnostic_file_cache_init ();
338 
339   fcache *to_evict = &fcache_tab[0];
340   unsigned huc = to_evict->use_count;
341   for (unsigned i = 1; i < fcache_tab_size; ++i)
342     {
343       fcache *c = &fcache_tab[i];
344       bool c_is_empty = (c->file_path == NULL);
345 
346       if (c->use_count < to_evict->use_count
347 	  || (to_evict->file_path && c_is_empty))
348 	/* We evict C because it's either an entry with a lower use
349 	   count or one that is empty.  */
350 	to_evict = c;
351 
352       if (huc < c->use_count)
353 	huc = c->use_count;
354 
355       if (c_is_empty)
356 	/* We've reached the end of the cache; subsequent elements are
357 	   all empty.  */
358 	break;
359     }
360 
361   if (highest_use_count)
362     *highest_use_count = huc;
363 
364   return to_evict;
365 }
366 
367 /* Create the cache used for the content of a given file to be
368    accessed by caret diagnostic.  This cache is added to an array of
369    cache and can be retrieved by lookup_file_in_cache_tab.  This
370    function returns the created cache.  Note that only the last
371    fcache_tab_size files are cached.  */
372 
373 static fcache*
add_file_to_cache_tab(const char * file_path)374 add_file_to_cache_tab (const char *file_path)
375 {
376 
377   FILE *fp = fopen (file_path, "r");
378   if (fp == NULL)
379     return NULL;
380 
381   unsigned highest_use_count = 0;
382   fcache *r = evicted_cache_tab_entry (&highest_use_count);
383   r->file_path = file_path;
384   if (r->fp)
385     fclose (r->fp);
386   r->fp = fp;
387   r->nb_read = 0;
388   r->line_start_idx = 0;
389   r->line_num = 0;
390   r->line_record.truncate (0);
391   /* Ensure that this cache entry doesn't get evicted next time
392      add_file_to_cache_tab is called.  */
393   r->use_count = ++highest_use_count;
394   r->total_lines = total_lines_num (file_path);
395   r->missing_trailing_newline = true;
396 
397   return r;
398 }
399 
400 /* Lookup the cache used for the content of a given file accessed by
401    caret diagnostic.  If no cached file was found, create a new cache
402    for this file, add it to the array of cached file and return
403    it.  */
404 
405 static fcache*
lookup_or_add_file_to_cache_tab(const char * file_path)406 lookup_or_add_file_to_cache_tab (const char *file_path)
407 {
408   fcache *r = lookup_file_in_cache_tab (file_path);
409   if (r == NULL)
410     r = add_file_to_cache_tab (file_path);
411   return r;
412 }
413 
414 /* Default constructor for a cache of file used by caret
415    diagnostic.  */
416 
fcache()417 fcache::fcache ()
418 : use_count (0), file_path (NULL), fp (NULL), data (0),
419   size (0), nb_read (0), line_start_idx (0), line_num (0),
420   total_lines (0), missing_trailing_newline (true)
421 {
422   line_record.create (0);
423 }
424 
425 /* Destructor for a cache of file used by caret diagnostic.  */
426 
~fcache()427 fcache::~fcache ()
428 {
429   if (fp)
430     {
431       fclose (fp);
432       fp = NULL;
433     }
434   if (data)
435     {
436       XDELETEVEC (data);
437       data = 0;
438     }
439   line_record.release ();
440 }
441 
442 /* Returns TRUE iff the cache would need to be filled with data coming
443    from the file.  That is, either the cache is empty or full or the
444    current line is empty.  Note that if the cache is full, it would
445    need to be extended and filled again.  */
446 
447 static bool
needs_read(fcache * c)448 needs_read (fcache *c)
449 {
450   return (c->nb_read == 0
451 	  || c->nb_read == c->size
452 	  || (c->line_start_idx >= c->nb_read - 1));
453 }
454 
455 /*  Return TRUE iff the cache is full and thus needs to be
456     extended.  */
457 
458 static bool
needs_grow(fcache * c)459 needs_grow (fcache *c)
460 {
461   return c->nb_read == c->size;
462 }
463 
464 /* Grow the cache if it needs to be extended.  */
465 
466 static void
maybe_grow(fcache * c)467 maybe_grow (fcache *c)
468 {
469   if (!needs_grow (c))
470     return;
471 
472   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
473   c->data = XRESIZEVEC (char, c->data, size);
474   c->size = size;
475 }
476 
477 /*  Read more data into the cache.  Extends the cache if need be.
478     Returns TRUE iff new data could be read.  */
479 
480 static bool
read_data(fcache * c)481 read_data (fcache *c)
482 {
483   if (feof (c->fp) || ferror (c->fp))
484     return false;
485 
486   maybe_grow (c);
487 
488   char * from = c->data + c->nb_read;
489   size_t to_read = c->size - c->nb_read;
490   size_t nb_read = fread (from, 1, to_read, c->fp);
491 
492   if (ferror (c->fp))
493     return false;
494 
495   c->nb_read += nb_read;
496   return !!nb_read;
497 }
498 
499 /* Read new data iff the cache needs to be filled with more data
500    coming from the file FP.  Return TRUE iff the cache was filled with
501    mode data.  */
502 
503 static bool
maybe_read_data(fcache * c)504 maybe_read_data (fcache *c)
505 {
506   if (!needs_read (c))
507     return false;
508   return read_data (c);
509 }
510 
511 /* Read a new line from file FP, using C as a cache for the data
512    coming from the file.  Upon successful completion, *LINE is set to
513    the beginning of the line found.  *LINE points directly in the
514    line cache and is only valid until the next call of get_next_line.
515    *LINE_LEN is set to the length of the line.  Note that the line
516    does not contain any terminal delimiter.  This function returns
517    true if some data was read or process from the cache, false
518    otherwise.  Note that subsequent calls to get_next_line might
519    make the content of *LINE invalid.  */
520 
521 static bool
get_next_line(fcache * c,char ** line,ssize_t * line_len)522 get_next_line (fcache *c, char **line, ssize_t *line_len)
523 {
524   /* Fill the cache with data to process.  */
525   maybe_read_data (c);
526 
527   size_t remaining_size = c->nb_read - c->line_start_idx;
528   if (remaining_size == 0)
529     /* There is no more data to process.  */
530     return false;
531 
532   char *line_start = c->data + c->line_start_idx;
533 
534   char *next_line_start = NULL;
535   size_t len = 0;
536   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
537   if (line_end == NULL)
538     {
539       /* We haven't found the end-of-line delimiter in the cache.
540 	 Fill the cache with more data from the file and look for the
541 	 '\n'.  */
542       while (maybe_read_data (c))
543 	{
544 	  line_start = c->data + c->line_start_idx;
545 	  remaining_size = c->nb_read - c->line_start_idx;
546 	  line_end = (char *) memchr (line_start, '\n', remaining_size);
547 	  if (line_end != NULL)
548 	    {
549 	      next_line_start = line_end + 1;
550 	      break;
551 	    }
552 	}
553       if (line_end == NULL)
554 	{
555 	  /* We've loadded all the file into the cache and still no
556 	     '\n'.  Let's say the line ends up at one byte passed the
557 	     end of the file.  This is to stay consistent with the case
558 	     of when the line ends up with a '\n' and line_end points to
559 	     that terminal '\n'.  That consistency is useful below in
560 	     the len calculation.  */
561 	  line_end = c->data + c->nb_read ;
562 	  c->missing_trailing_newline = true;
563 	}
564       else
565 	c->missing_trailing_newline = false;
566     }
567   else
568     {
569       next_line_start = line_end + 1;
570       c->missing_trailing_newline = false;
571     }
572 
573   if (ferror (c->fp))
574     return false;
575 
576   /* At this point, we've found the end of the of line.  It either
577      points to the '\n' or to one byte after the last byte of the
578      file.  */
579   gcc_assert (line_end != NULL);
580 
581   len = line_end - line_start;
582 
583   if (c->line_start_idx < c->nb_read)
584     *line = line_start;
585 
586   ++c->line_num;
587 
588   /* Before we update our line record, make sure the hint about the
589      total number of lines of the file is correct.  If it's not, then
590      we give up recording line boundaries from now on.  */
591   bool update_line_record = true;
592   if (c->line_num > c->total_lines)
593     update_line_record = false;
594 
595     /* Now update our line record so that re-reading lines from the
596      before c->line_start_idx is faster.  */
597   if (update_line_record
598       && c->line_record.length () < fcache_line_record_size)
599     {
600       /* If the file lines fits in the line record, we just record all
601 	 its lines ...*/
602       if (c->total_lines <= fcache_line_record_size
603 	  && c->line_num > c->line_record.length ())
604 	c->line_record.safe_push (fcache::line_info (c->line_num,
605 						 c->line_start_idx,
606 						 line_end - c->data));
607       else if (c->total_lines > fcache_line_record_size)
608 	{
609 	  /* ... otherwise, we just scale total_lines down to
610 	     (fcache_line_record_size lines.  */
611 	  size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
612 	  if (c->line_record.length () == 0
613 	      || n >= c->line_record.length ())
614 	    c->line_record.safe_push (fcache::line_info (c->line_num,
615 						     c->line_start_idx,
616 						     line_end - c->data));
617 	}
618     }
619 
620   /* Update c->line_start_idx so that it points to the next line to be
621      read.  */
622   if (next_line_start)
623     c->line_start_idx = next_line_start - c->data;
624   else
625     /* We didn't find any terminal '\n'.  Let's consider that the end
626        of line is the end of the data in the cache.  The next
627        invocation of get_next_line will either read more data from the
628        underlying file or return false early because we've reached the
629        end of the file.  */
630     c->line_start_idx = c->nb_read;
631 
632   *line_len = len;
633 
634   return true;
635 }
636 
637 /* Consume the next bytes coming from the cache (or from its
638    underlying file if there are remaining unread bytes in the file)
639    until we reach the next end-of-line (or end-of-file).  There is no
640    copying from the cache involved.  Return TRUE upon successful
641    completion.  */
642 
643 static bool
goto_next_line(fcache * cache)644 goto_next_line (fcache *cache)
645 {
646   char *l;
647   ssize_t len;
648 
649   return get_next_line (cache, &l, &len);
650 }
651 
652 /* Read an arbitrary line number LINE_NUM from the file cached in C.
653    If the line was read successfully, *LINE points to the beginning
654    of the line in the file cache and *LINE_LEN is the length of the
655    line.  *LINE is not nul-terminated, but may contain zero bytes.
656    *LINE is only valid until the next call of read_line_num.
657    This function returns bool if a line was read.  */
658 
659 static bool
read_line_num(fcache * c,size_t line_num,char ** line,ssize_t * line_len)660 read_line_num (fcache *c, size_t line_num,
661 	       char **line, ssize_t *line_len)
662 {
663   gcc_assert (line_num > 0);
664 
665   if (line_num <= c->line_num)
666     {
667       /* We've been asked to read lines that are before c->line_num.
668 	 So lets use our line record (if it's not empty) to try to
669 	 avoid re-reading the file from the beginning again.  */
670 
671       if (c->line_record.is_empty ())
672 	{
673 	  c->line_start_idx = 0;
674 	  c->line_num = 0;
675 	}
676       else
677 	{
678 	  fcache::line_info *i = NULL;
679 	  if (c->total_lines <= fcache_line_record_size)
680 	    {
681 	      /* In languages where the input file is not totally
682 		 preprocessed up front, the c->total_lines hint
683 		 can be smaller than the number of lines of the
684 		 file.  In that case, only the first
685 		 c->total_lines have been recorded.
686 
687 		 Otherwise, the first c->total_lines we've read have
688 		 their start/end recorded here.  */
689 	      i = (line_num <= c->total_lines)
690 		? &c->line_record[line_num - 1]
691 		: &c->line_record[c->total_lines - 1];
692 	      gcc_assert (i->line_num <= line_num);
693 	    }
694 	  else
695 	    {
696 	      /*  So the file had more lines than our line record
697 		  size.  Thus the number of lines we've recorded has
698 		  been scaled down to fcache_line_reacord_size.  Let's
699 		  pick the start/end of the recorded line that is
700 		  closest to line_num.  */
701 	      size_t n = (line_num <= c->total_lines)
702 		? line_num * fcache_line_record_size / c->total_lines
703 		: c ->line_record.length () - 1;
704 	      if (n < c->line_record.length ())
705 		{
706 		  i = &c->line_record[n];
707 		  gcc_assert (i->line_num <= line_num);
708 		}
709 	    }
710 
711 	  if (i && i->line_num == line_num)
712 	    {
713 	      /* We have the start/end of the line.  */
714 	      *line = c->data + i->start_pos;
715 	      *line_len = i->end_pos - i->start_pos;
716 	      return true;
717 	    }
718 
719 	  if (i)
720 	    {
721 	      c->line_start_idx = i->start_pos;
722 	      c->line_num = i->line_num - 1;
723 	    }
724 	  else
725 	    {
726 	      c->line_start_idx = 0;
727 	      c->line_num = 0;
728 	    }
729 	}
730     }
731 
732   /*  Let's walk from line c->line_num up to line_num - 1, without
733       copying any line.  */
734   while (c->line_num < line_num - 1)
735     if (!goto_next_line (c))
736       return false;
737 
738   /* The line we want is the next one.  Let's read and copy it back to
739      the caller.  */
740   return get_next_line (c, line, line_len);
741 }
742 
743 /* Return the physical source line that corresponds to FILE_PATH/LINE.
744    The line is not nul-terminated.  The returned pointer is only
745    valid until the next call of location_get_source_line.
746    Note that the line can contain several null characters,
747    so the returned value's length has the actual length of the line.
748    If the function fails, a NULL char_span is returned.  */
749 
750 char_span
location_get_source_line(const char * file_path,int line)751 location_get_source_line (const char *file_path, int line)
752 {
753   char *buffer = NULL;
754   ssize_t len;
755 
756   if (line == 0)
757     return char_span (NULL, 0);
758 
759   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
760   if (c == NULL)
761     return char_span (NULL, 0);
762 
763   bool read = read_line_num (c, line, &buffer, &len);
764   if (!read)
765     return char_span (NULL, 0);
766 
767   return char_span (buffer, len);
768 }
769 
770 /* Determine if FILE_PATH missing a trailing newline on its final line.
771    Only valid to call once all of the file has been loaded, by
772    requesting a line number beyond the end of the file.  */
773 
774 bool
location_missing_trailing_newline(const char * file_path)775 location_missing_trailing_newline (const char *file_path)
776 {
777   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
778   if (c == NULL)
779     return false;
780 
781   return c->missing_trailing_newline;
782 }
783 
784 /* Test if the location originates from the spelling location of a
785    builtin-tokens.  That is, return TRUE if LOC is a (possibly
786    virtual) location of a built-in token that appears in the expansion
787    list of a macro.  Please note that this function also works on
788    tokens that result from built-in tokens.  For instance, the
789    function would return true if passed a token "4" that is the result
790    of the expansion of the built-in __LINE__ macro.  */
791 bool
is_location_from_builtin_token(location_t loc)792 is_location_from_builtin_token (location_t loc)
793 {
794   const line_map_ordinary *map = NULL;
795   loc = linemap_resolve_location (line_table, loc,
796 				  LRK_SPELLING_LOCATION, &map);
797   return loc == BUILTINS_LOCATION;
798 }
799 
800 /* Expand the source location LOC into a human readable location.  If
801    LOC is virtual, it resolves to the expansion point of the involved
802    macro.  If LOC resolves to a builtin location, the file name of the
803    readable location is set to the string "<built-in>".  */
804 
805 expanded_location
expand_location(location_t loc)806 expand_location (location_t loc)
807 {
808   return expand_location_1 (loc, /*expansion_point_p=*/true,
809 			    LOCATION_ASPECT_CARET);
810 }
811 
812 /* Expand the source location LOC into a human readable location.  If
813    LOC is virtual, it resolves to the expansion location of the
814    relevant macro.  If LOC resolves to a builtin location, the file
815    name of the readable location is set to the string
816    "<built-in>".  */
817 
818 expanded_location
expand_location_to_spelling_point(location_t loc,enum location_aspect aspect)819 expand_location_to_spelling_point (location_t loc,
820 				   enum location_aspect aspect)
821 {
822   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
823 }
824 
825 /* The rich_location class within libcpp requires a way to expand
826    location_t instances, and relies on the client code
827    providing a symbol named
828      linemap_client_expand_location_to_spelling_point
829    to do this.
830 
831    This is the implementation for libcommon.a (all host binaries),
832    which simply calls into expand_location_1.  */
833 
834 expanded_location
linemap_client_expand_location_to_spelling_point(location_t loc,enum location_aspect aspect)835 linemap_client_expand_location_to_spelling_point (location_t loc,
836 						  enum location_aspect aspect)
837 {
838   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
839 }
840 
841 
842 /* If LOCATION is in a system header and if it is a virtual location for
843    a token coming from the expansion of a macro, unwind it to the
844    location of the expansion point of the macro.  Otherwise, just return
845    LOCATION.
846 
847    This is used for instance when we want to emit diagnostics about a
848    token that may be located in a macro that is itself defined in a
849    system header, for example, for the NULL macro.  In such a case, if
850    LOCATION were passed directly to diagnostic functions such as
851    warning_at, the diagnostic would be suppressed (unless
852    -Wsystem-headers).  */
853 
854 location_t
expansion_point_location_if_in_system_header(location_t location)855 expansion_point_location_if_in_system_header (location_t location)
856 {
857   if (in_system_header_at (location))
858     location = linemap_resolve_location (line_table, location,
859 					 LRK_MACRO_EXPANSION_POINT,
860 					 NULL);
861   return location;
862 }
863 
864 /* If LOCATION is a virtual location for a token coming from the expansion
865    of a macro, unwind to the location of the expansion point of the macro.  */
866 
867 location_t
expansion_point_location(location_t location)868 expansion_point_location (location_t location)
869 {
870   return linemap_resolve_location (line_table, location,
871 				   LRK_MACRO_EXPANSION_POINT, NULL);
872 }
873 
874 /* Construct a location with caret at CARET, ranging from START to
875    finish e.g.
876 
877                  11111111112
878         12345678901234567890
879      522
880      523   return foo + bar;
881                   ~~~~^~~~~
882      524
883 
884    The location's caret is at the "+", line 523 column 15, but starts
885    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
886    of "bar" at column 19.  */
887 
888 location_t
make_location(location_t caret,location_t start,location_t finish)889 make_location (location_t caret, location_t start, location_t finish)
890 {
891   location_t pure_loc = get_pure_location (caret);
892   source_range src_range;
893   src_range.m_start = get_start (start);
894   src_range.m_finish = get_finish (finish);
895   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
896 						   pure_loc,
897 						   src_range,
898 						   NULL);
899   return combined_loc;
900 }
901 
902 /* Same as above, but taking a source range rather than two locations.  */
903 
904 location_t
make_location(location_t caret,source_range src_range)905 make_location (location_t caret, source_range src_range)
906 {
907   location_t pure_loc = get_pure_location (caret);
908   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
909 }
910 
911 /* An expanded_location stores the column in byte units.  This function
912    converts that column to display units.  That requires reading the associated
913    source line in order to calculate the display width.  If that cannot be done
914    for any reason, then returns the byte column as a fallback.  */
915 int
location_compute_display_column(expanded_location exploc,int tabstop)916 location_compute_display_column (expanded_location exploc, int tabstop)
917 {
918   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
919     return exploc.column;
920   char_span line = location_get_source_line (exploc.file, exploc.line);
921   /* If line is NULL, this function returns exploc.column which is the
922      desired fallback.  */
923   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
924 					    exploc.column, tabstop);
925 }
926 
927 /* Dump statistics to stderr about the memory usage of the line_table
928    set of line maps.  This also displays some statistics about macro
929    expansion.  */
930 
931 void
dump_line_table_statistics(void)932 dump_line_table_statistics (void)
933 {
934   struct linemap_stats s;
935   long total_used_map_size,
936     macro_maps_size,
937     total_allocated_map_size;
938 
939   memset (&s, 0, sizeof (s));
940 
941   linemap_get_statistics (line_table, &s);
942 
943   macro_maps_size = s.macro_maps_used_size
944     + s.macro_maps_locations_size;
945 
946   total_allocated_map_size = s.ordinary_maps_allocated_size
947     + s.macro_maps_allocated_size
948     + s.macro_maps_locations_size;
949 
950   total_used_map_size = s.ordinary_maps_used_size
951     + s.macro_maps_used_size
952     + s.macro_maps_locations_size;
953 
954   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
955            s.num_expanded_macros);
956   if (s.num_expanded_macros != 0)
957     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
958              s.num_macro_tokens / s.num_expanded_macros);
959   fprintf (stderr,
960            "\nLine Table allocations during the "
961 	   "compilation process\n");
962   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
963 	   SIZE_AMOUNT (s.num_ordinary_maps_used));
964   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
965 	   SIZE_AMOUNT (s.ordinary_maps_used_size));
966   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
967 	   SIZE_AMOUNT (s.num_ordinary_maps_allocated));
968   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
969 	   SIZE_AMOUNT (s.ordinary_maps_allocated_size));
970   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
971 	   SIZE_AMOUNT (s.num_macro_maps_used));
972   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
973 	   SIZE_AMOUNT (s.macro_maps_used_size));
974   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
975 	   SIZE_AMOUNT (s.macro_maps_locations_size));
976   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
977 	   SIZE_AMOUNT (macro_maps_size));
978   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
979 	   SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
980   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
981 	   SIZE_AMOUNT (total_allocated_map_size));
982   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
983 	   SIZE_AMOUNT (total_used_map_size));
984   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
985 	   SIZE_AMOUNT (s.adhoc_table_size));
986   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
987 	   SIZE_AMOUNT (s.adhoc_table_entries_used));
988   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
989 	   SIZE_AMOUNT (line_table->num_optimized_ranges));
990   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
991 	   SIZE_AMOUNT (line_table->num_unoptimized_ranges));
992 
993   fprintf (stderr, "\n");
994 }
995 
996 /* Get location one beyond the final location in ordinary map IDX.  */
997 
998 static location_t
get_end_location(class line_maps * set,unsigned int idx)999 get_end_location (class line_maps *set, unsigned int idx)
1000 {
1001   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1002     return set->highest_location;
1003 
1004   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1005   return MAP_START_LOCATION (next_map);
1006 }
1007 
/* Helper function for write_digit_row.
   Write DIGIT modulo 10 to STREAM as a single character offset from '0';
   for a non-negative DIGIT this is its last decimal digit.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1015 
1016 /* Helper function for dump_location_info.
1017    Write a row of numbers to STREAM, numbering a source line,
1018    giving the units, tens, hundreds etc of the column number.  */
1019 
1020 static void
write_digit_row(FILE * stream,int indent,const line_map_ordinary * map,location_t loc,int max_col,int divisor)1021 write_digit_row (FILE *stream, int indent,
1022 		 const line_map_ordinary *map,
1023 		 location_t loc, int max_col, int divisor)
1024 {
1025   fprintf (stream, "%*c", indent, ' ');
1026   fprintf (stream, "|");
1027   for (int column = 1; column < max_col; column++)
1028     {
1029       location_t column_loc = loc + (column << map->m_range_bits);
1030       write_digit (stream, column_loc / divisor);
1031     }
1032   fprintf (stream, "\n");
1033 }
1034 
1035 /* Write a half-closed (START) / half-open (END) interval of
1036    location_t to STREAM.  */
1037 
1038 static void
dump_location_range(FILE * stream,location_t start,location_t end)1039 dump_location_range (FILE *stream,
1040 		     location_t start, location_t end)
1041 {
1042   fprintf (stream,
1043 	   "  location_t interval: %u <= loc < %u\n",
1044 	   start, end);
1045 }
1046 
1047 /* Write a labelled description of a half-closed (START) / half-open (END)
1048    interval of location_t to STREAM.  */
1049 
1050 static void
dump_labelled_location_range(FILE * stream,const char * name,location_t start,location_t end)1051 dump_labelled_location_range (FILE *stream,
1052 			      const char *name,
1053 			      location_t start, location_t end)
1054 {
1055   fprintf (stream, "%s\n", name);
1056   dump_location_range (stream, start, end);
1057   fprintf (stream, "\n");
1058 }
1059 
1060 /* Write a visualization of the locations in the line_table to STREAM.  */
1061 
1062 void
dump_location_info(FILE * stream)1063 dump_location_info (FILE *stream)
1064 {
1065   /* Visualize the reserved locations.  */
1066   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1067 				0, RESERVED_LOCATION_COUNT);
1068 
1069   /* Visualize the ordinary line_map instances, rendering the sources. */
1070   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1071     {
1072       location_t end_location = get_end_location (line_table, idx);
1073       /* half-closed: doesn't include this one. */
1074 
1075       const line_map_ordinary *map
1076 	= LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1077       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1078       dump_location_range (stream,
1079 			   MAP_START_LOCATION (map), end_location);
1080       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1081       fprintf (stream, "  starting at line: %i\n",
1082 	       ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1083       fprintf (stream, "  column and range bits: %i\n",
1084 	       map->m_column_and_range_bits);
1085       fprintf (stream, "  column bits: %i\n",
1086 	       map->m_column_and_range_bits - map->m_range_bits);
1087       fprintf (stream, "  range bits: %i\n",
1088 	       map->m_range_bits);
1089       const char * reason;
1090       switch (map->reason) {
1091       case LC_ENTER:
1092 	reason = "LC_ENTER";
1093 	break;
1094       case LC_LEAVE:
1095 	reason = "LC_LEAVE";
1096 	break;
1097       case LC_RENAME:
1098 	reason = "LC_RENAME";
1099 	break;
1100       case LC_RENAME_VERBATIM:
1101 	reason = "LC_RENAME_VERBATIM";
1102 	break;
1103       case LC_ENTER_MACRO:
1104 	reason = "LC_RENAME_MACRO";
1105 	break;
1106       default:
1107 	reason = "Unknown";
1108       }
1109       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1110 
1111       const line_map_ordinary *includer_map
1112 	= linemap_included_from_linemap (line_table, map);
1113       fprintf (stream, "  included from location: %d",
1114 	       linemap_included_from (map));
1115       if (includer_map) {
1116 	fprintf (stream, " (in ordinary map %d)",
1117 		 int (includer_map - line_table->info_ordinary.maps));
1118       }
1119       fprintf (stream, "\n");
1120 
1121       /* Render the span of source lines that this "map" covers.  */
1122       for (location_t loc = MAP_START_LOCATION (map);
1123 	   loc < end_location;
1124 	   loc += (1 << map->m_range_bits) )
1125 	{
1126 	  gcc_assert (pure_location_p (line_table, loc) );
1127 
1128 	  expanded_location exploc
1129 	    = linemap_expand_location (line_table, map, loc);
1130 
1131 	  if (exploc.column == 0)
1132 	    {
1133 	      /* Beginning of a new source line: draw the line.  */
1134 
1135 	      char_span line_text = location_get_source_line (exploc.file,
1136 							      exploc.line);
1137 	      if (!line_text)
1138 		break;
1139 	      fprintf (stream,
1140 		       "%s:%3i|loc:%5i|%.*s\n",
1141 		       exploc.file, exploc.line,
1142 		       loc,
1143 		       (int)line_text.length (), line_text.get_buffer ());
1144 
1145 	      /* "loc" is at column 0, which means "the whole line".
1146 		 Render the locations *within* the line, by underlining
1147 		 it, showing the location_t numeric values
1148 		 at each column.  */
1149 	      size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1150 	      if (max_col > line_text.length ())
1151 		max_col = line_text.length () + 1;
1152 
1153 	      int len_lnum = num_digits (exploc.line);
1154 	      if (len_lnum < 3)
1155 		len_lnum = 3;
1156 	      int len_loc = num_digits (loc);
1157 	      if (len_loc < 5)
1158 		len_loc = 5;
1159 
1160 	      int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1161 
1162 	      /* Thousands.  */
1163 	      if (end_location > 999)
1164 		write_digit_row (stream, indent, map, loc, max_col, 1000);
1165 
1166 	      /* Hundreds.  */
1167 	      if (end_location > 99)
1168 		write_digit_row (stream, indent, map, loc, max_col, 100);
1169 
1170 	      /* Tens.  */
1171 	      write_digit_row (stream, indent, map, loc, max_col, 10);
1172 
1173 	      /* Units.  */
1174 	      write_digit_row (stream, indent, map, loc, max_col, 1);
1175 	    }
1176 	}
1177       fprintf (stream, "\n");
1178     }
1179 
1180   /* Visualize unallocated values.  */
1181   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1182 				line_table->highest_location,
1183 				LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1184 
1185   /* Visualize the macro line_map instances, rendering the sources. */
1186   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1187     {
1188       /* Each macro map that is allocated owns location_t values
1189 	 that are *lower* that the one before them.
1190 	 Hence it's meaningful to view them either in order of ascending
1191 	 source locations, or in order of ascending macro map index.  */
1192       const bool ascending_location_ts = true;
1193       unsigned int idx = (ascending_location_ts
1194 			  ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1195 			  : i);
1196       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1197       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1198 	       idx,
1199 	       linemap_map_get_macro_name (map),
1200 	       MACRO_MAP_NUM_MACRO_TOKENS (map));
1201       dump_location_range (stream,
1202 			   map->start_location,
1203 			   (map->start_location
1204 			    + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1205       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1206 	      "expansion point is location %i",
1207 	      MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1208       fprintf (stream, "  map->start_location: %u\n",
1209 	       map->start_location);
1210 
1211       fprintf (stream, "  macro_locations:\n");
1212       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1213 	{
1214 	  location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1215 	  location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1216 
1217 	  /* linemap_add_macro_token encodes token numbers in an expansion
1218 	     by putting them after MAP_START_LOCATION. */
1219 
1220 	  /* I'm typically seeing 4 uninitialized entries at the end of
1221 	     0xafafafaf.
1222 	     This appears to be due to macro.c:replace_args
1223 	     adding 2 extra args for padding tokens; presumably there may
1224 	     be a leading and/or trailing padding token injected,
1225 	     each for 2 more location slots.
1226 	     This would explain there being up to 4 location_ts slots
1227 	     that may be uninitialized.  */
1228 
1229 	  fprintf (stream, "    %u: %u, %u\n",
1230 		   i,
1231 		   x,
1232 		   y);
1233 	  if (x == y)
1234 	    {
1235 	      if (x < MAP_START_LOCATION (map))
1236 		inform (x, "token %u has %<x-location == y-location == %u%>",
1237 			i, x);
1238 	      else
1239 		fprintf (stream,
1240 			 "x-location == y-location == %u encodes token # %u\n",
1241 			 x, x - MAP_START_LOCATION (map));
1242 		}
1243 	  else
1244 	    {
1245 	      inform (x, "token %u has %<x-location == %u%>", i, x);
1246 	      inform (x, "token %u has %<y-location == %u%>", i, y);
1247 	    }
1248 	}
1249       fprintf (stream, "\n");
1250     }
1251 
1252   /* It appears that MAX_LOCATION_T itself is never assigned to a
1253      macro map, presumably due to an off-by-one error somewhere
1254      between the logic in linemap_enter_macro and
1255      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1256   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1257 				MAX_LOCATION_T,
1258 				MAX_LOCATION_T + 1);
1259 
1260   /* Visualize ad-hoc values.  */
1261   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1262 				MAX_LOCATION_T + 1, UINT_MAX);
1263 }
1264 
1265 /* string_concat's constructor.  */
1266 
string_concat(int num,location_t * locs)1267 string_concat::string_concat (int num, location_t *locs)
1268   : m_num (num)
1269 {
1270   m_locs = ggc_vec_alloc <location_t> (num);
1271   for (int i = 0; i < num; i++)
1272     m_locs[i] = locs[i];
1273 }
1274 
1275 /* string_concat_db's constructor.  */
1276 
string_concat_db()1277 string_concat_db::string_concat_db ()
1278 {
1279   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1280 }
1281 
1282 /* Record that a string concatenation occurred, covering NUM
1283    string literal tokens.  LOCS is an array of size NUM, containing the
1284    locations of the tokens.  A copy of LOCS is taken.  */
1285 
1286 void
record_string_concatenation(int num,location_t * locs)1287 string_concat_db::record_string_concatenation (int num, location_t *locs)
1288 {
1289   gcc_assert (num > 1);
1290   gcc_assert (locs);
1291 
1292   location_t key_loc = get_key_loc (locs[0]);
1293 
1294   string_concat *concat
1295     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1296   m_table->put (key_loc, concat);
1297 }
1298 
1299 /* Determine if LOC was the location of the initial token of a
1300    concatenation of string literal tokens.
1301    If so, *OUT_NUM is written to with the number of tokens, and
1302    *OUT_LOCS with the location of an array of locations of the
1303    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1304    storage owned by the string_concat_db.
1305    Otherwise, return false.  */
1306 
1307 bool
get_string_concatenation(location_t loc,int * out_num,location_t ** out_locs)1308 string_concat_db::get_string_concatenation (location_t loc,
1309 					    int *out_num,
1310 					    location_t **out_locs)
1311 {
1312   gcc_assert (out_num);
1313   gcc_assert (out_locs);
1314 
1315   location_t key_loc = get_key_loc (loc);
1316 
1317   string_concat **concat = m_table->get (key_loc);
1318   if (!concat)
1319     return false;
1320 
1321   *out_num = (*concat)->m_num;
1322   *out_locs =(*concat)->m_locs;
1323   return true;
1324 }
1325 
1326 /* Internal function.  Canonicalize LOC into a form suitable for
1327    use as a key within the database, stripping away macro expansion,
1328    ad-hoc information, and range information, using the location of
1329    the start of LOC within an ordinary linemap.  */
1330 
1331 location_t
get_key_loc(location_t loc)1332 string_concat_db::get_key_loc (location_t loc)
1333 {
1334   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1335 				  NULL);
1336 
1337   loc = get_range_from_loc (line_table, loc).m_start;
1338 
1339   return loc;
1340 }
1341 
1342 /* Helper class for use within get_substring_ranges_for_loc.
1343    An vec of cpp_string with responsibility for releasing all of the
1344    str->text for each str in the vector.  */
1345 
1346 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1347 {
1348  public:
auto_cpp_string_vec(int alloc)1349   auto_cpp_string_vec (int alloc)
1350     : auto_vec <cpp_string> (alloc) {}
1351 
~auto_cpp_string_vec()1352   ~auto_cpp_string_vec ()
1353   {
1354     /* Clean up the copies within this vec.  */
1355     int i;
1356     cpp_string *str;
1357     FOR_EACH_VEC_ELT (*this, i, str)
1358       free (const_cast <unsigned char *> (str->text));
1359   }
1360 };
1361 
/* Attempt to populate RANGES with source location information on the
   individual characters within the string literal found at STRLOC.
   If CONCATS is non-NULL, then any string literals that the token at
   STRLOC  was concatenated with are also added to RANGES.

   PFILE must be non-NULL; its options are consulted and it is passed on
   to cpp_interpret_string_ranges, as is TYPE.

   Return NULL if successful, or an error message if any errors occurred (in
   which case RANGES may be only partially populated and should not
   be used).  The error messages are string literals (or whatever
   cpp_interpret_string_ranges returned).

   This is implemented by re-parsing the relevant source line(s).  */

static const char *
get_substring_ranges_for_loc (cpp_reader *pfile,
			      string_concat_db *concats,
			      location_t strloc,
			      enum cpp_ttype type,
			      cpp_substring_ranges &ranges)
{
  gcc_assert (pfile);

  if (strloc == UNKNOWN_LOCATION)
    return "unknown location";

  /* Reparsing the strings requires accurate location information.
     If -ftrack-macro-expansion has been overridden from its default
     of 2, then we might have a location of a macro expansion point,
     rather than the location of the literal itself.
     Avoid this by requiring that we have full macro expansion tracking
     for substring locations to be available.  */
  if (cpp_get_options (pfile)->track_macro_expansion != 2)
    return "track_macro_expansion != 2";

  /* If #line or # 44 "file"-style directives are present, then there's
     no guarantee that the line numbers we have can be used to locate
     the strings.  For example, we might have a .i file with # directives
     pointing back to lines within a .c file, but the .c file might
     have been edited since the .i file was created.
     In such a case, the safest course is to disable on-demand substring
     locations.  */
  if (line_table->seen_line_directive)
    return "seen line directive";

  /* If string concatenation has occurred at STRLOC, get the locations
     of all of the literal tokens making up the compound string.
     Otherwise, just use STRLOC.  */
  int num_locs = 1;
  location_t *strlocs = &strloc;
  /* The result of the lookup is deliberately ignored: when nothing is
     found, NUM_LOCS and STRLOCS keep the single-token defaults set up
     above.  */
  if (concats)
    concats->get_string_concatenation (strloc, &num_locs, &strlocs);

  /* For each token, STRS receives a copy of its text and LOC_READERS a
     reader for its locations; both are indexed in the same order as
     STRLOCS.  */
  auto_cpp_string_vec strs (num_locs);
  auto_vec <cpp_string_location_reader> loc_readers (num_locs);
  for (int i = 0; i < num_locs; i++)
    {
      /* Get range of strloc.  We will use it to locate the start and finish
	 of the literal token within the line.  */
      source_range src_range = get_range_from_loc (line_table, strlocs[i]);

      if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
	{
	  /* If the string token was within a macro expansion, then we can
	     cope with it for the simple case where we have a single token.
	     Otherwise, bail out.  */
	  if (src_range.m_start != src_range.m_finish)
	    return "macro expansion";
	}
      else
	{
	  if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
	    /* If so, we can't reliably determine where the token started within
	       its line.  */
	    return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";

	  if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
	    /* If so, we can't reliably determine where the token finished
	       within its line.  */
	    return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
	}

      /* Expand both ends of the range to file/line/column form, and
	 insist that they describe a well-formed span within a single
	 line of a single file.  */
      expanded_location start
	= expand_location_to_spelling_point (src_range.m_start,
					     LOCATION_ASPECT_START);
      expanded_location finish
	= expand_location_to_spelling_point (src_range.m_finish,
					     LOCATION_ASPECT_FINISH);
      if (start.file != finish.file)
	return "range endpoints are in different files";
      if (start.line != finish.line)
	return "range endpoints are on different lines";
      if (start.column > finish.column)
	return "range endpoints are reversed";

      char_span line = location_get_source_line (start.file, start.line);
      if (!line)
	return "unable to read source line";

      /* Determine the location of the literal (including quotes
	 and leading prefix chars, such as the 'u' in a u""
	 token).  */
      size_t literal_length = finish.column - start.column + 1;

      /* Ensure that we don't crash if we got the wrong location.  */
      if (start.column < 1)
	return "zero start column";
      if (line.length () < (start.column - 1 + literal_length))
	return "line is not wide enough";

      /* Columns are 1-based, whereas offsets into the line are
	 0-based.  */
      char_span literal = line.subspan (start.column - 1, literal_length);

      cpp_string from;
      from.len = literal_length;
      /* Make a copy of the literal, to avoid having to rely on
	 the lifetime of the copy of the line within the cache.
	 This will be released by the auto_cpp_string_vec dtor.  */
      from.text = (unsigned char *)literal.xstrdup ();
      strs.safe_push (from);

      /* For very long lines, a new linemap could have started
	 halfway through the token.
	 Ensure that the loc_reader uses the linemap of the
	 *end* of the token for its start location.  */
      const line_map_ordinary *start_ord_map;
      linemap_resolve_location (line_table, src_range.m_start,
				LRK_SPELLING_LOCATION, &start_ord_map);
      const line_map_ordinary *final_ord_map;
      linemap_resolve_location (line_table, src_range.m_finish,
				LRK_SPELLING_LOCATION, &final_ord_map);
      if (start_ord_map == NULL || final_ord_map == NULL)
	return "failed to get ordinary maps";
      /* Bulletproofing.  We ought to only have different ordinary maps
	 for start vs finish due to line-length jumps.  */
      if (start_ord_map != final_ord_map
	  && start_ord_map->to_file != final_ord_map->to_file)
	return "start and finish are spelled in different ordinary maps";
      /* The file from linemap_resolve_location ought to match that from
	 expand_location_to_spelling_point.  */
      if (start_ord_map->to_file != start.file)
	return "mismatching file after resolving linemap";

      /* Re-express the start of the token as a location within the map
	 of the end of the token.  */
      location_t start_loc
	= linemap_position_for_line_and_column (line_table, final_ord_map,
						start.line, start.column);

      cpp_string_location_reader loc_reader (start_loc, line_table);
      loc_readers.safe_push (loc_reader);
    }

  /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
  const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
						 loc_readers.address (),
						 num_locs, &ranges, type);
  if (err)
    return err;

  /* Success: "ranges" should now contain information on the string.  */
  return NULL;
}
1519 
1520 /* Attempt to populate *OUT_LOC with source location information on the
1521    given characters within the string literal found at STRLOC.
1522    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1523    character set.
1524 
1525    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1526    and string literal "012345\n789"
1527    *OUT_LOC is written to with:
1528      "012345\n789"
1529          ~^~~~~
1530 
1531    If CONCATS is non-NULL, then any string literals that the token at
1532    STRLOC was concatenated with are also considered.
1533 
1534    This is implemented by re-parsing the relevant source line(s).
1535 
1536    Return NULL if successful, or an error message if any errors occurred.
1537    Error messages are intended for GCC developers (to help debugging) rather
1538    than for end-users.  */
1539 
1540 const char *
get_location_within_string(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int caret_idx,int start_idx,int end_idx,location_t * out_loc)1541 get_location_within_string (cpp_reader *pfile,
1542 			    string_concat_db *concats,
1543 			    location_t strloc,
1544 			    enum cpp_ttype type,
1545 			    int caret_idx, int start_idx, int end_idx,
1546 			    location_t *out_loc)
1547 {
1548   gcc_checking_assert (caret_idx >= 0);
1549   gcc_checking_assert (start_idx >= 0);
1550   gcc_checking_assert (end_idx >= 0);
1551   gcc_assert (out_loc);
1552 
1553   cpp_substring_ranges ranges;
1554   const char *err
1555     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1556   if (err)
1557     return err;
1558 
1559   if (caret_idx >= ranges.get_num_ranges ())
1560     return "caret_idx out of range";
1561   if (start_idx >= ranges.get_num_ranges ())
1562     return "start_idx out of range";
1563   if (end_idx >= ranges.get_num_ranges ())
1564     return "end_idx out of range";
1565 
1566   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1567 			    ranges.get_range (start_idx).m_start,
1568 			    ranges.get_range (end_idx).m_finish);
1569   return NULL;
1570 }
1571 
1572 #if CHECKING_P
1573 
1574 namespace selftest {
1575 
1576 /* Selftests of location handling.  */
1577 
1578 /* Attempt to populate *OUT_RANGE with source location information on the
1579    given character within the string literal found at STRLOC.
1580    CHAR_IDX refers to an offset within the execution character set.
1581    If CONCATS is non-NULL, then any string literals that the token at
1582    STRLOC was concatenated with are also considered.
1583 
1584    This is implemented by re-parsing the relevant source line(s).
1585 
1586    Return NULL if successful, or an error message if any errors occurred.
1587    Error messages are intended for GCC developers (to help debugging) rather
1588    than for end-users.  */
1589 
1590 static const char *
get_source_range_for_char(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int char_idx,source_range * out_range)1591 get_source_range_for_char (cpp_reader *pfile,
1592 			   string_concat_db *concats,
1593 			   location_t strloc,
1594 			   enum cpp_ttype type,
1595 			   int char_idx,
1596 			   source_range *out_range)
1597 {
1598   gcc_checking_assert (char_idx >= 0);
1599   gcc_assert (out_range);
1600 
1601   cpp_substring_ranges ranges;
1602   const char *err
1603     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1604   if (err)
1605     return err;
1606 
1607   if (char_idx >= ranges.get_num_ranges ())
1608     return "char_idx out of range";
1609 
1610   *out_range = ranges.get_range (char_idx);
1611   return NULL;
1612 }
1613 
1614 /* As get_source_range_for_char, but write to *OUT the number
1615    of ranges that are available.  */
1616 
1617 static const char *
get_num_source_ranges_for_substring(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int * out)1618 get_num_source_ranges_for_substring (cpp_reader *pfile,
1619 				     string_concat_db *concats,
1620 				     location_t strloc,
1621 				     enum cpp_ttype type,
1622 				     int *out)
1623 {
1624   gcc_assert (out);
1625 
1626   cpp_substring_ranges ranges;
1627   const char *err
1628     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1629 
1630   if (err)
1631     return err;
1632 
1633   *out = ranges.get_num_ranges ();
1634   return NULL;
1635 }
1636 
1637 /* Selftests of location handling.  */
1638 
1639 /* Verify that compare() on linenum_type handles comparisons over the full
1640    range of the type.  */
1641 
1642 static void
test_linenum_comparisons()1643 test_linenum_comparisons ()
1644 {
1645   linenum_type min_line (0);
1646   linenum_type max_line (0xffffffff);
1647   ASSERT_EQ (0, compare (min_line, min_line));
1648   ASSERT_EQ (0, compare (max_line, max_line));
1649 
1650   ASSERT_GT (compare (max_line, min_line), 0);
1651   ASSERT_LT (compare (min_line, max_line), 0);
1652 }
1653 
1654 /* Helper function for verifying location data: when location_t
1655    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1656    as having column 0.  */
1657 
1658 static bool
should_have_column_data_p(location_t loc)1659 should_have_column_data_p (location_t loc)
1660 {
1661   if (IS_ADHOC_LOC (loc))
1662     loc = get_location_from_adhoc_loc (line_table, loc);
1663   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1664     return false;
1665   return true;
1666 }
1667 
1668 /* Selftest for should_have_column_data_p.  */
1669 
1670 static void
test_should_have_column_data_p()1671 test_should_have_column_data_p ()
1672 {
1673   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1674   ASSERT_TRUE
1675     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1676   ASSERT_FALSE
1677     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1678 }
1679 
1680 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1681    on LOC.  */
1682 
1683 static void
assert_loceq(const char * exp_filename,int exp_linenum,int exp_colnum,location_t loc)1684 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1685 	      location_t loc)
1686 {
1687   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1688   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1689   /* If location_t values are sufficiently high, then column numbers
1690      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1691      When close to the threshold, column numbers *may* be present: if
1692      the final linemap before the threshold contains a line that straddles
1693      the threshold, locations in that line have column information.  */
1694   if (should_have_column_data_p (loc))
1695     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1696 }
1697 
1698 /* Various selftests involve constructing a line table and one or more
1699    line maps within it.
1700 
1701    For maximum test coverage we want to run these tests with a variety
1702    of situations:
1703    - line_table->default_range_bits: some frontends use a non-zero value
1704    and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
1708    of column-tracking).  We can exercise these by starting the line_table
1709    at interesting values at or near these thresholds.
1710 
1711    The following struct describes a particular case within our test
1712    matrix.  */
1713 
class line_table_case
{
public:
  /* Describe a test case in which the line table uses
     DEFAULT_RANGE_BITS as its default_range_bits and starts out at
     location_t value BASE_LOCATION.  A BASE_LOCATION of zero leaves the
     starting point of the line table untouched.  */
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to store in line_table->default_range_bits.  */
  int m_default_range_bits;

  /* Initial value for line_table->highest_location and
     line_table->highest_line, or zero to keep the defaults.  */
  int m_base_location;
};
1725 
1726 /* Constructor.  Store the old value of line_table, and create a new
1727    one, using sane defaults.  */
1728 
line_table_test()1729 line_table_test::line_table_test ()
1730 {
1731   gcc_assert (saved_line_table == NULL);
1732   saved_line_table = line_table;
1733   line_table = ggc_alloc<line_maps> ();
1734   linemap_init (line_table, BUILTINS_LOCATION);
1735   gcc_assert (saved_line_table->reallocator);
1736   line_table->reallocator = saved_line_table->reallocator;
1737   gcc_assert (saved_line_table->round_alloc_size);
1738   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1739   line_table->default_range_bits = 0;
1740 }
1741 
1742 /* Constructor.  Store the old value of line_table, and create a new
1743    one, using the sitation described in CASE_.  */
1744 
line_table_test(const line_table_case & case_)1745 line_table_test::line_table_test (const line_table_case &case_)
1746 {
1747   gcc_assert (saved_line_table == NULL);
1748   saved_line_table = line_table;
1749   line_table = ggc_alloc<line_maps> ();
1750   linemap_init (line_table, BUILTINS_LOCATION);
1751   gcc_assert (saved_line_table->reallocator);
1752   line_table->reallocator = saved_line_table->reallocator;
1753   gcc_assert (saved_line_table->round_alloc_size);
1754   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1755   line_table->default_range_bits = case_.m_default_range_bits;
1756   if (case_.m_base_location)
1757     {
1758       line_table->highest_location = case_.m_base_location;
1759       line_table->highest_line = case_.m_base_location;
1760     }
1761 }
1762 
1763 /* Destructor.  Restore the old value of line_table.  */
1764 
~line_table_test()1765 line_table_test::~line_table_test ()
1766 {
1767   gcc_assert (saved_line_table != NULL);
1768   line_table = saved_line_table;
1769   saved_line_table = NULL;
1770 }
1771 
/* Verify basic operation of ordinary linemaps: under the situation
   described by CASE_, build line maps for two files, create locations
   at various lines and columns (including overly long lines), and check
   that file, line and column can be recovered from them.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  /* Swap in a temporary line_table for the duration of the test.  */
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations. */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  /* Transitioning back to a short line.  */
  linemap_line_start (line_table, 4, 0);
  location_t loc_back_to_short = linemap_position_for_column (line_table, 100);

  if (should_have_column_data_p (loc_back_to_short))
    {
      /* Verify that we switched to short lines in the linemap.  */
      line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
      ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
    }

  /* Example of a line that will eventually be seen to be longer
     than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
     below that.  */
  linemap_line_start (line_table, 5, 2000);

  location_t loc_start_of_very_long_line
    = linemap_position_for_column (line_table, 2000);
  location_t loc_too_wide
    = linemap_position_for_column (line_table, 4097);
  location_t loc_too_wide_2
    = linemap_position_for_column (line_table, 4098);

  /* ...and back to a sane line length.  */
  linemap_line_start (line_table, 6, 100);
  location_t loc_sane_again = linemap_position_for_column (line_table, 10);

  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  Columns are only
     checked by assert_loceq when column data is expected for the
     location in question.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("foo.c", 4, 100, loc_back_to_short);

  /* In the very wide line, the initial location should be fully tracked.  */
  assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
  /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
     be disabled.  */
  assert_loceq ("foo.c", 5, 0, loc_too_wide);
  assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
  /*...and column-tracking should be re-enabled for subsequent lines.  */
  assert_loceq ("foo.c", 6, 10, loc_sane_again);

  assert_loceq ("bar.c", 1, 150, loc_f);

  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it.  The arguments are caret (loc_c), start (loc_b) and
     finish (loc_d), as the assertions below confirm.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
1860 
/* Verify various properties of UNKNOWN_LOCATION: it has no filename,
   and both its line and its column are zero.  */

static void
test_unknown_location ()
{
  ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
}
1870 
/* Verify various properties of BUILTINS_LOCATION: its filename is the
   (translated) string "<built-in>", its line and column are zero, and
   it is recognized as coming from a builtin token.  */

static void
test_builtins ()
{
  assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
  ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
}
1879 
/* Regression test for make_location, run under the situation described
   by CASE_.
   Ensure that we use pure locations for the start/finish of the range,
   rather than storing a packed or ad-hoc range as the start/finish.  */

static void
test_make_location_nonpure_range_endpoints (const line_table_case &case_)
{
  /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
     with C++ frontend.
     ....................0000000001111111111222.
     ....................1234567890123456789012.  */
  const char *content = "     r += !aaa == bbb;\n";
  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
  line_table_test ltt (case_);
  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);

  /* Locations for the columns of interest within the line above.  */
  const location_t c11 = linemap_position_for_column (line_table, 11);
  const location_t c12 = linemap_position_for_column (line_table, 12);
  const location_t c13 = linemap_position_for_column (line_table, 13);
  const location_t c14 = linemap_position_for_column (line_table, 14);
  const location_t c21 = linemap_position_for_column (line_table, 21);

  /* The rest of the test is skipped when the locations are beyond the
     threshold for having column data.  */
  if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
    return;

  /* Use column 13 for the caret location, arbitrarily, to verify that we
     handle start != caret.  */
  const location_t aaa = make_location (c13, c12, c14);
  ASSERT_EQ (c13, get_pure_location (aaa));
  ASSERT_EQ (c12, get_start (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
  ASSERT_EQ (c14, get_finish (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));

  /* Make a location using a location with a range as the start-point.  */
  const location_t not_aaa = make_location (c11, aaa, c14);
  ASSERT_EQ (c11, get_pure_location (not_aaa));
  /* It should use the start location of the range, not store the range
     itself.  */
  ASSERT_EQ (c12, get_start (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
  ASSERT_EQ (c14, get_finish (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));

  /* Similarly, make a location with a range as the end-point.  */
  const location_t aaa_eq_bbb = make_location (c12, c12, c21);
  ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
  const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
  /* It should use the finish location of the range, not store the range
     itself.  */
  ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
}
1940 
/* Verify reading of input files (e.g. for caret-based diagnostics):
   write a three-line temporary file and read individual lines back via
   location_get_source_line.  */

static void
test_reading_source_line ()
{
  /* Create a tempfile and write some text to it.  Note that the final
     line has no trailing newline.  */
  temp_source_file tmp (SELFTEST_LOCATION, ".txt",
			"01234567890123456789\n"
			"This is the test text\n"
			"This is the 3rd line");

  /* Read back a specific line from the tempfile.  */
  char_span source_line = location_get_source_line (tmp.get_filename (), 3);
  ASSERT_TRUE (source_line);
  ASSERT_TRUE (source_line.get_buffer () != NULL);
  ASSERT_EQ (20, source_line.length ());
  ASSERT_TRUE (!strncmp ("This is the 3rd line",
			 source_line.get_buffer (), source_line.length ()));

  /* An earlier line can be read after a later one; the expected length
     does not count the terminating newline.  */
  source_line = location_get_source_line (tmp.get_filename (), 2);
  ASSERT_TRUE (source_line);
  ASSERT_TRUE (source_line.get_buffer () != NULL);
  ASSERT_EQ (21, source_line.length ());
  ASSERT_TRUE (!strncmp ("This is the test text",
			 source_line.get_buffer (), source_line.length ()));

  /* Asking for a line past the end of the file gives an empty span.  */
  source_line = location_get_source_line (tmp.get_filename (), 4);
  ASSERT_FALSE (source_line);
  ASSERT_TRUE (source_line.get_buffer () == NULL);
}
1971 
1972 /* Tests of lexing.  */
1973 
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.  The text returned by cpp_token_as_text is
   compared as a C string, using ASSERT_STREQ.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)		\
  SELFTEST_BEGIN_STMT							\
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));	\
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);		\
  SELFTEST_END_STMT
1982 
1983 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1984    and ranges from EXP_START_COL to EXP_FINISH_COL.
1985    Use LOC as the effective location of the selftest.  */
1986 
1987 static void
assert_token_loc_eq(const location & loc,const cpp_token * tok,const char * exp_filename,int exp_linenum,int exp_start_col,int exp_finish_col)1988 assert_token_loc_eq (const location &loc,
1989 		     const cpp_token *tok,
1990 		     const char *exp_filename, int exp_linenum,
1991 		     int exp_start_col, int exp_finish_col)
1992 {
1993   location_t tok_loc = tok->src_loc;
1994   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1995   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1996 
1997   /* If location_t values are sufficiently high, then column numbers
1998      will be unavailable.  */
1999   if (!should_have_column_data_p (tok_loc))
2000     return;
2001 
2002   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2003   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2004   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2005   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2006 }
2007 
/* Use assert_token_loc_eq to verify the TOK->src_loc, using
   SELFTEST_LOCATION as the effective location of the selftest, so that
   failures are reported at the point where this macro is used.
   EXP_FILENAME, EXP_LINENUM, EXP_START_COL and EXP_FINISH_COL are
   passed through unchanged.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
			    EXP_START_COL, EXP_FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
		       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2015 
/* Test of lexing a file using libcpp, verifying tokens and their
   location information, under the situation described by CASE_.  */

static void
test_lexer (const line_table_case &case_)
{
  /* Create a tempfile and write some text to it.  */
  const char *content =
    /*00000000011111111112222222222333333.3333444444444.455555555556
      12345678901234567890123456789012345.6789012345678.901234567890.  */
    ("test_name /* c-style comment */\n"
     "                                  \"test literal\"\n"
     " // test c++-style comment\n"
     "   42\n");
  temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);

  line_table_test ltt (case_);

  cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);

  const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
  ASSERT_NE (fname, NULL);

  /* Verify that we get the expected tokens back, with the correct
     location information.  The comments in the input are not expected
     to show up as tokens.  */

  location_t loc;
  const cpp_token *tok;
  /* Line 1: the identifier.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NAME);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);

  /* Line 2: the string literal, including its quotes.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);

  /* Line 4: the number.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NUMBER);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);

  /* Nothing else should follow.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_EOF);

  cpp_finish (parser, NULL);
  cpp_destroy (parser);
}
2069 
2070 /* Forward decls.  */
2071 
2072 class lexer_test;
2073 class lexer_test_options;
2074 
2075 /* A class for specifying options of a lexer_test.
2076    The "apply" vfunc is called during the lexer_test constructor.  */
2077 
2078 class lexer_test_options
2079 {
2080  public:
2081   virtual void apply (lexer_test &) = 0;
2082 };
2083 
2084 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2085    in its dtor.
2086 
2087    This is needed by struct lexer_test to ensure that the cleanup of the
2088    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2089 
2090 class cpp_reader_ptr
2091 {
2092  public:
cpp_reader_ptr(cpp_reader * ptr)2093   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2094 
~cpp_reader_ptr()2095   ~cpp_reader_ptr ()
2096   {
2097     cpp_finish (m_ptr, NULL);
2098     cpp_destroy (m_ptr);
2099   }
2100 
2101   operator cpp_reader * () const { return m_ptr; }
2102 
2103  private:
2104   cpp_reader *m_ptr;
2105 };
2106 
/* A class for writing lexer tests.  It bundles a temporary line table,
   a cpp_reader, a temporary source file holding the text to be lexed,
   and a string concatenation database.  */

class lexer_test
{
public:
  /* Set up the test for the situation in CASE_, lexing CONTENT.
     OPTIONS may be NULL; otherwise its "apply" vfunc is called on the
     new object.  */
  lexer_test (const line_table_case &case_, const char *content,
	      lexer_test_options *options);
  ~lexer_test ();

  /* Get the next token from m_parser.  */
  const cpp_token *get_token ();

  /* The ordering of these fields matters.
     The line_table_test must be first, since the cpp_reader_ptr
     uses it.
     The cpp_reader must be cleaned up *after* the temp_source_file
     since the filenames in input.c's input cache are owned by the
     cpp_reader; in particular, when ~temp_source_file evicts the
     filename the filenames must still be alive.  */
  line_table_test m_ltt;
  cpp_reader_ptr m_parser;
  temp_source_file m_tempfile;
  string_concat_db m_concats;
  /* If true, the destructor verifies that the next token is EOF.  */
  bool m_implicitly_expect_EOF;
};
2131 
2132 /* Use an EBCDIC encoding for the execution charset, specifically
2133    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2134 
2135    This exercises iconv integration within libcpp.
2136    Not every build of iconv supports the given charset,
2137    so we need to flag this error and handle it gracefully.  */
2138 
2139 class ebcdic_execution_charset : public lexer_test_options
2140 {
2141  public:
ebcdic_execution_charset()2142   ebcdic_execution_charset () : m_num_iconv_errors (0)
2143     {
2144       gcc_assert (s_singleton == NULL);
2145       s_singleton = this;
2146     }
~ebcdic_execution_charset()2147   ~ebcdic_execution_charset ()
2148     {
2149       gcc_assert (s_singleton == this);
2150       s_singleton = NULL;
2151     }
2152 
apply(lexer_test & test)2153   void apply (lexer_test &test) FINAL OVERRIDE
2154   {
2155     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2156     cpp_opts->narrow_charset = "IBM1047";
2157 
2158     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2159     callbacks->diagnostic = on_diagnostic;
2160   }
2161 
on_diagnostic(cpp_reader * pfile ATTRIBUTE_UNUSED,enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,enum cpp_warning_reason reason ATTRIBUTE_UNUSED,rich_location * richloc ATTRIBUTE_UNUSED,const char * msgid,va_list * ap ATTRIBUTE_UNUSED)2162   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2163 			     enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2164 			     enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2165 			     rich_location *richloc ATTRIBUTE_UNUSED,
2166 			     const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2167     ATTRIBUTE_FPTR_PRINTF(5,0)
2168   {
2169     gcc_assert (s_singleton);
2170     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2171     const char *msg = "conversion from %s to %s not supported by iconv";
2172 #ifdef ENABLE_NLS
2173     msg = dgettext ("cpplib", msg);
2174 #endif
2175     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2176        when the local iconv build doesn't support the conversion.  */
2177     if (strcmp (msgid, msg) == 0)
2178       {
2179 	s_singleton->m_num_iconv_errors++;
2180 	return true;
2181       }
2182 
2183     /* Otherwise, we have an unexpected error.  */
2184     abort ();
2185   }
2186 
iconv_errors_occurred_p()2187   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2188 
2189  private:
2190   static ebcdic_execution_charset *s_singleton;
2191   int m_num_iconv_errors;
2192 };
2193 
2194 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2195 
2196 /* A lexer_test_options subclass that records a list of diagnostic
2197    messages emitted by the lexer.  */
2198 
2199 class lexer_diagnostic_sink : public lexer_test_options
2200 {
2201  public:
lexer_diagnostic_sink()2202   lexer_diagnostic_sink ()
2203   {
2204     gcc_assert (s_singleton == NULL);
2205     s_singleton = this;
2206   }
~lexer_diagnostic_sink()2207   ~lexer_diagnostic_sink ()
2208   {
2209     gcc_assert (s_singleton == this);
2210     s_singleton = NULL;
2211 
2212     int i;
2213     char *str;
2214     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2215       free (str);
2216   }
2217 
apply(lexer_test & test)2218   void apply (lexer_test &test) FINAL OVERRIDE
2219   {
2220     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2221     callbacks->diagnostic = on_diagnostic;
2222   }
2223 
on_diagnostic(cpp_reader * pfile ATTRIBUTE_UNUSED,enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,enum cpp_warning_reason reason ATTRIBUTE_UNUSED,rich_location * richloc ATTRIBUTE_UNUSED,const char * msgid,va_list * ap)2224   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2225 			     enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2226 			     enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2227 			     rich_location *richloc ATTRIBUTE_UNUSED,
2228 			     const char *msgid, va_list *ap)
2229     ATTRIBUTE_FPTR_PRINTF(5,0)
2230   {
2231     char *msg = xvasprintf (msgid, *ap);
2232     s_singleton->m_diagnostics.safe_push (msg);
2233     return true;
2234   }
2235 
2236   auto_vec<char *> m_diagnostics;
2237 
2238  private:
2239   static lexer_diagnostic_sink *s_singleton;
2240 };
2241 
2242 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2243 
2244 /* Constructor.  Override line_table with a new instance based on CASE_,
2245    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2246    start parsing the tempfile.  */
2247 
lexer_test(const line_table_case & case_,const char * content,lexer_test_options * options)2248 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2249 			lexer_test_options *options)
2250 : m_ltt (case_),
2251   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2252   /* Create a tempfile and write the text to it.  */
2253   m_tempfile (SELFTEST_LOCATION, ".c", content),
2254   m_concats (),
2255   m_implicitly_expect_EOF (true)
2256 {
2257   if (options)
2258     options->apply (*this);
2259 
2260   cpp_init_iconv (m_parser);
2261 
2262   /* Parse the file.  */
2263   const char *fname = cpp_read_main_file (m_parser,
2264 					  m_tempfile.get_filename ());
2265   ASSERT_NE (fname, NULL);
2266 }
2267 
2268 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2269 
~lexer_test()2270 lexer_test::~lexer_test ()
2271 {
2272   location_t loc;
2273   const cpp_token *tok;
2274 
2275   if (m_implicitly_expect_EOF)
2276     {
2277       tok = cpp_get_token_with_location (m_parser, &loc);
2278       ASSERT_NE (tok, NULL);
2279       ASSERT_EQ (tok->type, CPP_EOF);
2280     }
2281 }
2282 
2283 /* Get the next token from m_parser.  */
2284 
2285 const cpp_token *
get_token()2286 lexer_test::get_token ()
2287 {
2288   location_t loc;
2289   const cpp_token *tok;
2290 
2291   tok = cpp_get_token_with_location (m_parser, &loc);
2292   ASSERT_NE (tok, NULL);
2293   return tok;
2294 }
2295 
2296 /* Verify that locations within string literals are correctly handled.  */
2297 
2298 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2299    using the string concatenation database for TEST.
2300 
2301    Assert that the character at index IDX is on EXPECTED_LINE,
2302    and that it begins at column EXPECTED_START_COL and ends at
2303    EXPECTED_FINISH_COL (unless the locations are beyond
2304    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2305    columns).  */
2306 
2307 static void
assert_char_at_range(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,int idx,int expected_line,int expected_start_col,int expected_finish_col)2308 assert_char_at_range (const location &loc,
2309 		      lexer_test& test,
2310 		      location_t strloc, enum cpp_ttype type, int idx,
2311 		      int expected_line, int expected_start_col,
2312 		      int expected_finish_col)
2313 {
2314   cpp_reader *pfile = test.m_parser;
2315   string_concat_db *concats = &test.m_concats;
2316 
2317   source_range actual_range = source_range();
2318   const char *err
2319     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2320 				 &actual_range);
2321   if (should_have_column_data_p (strloc))
2322     ASSERT_EQ_AT (loc, NULL, err);
2323   else
2324     {
2325       ASSERT_STREQ_AT (loc,
2326 		       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2327 		       err);
2328       return;
2329     }
2330 
2331   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2332   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2333   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2334   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2335 
2336   if (should_have_column_data_p (actual_range.m_start))
2337     {
2338       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2339       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2340     }
2341   if (should_have_column_data_p (actual_range.m_finish))
2342     {
2343       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2344       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2345     }
2346 }
2347 
/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
   the effective location of any errors, so that failures are reported
   at the point where this macro is used.  All other arguments are
   passed through unchanged.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
			     EXPECTED_START_COL, EXPECTED_FINISH_COL)	\
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
			(IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
			(EXPECTED_FINISH_COL))
2356 
2357 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2358    using the string concatenation database for TEST.
2359 
2360    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2361 
2362 static void
assert_num_substring_ranges(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,int expected_num_ranges)2363 assert_num_substring_ranges (const location &loc,
2364 			     lexer_test& test,
2365 			     location_t strloc,
2366 			     enum cpp_ttype type,
2367 			     int expected_num_ranges)
2368 {
2369   cpp_reader *pfile = test.m_parser;
2370   string_concat_db *concats = &test.m_concats;
2371 
2372   int actual_num_ranges = -1;
2373   const char *err
2374     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2375 					   &actual_num_ranges);
2376   if (should_have_column_data_p (strloc))
2377     ASSERT_EQ_AT (loc, NULL, err);
2378   else
2379     {
2380       ASSERT_STREQ_AT (loc,
2381 		       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2382 		       err);
2383       return;
2384     }
2385   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2386 }
2387 
/* Macro for calling assert_num_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors, so that
   failures are reported at the point where this macro is used.  All
   other arguments are passed through unchanged.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
				    EXPECTED_NUM_RANGES)		\
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
			       (TYPE), (EXPECTED_NUM_RANGES))
2395 
2396 
2397 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2398    returns an error (using the string concatenation database for TEST).  */
2399 
2400 static void
assert_has_no_substring_ranges(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,const char * expected_err)2401 assert_has_no_substring_ranges (const location &loc,
2402 				lexer_test& test,
2403 				location_t strloc,
2404 				enum cpp_ttype type,
2405 				const char *expected_err)
2406 {
2407   cpp_reader *pfile = test.m_parser;
2408   string_concat_db *concats = &test.m_concats;
2409   cpp_substring_ranges ranges;
2410   const char *actual_err
2411     = get_substring_ranges_for_loc (pfile, concats, strloc,
2412 				    type, ranges);
2413   if (should_have_column_data_p (strloc))
2414     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2415   else
2416     ASSERT_STREQ_AT (loc,
2417 		     "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2418 		     actual_err);
2419 }
2420 
/* Macro for calling assert_has_no_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.  ERR is
   the expected error message.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
				    (STRLOC), (TYPE), (ERR))
2424 
/* Lex a simple string literal, under the situation described by CASE_.
   Verify the substring location data, before and after running
   cpp_interpret_string on it.  */

static void
test_lexer_string_locations_simple (const line_table_case &case_)
{
  /* Digits 0-9 (with 0 at column 10), the simple way.
     ....................000000000.11111111112.2222222223333333333
     ....................123456789.01234567890.1234567890123456789
     We add a trailing comment to ensure that we correctly locate
     the end of the string literal token.  */
  const char *content = "        \"0123456789\" /* not a string */\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */

  ASSERT_EQ (tok->val.str.len, 12);

  /* Verify that cpp_interpret_string works.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
				      &dst_string, type);
  ASSERT_TRUE (result);
  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
  /* The interpreted text is released here, once it has been checked.  */
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify ranges of individual characters.  This no longer includes the
     opening quote, but does include the closing quote.  Index I is
     expected at column 10 + I, so index 10 is the closing quote at
     column 20.  */
  for (int i = 0; i <= 10; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
			  10 + i, 10 + i);

  /* Ten digits plus the closing quote.  */
  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
}
2468 
/* As test_lexer_string_locations_simple, but use an EBCDIC execution
   encoding.  The test is skipped entirely without iconv, and cut short
   if iconv reports that it cannot perform the conversion.  */

static void
test_lexer_string_locations_ebcdic (const line_table_case &case_)
{
  /* EBCDIC support requires iconv.  */
  if (!HAVE_ICONV)
    return;

  /* Digits 0-9 (with 0 at column 10), the simple way.
     ....................000000000.11111111112.2222222223333333333
     ....................123456789.01234567890.1234567890123456789
     We add a trailing comment to ensure that we correctly locate
     the end of the string literal token.  */
  const char *content = "        \"0123456789\" /* not a string */\n";
  /* The options object is declared before the lexer_test that uses it,
     so it is destroyed after the lexer_test.  */
  ebcdic_execution_charset use_ebcdic;
  lexer_test test (case_, content, &use_ebcdic);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */

  ASSERT_EQ (tok->val.str.len, 12);

  /* The remainder of the test requires an iconv implementation that
     can convert from UTF-8 to the EBCDIC encoding requested above.  */
  if (use_ebcdic.iconv_errors_occurred_p ())
    return;

  /* Verify that cpp_interpret_string works.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
				      &dst_string, type);
  ASSERT_TRUE (result);
  /* We should now have EBCDIC-encoded text, specifically
     IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
     The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
  ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
		(const char *)dst_string.text);
  /* The interpreted text is released here, once it has been checked.  */
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify that we don't attempt to record substring location information
     for such cases.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
2524 
2525 /* Lex a string literal containing a hex-escaped character.
2526    Verify the substring location data, before and after running
2527    cpp_interpret_string on it.  */
2528 
2529 static void
test_lexer_string_locations_hex(const line_table_case & case_)2530 test_lexer_string_locations_hex (const line_table_case &case_)
2531 {
2532   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2533      and with a space in place of digit 6, to terminate the escaped
2534      hex code.
2535      ....................000000000.111111.11112222.
2536      ....................123456789.012345.67890123.  */
2537   const char *content = "        \"01234\\x35 789\"\n";
2538   lexer_test test (case_, content, NULL);
2539 
2540   /* Verify that we get the expected token back, with the correct
2541      location information.  */
2542   const cpp_token *tok = test.get_token ();
2543   ASSERT_EQ (tok->type, CPP_STRING);
2544   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2545   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2546 
2547   /* At this point in lexing, the quote characters are treated as part of
2548      the string (they are stripped off by cpp_interpret_string).  */
2549   ASSERT_EQ (tok->val.str.len, 15);
2550 
2551   /* Verify that cpp_interpret_string works.  */
2552   cpp_string dst_string;
2553   const enum cpp_ttype type = CPP_STRING;
2554   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2555 				      &dst_string, type);
2556   ASSERT_TRUE (result);
2557   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2558   free (const_cast <unsigned char *> (dst_string.text));
2559 
2560   /* Verify ranges of individual characters.  This no longer includes the
2561      opening quote, but does include the closing quote.  */
2562   for (int i = 0; i <= 4; i++)
2563     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2564   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2565   for (int i = 6; i <= 10; i++)
2566     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2567 
2568   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2569 }
2570 
2571 /* Lex a string literal containing an octal-escaped character.
2572    Verify the substring location data after running cpp_interpret_string
2573    on it.  */
2574 
2575 static void
test_lexer_string_locations_oct(const line_table_case & case_)2576 test_lexer_string_locations_oct (const line_table_case &case_)
2577 {
2578   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2579      and with a space in place of digit 6, to terminate the escaped
2580      octal code.
2581      ....................000000000.111111.11112222.2222223333333333444
2582      ....................123456789.012345.67890123.4567890123456789012  */
2583   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2584   lexer_test test (case_, content, NULL);
2585 
2586   /* Verify that we get the expected token back, with the correct
2587      location information.  */
2588   const cpp_token *tok = test.get_token ();
2589   ASSERT_EQ (tok->type, CPP_STRING);
2590   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2591 
2592   /* Verify that cpp_interpret_string works.  */
2593   cpp_string dst_string;
2594   const enum cpp_ttype type = CPP_STRING;
2595   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2596 				      &dst_string, type);
2597   ASSERT_TRUE (result);
2598   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2599   free (const_cast <unsigned char *> (dst_string.text));
2600 
2601   /* Verify ranges of individual characters.  This no longer includes the
2602      opening quote, but does include the closing quote.  */
2603   for (int i = 0; i < 5; i++)
2604     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2605   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2606   for (int i = 6; i <= 10; i++)
2607     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2608 
2609   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2610 }
2611 
2612 /* Test of string literal containing letter escapes.  */
2613 
2614 static void
test_lexer_string_locations_letter_escape_1(const line_table_case & case_)2615 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2616 {
2617   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2618      .....................000000000.1.11111.1.1.11222.22222223333333
2619      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2620   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2621   lexer_test test (case_, content, NULL);
2622 
2623   /* Verify that we get the expected tokens back.  */
2624   const cpp_token *tok = test.get_token ();
2625   ASSERT_EQ (tok->type, CPP_STRING);
2626   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2627 
2628   /* Verify ranges of individual characters. */
2629   /* "\t".  */
2630   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2631 			0, 1, 10, 11);
2632   /* "foo". */
2633   for (int i = 1; i <= 3; i++)
2634     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2635 			  i, 1, 11 + i, 11 + i);
2636   /* "\\" and "\n".  */
2637   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2638 			4, 1, 15, 16);
2639   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2640 			5, 1, 17, 18);
2641 
2642   /* "bar" and closing quote for nul-terminator.  */
2643   for (int i = 6; i <= 9; i++)
2644     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2645 			  i, 1, 13 + i, 13 + i);
2646 
2647   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2648 }
2649 
2650 /* Another test of a string literal containing a letter escape.
2651    Based on string seen in
2652      printf ("%-%\n");
2653    in gcc.dg/format/c90-printf-1.c.  */
2654 
2655 static void
test_lexer_string_locations_letter_escape_2(const line_table_case & case_)2656 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2657 {
2658   /* .....................000000000.1111.11.1111.22222222223.
2659      .....................123456789.0123.45.6789.01234567890.  */
2660   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2661   lexer_test test (case_, content, NULL);
2662 
2663   /* Verify that we get the expected tokens back.  */
2664   const cpp_token *tok = test.get_token ();
2665   ASSERT_EQ (tok->type, CPP_STRING);
2666   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2667 
2668   /* Verify ranges of individual characters. */
2669   /* "%-%".  */
2670   for (int i = 0; i < 3; i++)
2671     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2672 			  i, 1, 10 + i, 10 + i);
2673   /* "\n".  */
2674   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2675 			3, 1, 13, 14);
2676 
2677   /* Closing quote for nul-terminator.  */
2678   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2679 			4, 1, 15, 15);
2680 
2681   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2682 }
2683 
2684 /* Lex a string literal containing UCN 4 characters.
2685    Verify the substring location data after running cpp_interpret_string
2686    on it.  */
2687 
2688 static void
test_lexer_string_locations_ucn4(const line_table_case & case_)2689 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2690 {
2691   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2692      as UCN 4.
2693      ....................000000000.111111.111122.222222223.33333333344444
2694      ....................123456789.012345.678901.234567890.12345678901234  */
2695   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2696   lexer_test test (case_, content, NULL);
2697 
2698   /* Verify that we get the expected token back, with the correct
2699      location information.  */
2700   const cpp_token *tok = test.get_token ();
2701   ASSERT_EQ (tok->type, CPP_STRING);
2702   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2703 
2704   /* Verify that cpp_interpret_string works.
2705      The string should be encoded in the execution character
2706      set.  Assuming that is UTF-8, we should have the following:
2707      -----------  ----  -----  -------  ----------------
2708      Byte offset  Byte  Octal  Unicode  Source Column(s)
2709      -----------  ----  -----  -------  ----------------
2710      0            0x30         '0'      10
2711      1            0x31         '1'      11
2712      2            0x32         '2'      12
2713      3            0x33         '3'      13
2714      4            0x34         '4'      14
2715      5            0xE2  \342   U+2174   15-20
2716      6            0x85  \205    (cont)  15-20
2717      7            0xB4  \264    (cont)  15-20
2718      8            0xE2  \342   U+2175   21-26
2719      9            0x85  \205    (cont)  21-26
2720      10           0xB5  \265    (cont)  21-26
2721      11           0x37         '7'      27
2722      12           0x38         '8'      28
2723      13           0x39         '9'      29
2724      14           0x00                  30 (closing quote)
2725      -----------  ----  -----  -------  ---------------.  */
2726 
2727   cpp_string dst_string;
2728   const enum cpp_ttype type = CPP_STRING;
2729   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2730 				      &dst_string, type);
2731   ASSERT_TRUE (result);
2732   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2733 		(const char *)dst_string.text);
2734   free (const_cast <unsigned char *> (dst_string.text));
2735 
2736   /* Verify ranges of individual characters.  This no longer includes the
2737      opening quote, but does include the closing quote.
2738      '01234'.  */
2739   for (int i = 0; i <= 4; i++)
2740     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2741   /* U+2174.  */
2742   for (int i = 5; i <= 7; i++)
2743     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2744   /* U+2175.  */
2745   for (int i = 8; i <= 10; i++)
2746     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2747   /* '789' and nul terminator  */
2748   for (int i = 11; i <= 14; i++)
2749     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2750 
2751   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2752 }
2753 
2754 /* Lex a string literal containing UCN 8 characters.
2755    Verify the substring location data after running cpp_interpret_string
2756    on it.  */
2757 
2758 static void
test_lexer_string_locations_ucn8(const line_table_case & case_)2759 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2760 {
2761   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2762      ....................000000000.111111.1111222222.2222333333333.344444
2763      ....................123456789.012345.6789012345.6789012345678.901234  */
2764   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2765   lexer_test test (case_, content, NULL);
2766 
2767   /* Verify that we get the expected token back, with the correct
2768      location information.  */
2769   const cpp_token *tok = test.get_token ();
2770   ASSERT_EQ (tok->type, CPP_STRING);
2771   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2772 			   "\"01234\\U00002174\\U00002175789\"");
2773 
2774   /* Verify that cpp_interpret_string works.
2775      The UTF-8 encoding of the string is identical to that from
2776      the ucn4 testcase above; the only difference is the column
2777      locations.  */
2778   cpp_string dst_string;
2779   const enum cpp_ttype type = CPP_STRING;
2780   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2781 				      &dst_string, type);
2782   ASSERT_TRUE (result);
2783   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2784 		(const char *)dst_string.text);
2785   free (const_cast <unsigned char *> (dst_string.text));
2786 
2787   /* Verify ranges of individual characters.  This no longer includes the
2788      opening quote, but does include the closing quote.
2789      '01234'.  */
2790   for (int i = 0; i <= 4; i++)
2791     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2792   /* U+2174.  */
2793   for (int i = 5; i <= 7; i++)
2794     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2795   /* U+2175.  */
2796   for (int i = 8; i <= 10; i++)
2797     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2798   /* '789' at columns 35-37  */
2799   for (int i = 11; i <= 13; i++)
2800     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2801   /* Closing quote/nul-terminator at column 38.  */
2802   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2803 
2804   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2805 }
2806 
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   (most significant byte first) and return it in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  uint32_t value = 0;

  /* Shift in one byte at a time, most significant byte first.  */
  for (int i = 0; i < 4; i++)
    value = (value << 8) | (uint32_t) bytes[i];

  return value;
}
2818 
2819 /* Lex a wide string literal and verify that attempts to read substring
2820    location data from it fail gracefully.  */
2821 
2822 static void
test_lexer_string_locations_wide_string(const line_table_case & case_)2823 test_lexer_string_locations_wide_string (const line_table_case &case_)
2824 {
2825   /* Digits 0-9.
2826      ....................000000000.11111111112.22222222233333
2827      ....................123456789.01234567890.12345678901234  */
2828   const char *content = "       L\"0123456789\" /* non-str */\n";
2829   lexer_test test (case_, content, NULL);
2830 
2831   /* Verify that we get the expected token back, with the correct
2832      location information.  */
2833   const cpp_token *tok = test.get_token ();
2834   ASSERT_EQ (tok->type, CPP_WSTRING);
2835   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2836 
2837   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2838   cpp_string dst_string;
2839   const enum cpp_ttype type = CPP_WSTRING;
2840   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2841 				      &dst_string, type);
2842   ASSERT_TRUE (result);
2843   /* The cpp_reader defaults to big-endian with
2844      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2845      now be encoded as UTF-32BE.  */
2846   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2847   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2848   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2849   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2850   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2851   free (const_cast <unsigned char *> (dst_string.text));
2852 
2853   /* We don't yet support generating substring location information
2854      for L"" strings.  */
2855   ASSERT_HAS_NO_SUBSTRING_RANGES
2856     (test, tok->src_loc, type,
2857      "execution character set != source character set");
2858 }
2859 
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   (most significant byte first) and return it in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];

  return (uint16_t) ((high << 8) | low);
}
2868 
2869 /* Lex a u"" string literal and verify that attempts to read substring
2870    location data from it fail gracefully.  */
2871 
2872 static void
test_lexer_string_locations_string16(const line_table_case & case_)2873 test_lexer_string_locations_string16 (const line_table_case &case_)
2874 {
2875   /* Digits 0-9.
2876      ....................000000000.11111111112.22222222233333
2877      ....................123456789.01234567890.12345678901234  */
2878   const char *content = "       u\"0123456789\" /* non-str */\n";
2879   lexer_test test (case_, content, NULL);
2880 
2881   /* Verify that we get the expected token back, with the correct
2882      location information.  */
2883   const cpp_token *tok = test.get_token ();
2884   ASSERT_EQ (tok->type, CPP_STRING16);
2885   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2886 
2887   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2888   cpp_string dst_string;
2889   const enum cpp_ttype type = CPP_STRING16;
2890   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2891 				      &dst_string, type);
2892   ASSERT_TRUE (result);
2893 
2894   /* The cpp_reader defaults to big-endian, so dst_string should
2895      now be encoded as UTF-16BE.  */
2896   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2897   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2898   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2899   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2900   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2901   free (const_cast <unsigned char *> (dst_string.text));
2902 
2903   /* We don't yet support generating substring location information
2904      for L"" strings.  */
2905   ASSERT_HAS_NO_SUBSTRING_RANGES
2906     (test, tok->src_loc, type,
2907      "execution character set != source character set");
2908 }
2909 
2910 /* Lex a U"" string literal and verify that attempts to read substring
2911    location data from it fail gracefully.  */
2912 
2913 static void
test_lexer_string_locations_string32(const line_table_case & case_)2914 test_lexer_string_locations_string32 (const line_table_case &case_)
2915 {
2916   /* Digits 0-9.
2917      ....................000000000.11111111112.22222222233333
2918      ....................123456789.01234567890.12345678901234  */
2919   const char *content = "       U\"0123456789\" /* non-str */\n";
2920   lexer_test test (case_, content, NULL);
2921 
2922   /* Verify that we get the expected token back, with the correct
2923      location information.  */
2924   const cpp_token *tok = test.get_token ();
2925   ASSERT_EQ (tok->type, CPP_STRING32);
2926   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2927 
2928   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2929   cpp_string dst_string;
2930   const enum cpp_ttype type = CPP_STRING32;
2931   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2932 				      &dst_string, type);
2933   ASSERT_TRUE (result);
2934 
2935   /* The cpp_reader defaults to big-endian, so dst_string should
2936      now be encoded as UTF-32BE.  */
2937   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2938   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2939   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2940   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2941   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2942   free (const_cast <unsigned char *> (dst_string.text));
2943 
2944   /* We don't yet support generating substring location information
2945      for L"" strings.  */
2946   ASSERT_HAS_NO_SUBSTRING_RANGES
2947     (test, tok->src_loc, type,
2948      "execution character set != source character set");
2949 }
2950 
2951 /* Lex a u8-string literal.
2952    Verify the substring location data after running cpp_interpret_string
2953    on it.  */
2954 
2955 static void
test_lexer_string_locations_u8(const line_table_case & case_)2956 test_lexer_string_locations_u8 (const line_table_case &case_)
2957 {
2958   /* Digits 0-9.
2959      ....................000000000.11111111112.22222222233333
2960      ....................123456789.01234567890.12345678901234  */
2961   const char *content = "      u8\"0123456789\" /* non-str */\n";
2962   lexer_test test (case_, content, NULL);
2963 
2964   /* Verify that we get the expected token back, with the correct
2965      location information.  */
2966   const cpp_token *tok = test.get_token ();
2967   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2968   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2969 
2970   /* Verify that cpp_interpret_string works.  */
2971   cpp_string dst_string;
2972   const enum cpp_ttype type = CPP_STRING;
2973   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2974 				      &dst_string, type);
2975   ASSERT_TRUE (result);
2976   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2977   free (const_cast <unsigned char *> (dst_string.text));
2978 
2979   /* Verify ranges of individual characters.  This no longer includes the
2980      opening quote, but does include the closing quote.  */
2981   for (int i = 0; i <= 10; i++)
2982     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2983 }
2984 
2985 /* Lex a string literal containing UTF-8 source characters.
2986    Verify the substring location data after running cpp_interpret_string
2987    on it.  */
2988 
2989 static void
test_lexer_string_locations_utf8_source(const line_table_case & case_)2990 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2991 {
2992  /* This string literal is written out to the source file as UTF-8,
2993     and is of the form "before mojibake after", where "mojibake"
2994     is written as the following four unicode code points:
2995        U+6587 CJK UNIFIED IDEOGRAPH-6587
2996        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2997        U+5316 CJK UNIFIED IDEOGRAPH-5316
2998        U+3051 HIRAGANA LETTER KE.
2999      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3000      "before" and "after" are 1 byte per unicode character.
3001 
3002      The numbering shown are "columns", which are *byte* numbers within
3003      the line, rather than unicode character numbers.
3004 
3005      .................... 000000000.1111111.
3006      .................... 123456789.0123456.  */
3007   const char *content = ("        \"before "
3008 			 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3009 			      UTF-8: 0xE6 0x96 0x87
3010 			      C octal escaped UTF-8: \346\226\207
3011 			    "column" numbers: 17-19.  */
3012 			 "\346\226\207"
3013 
3014 			 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3015 			      UTF-8: 0xE5 0xAD 0x97
3016 			      C octal escaped UTF-8: \345\255\227
3017 			    "column" numbers: 20-22.  */
3018 			 "\345\255\227"
3019 
3020 			 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3021 			      UTF-8: 0xE5 0x8C 0x96
3022 			      C octal escaped UTF-8: \345\214\226
3023 			    "column" numbers: 23-25.  */
3024 			 "\345\214\226"
3025 
3026 			 /* U+3051 HIRAGANA LETTER KE
3027 			      UTF-8: 0xE3 0x81 0x91
3028 			      C octal escaped UTF-8: \343\201\221
3029 			    "column" numbers: 26-28.  */
3030 			 "\343\201\221"
3031 
3032 			 /* column numbers 29 onwards
3033 			  2333333.33334444444444
3034 			  9012345.67890123456789. */
3035 			 " after\" /* non-str */\n");
3036   lexer_test test (case_, content, NULL);
3037 
3038   /* Verify that we get the expected token back, with the correct
3039      location information.  */
3040   const cpp_token *tok = test.get_token ();
3041   ASSERT_EQ (tok->type, CPP_STRING);
3042   ASSERT_TOKEN_AS_TEXT_EQ
3043     (test.m_parser, tok,
3044      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3045 
3046   /* Verify that cpp_interpret_string works.  */
3047   cpp_string dst_string;
3048   const enum cpp_ttype type = CPP_STRING;
3049   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3050 				      &dst_string, type);
3051   ASSERT_TRUE (result);
3052   ASSERT_STREQ
3053     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3054      (const char *)dst_string.text);
3055   free (const_cast <unsigned char *> (dst_string.text));
3056 
3057   /* Verify ranges of individual characters.  This no longer includes the
3058      opening quote, but does include the closing quote.
3059      Assuming that both source and execution encodings are UTF-8, we have
3060      a run of 25 octets in each, plus the NUL terminator.  */
3061   for (int i = 0; i < 25; i++)
3062     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3063   /* NUL-terminator should use the closing quote at column 35.  */
3064   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3065 
3066   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3067 }
3068 
3069 /* Test of string literal concatenation.  */
3070 
3071 static void
test_lexer_string_locations_concatenation_1(const line_table_case & case_)3072 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3073 {
3074   /* Digits 0-9.
3075      .....................000000000.111111.11112222222222
3076      .....................123456789.012345.67890123456789.  */
3077   const char *content = ("        \"01234\" /* non-str */\n"
3078 			 "        \"56789\" /* non-str */\n");
3079   lexer_test test (case_, content, NULL);
3080 
3081   location_t input_locs[2];
3082 
3083   /* Verify that we get the expected tokens back.  */
3084   auto_vec <cpp_string> input_strings;
3085   const cpp_token *tok_a = test.get_token ();
3086   ASSERT_EQ (tok_a->type, CPP_STRING);
3087   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3088   input_strings.safe_push (tok_a->val.str);
3089   input_locs[0] = tok_a->src_loc;
3090 
3091   const cpp_token *tok_b = test.get_token ();
3092   ASSERT_EQ (tok_b->type, CPP_STRING);
3093   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3094   input_strings.safe_push (tok_b->val.str);
3095   input_locs[1] = tok_b->src_loc;
3096 
3097   /* Verify that cpp_interpret_string works.  */
3098   cpp_string dst_string;
3099   const enum cpp_ttype type = CPP_STRING;
3100   bool result = cpp_interpret_string (test.m_parser,
3101 				      input_strings.address (), 2,
3102 				      &dst_string, type);
3103   ASSERT_TRUE (result);
3104   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3105   free (const_cast <unsigned char *> (dst_string.text));
3106 
3107   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3108   test.m_concats.record_string_concatenation (2, input_locs);
3109 
3110   location_t initial_loc = input_locs[0];
3111 
3112   /* "01234" on line 1.  */
3113   for (int i = 0; i <= 4; i++)
3114     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3115   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3116   for (int i = 5; i <= 10; i++)
3117     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3118 
3119   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3120 }
3121 
3122 /* Another test of string literal concatenation.  */
3123 
3124 static void
test_lexer_string_locations_concatenation_2(const line_table_case & case_)3125 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3126 {
3127   /* Digits 0-9.
3128      .....................000000000.111.11111112222222
3129      .....................123456789.012.34567890123456.  */
3130   const char *content = ("        \"01\" /* non-str */\n"
3131 			 "        \"23\" /* non-str */\n"
3132 			 "        \"45\" /* non-str */\n"
3133 			 "        \"67\" /* non-str */\n"
3134 			 "        \"89\" /* non-str */\n");
3135   lexer_test test (case_, content, NULL);
3136 
3137   auto_vec <cpp_string> input_strings;
3138   location_t input_locs[5];
3139 
3140   /* Verify that we get the expected tokens back.  */
3141   for (int i = 0; i < 5; i++)
3142     {
3143       const cpp_token *tok = test.get_token ();
3144       ASSERT_EQ (tok->type, CPP_STRING);
3145       input_strings.safe_push (tok->val.str);
3146       input_locs[i] = tok->src_loc;
3147     }
3148 
3149   /* Verify that cpp_interpret_string works.  */
3150   cpp_string dst_string;
3151   const enum cpp_ttype type = CPP_STRING;
3152   bool result = cpp_interpret_string (test.m_parser,
3153 				      input_strings.address (), 5,
3154 				      &dst_string, type);
3155   ASSERT_TRUE (result);
3156   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3157   free (const_cast <unsigned char *> (dst_string.text));
3158 
3159   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3160   test.m_concats.record_string_concatenation (5, input_locs);
3161 
3162   location_t initial_loc = input_locs[0];
3163 
3164   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3165      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3166      and expect get_source_range_for_substring to fail.
3167      However, for a string concatenation test, we can have a case
3168      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3169      but subsequent strings can be after it.
3170      Attempting to detect this within assert_char_at_range
3171      would overcomplicate the logic for the common test cases, so
3172      we detect it here.  */
3173   if (should_have_column_data_p (input_locs[0])
3174       && !should_have_column_data_p (input_locs[4]))
3175     {
3176       /* Verify that get_source_range_for_substring gracefully rejects
3177 	 this case.  */
3178       source_range actual_range;
3179       const char *err
3180 	= get_source_range_for_char (test.m_parser, &test.m_concats,
3181 				     initial_loc, type, 0, &actual_range);
3182       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3183       return;
3184     }
3185 
3186   for (int i = 0; i < 5; i++)
3187     for (int j = 0; j < 2; j++)
3188       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3189 			    i + 1, 10 + j, 10 + j);
3190 
3191   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3192   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3193 
3194   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3195 }
3196 
3197 /* Another test of string literal concatenation, this time combined with
3198    various kinds of escaped characters.  */
3199 
3200 static void
test_lexer_string_locations_concatenation_3(const line_table_case & case_)3201 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3202 {
3203   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3204      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3205   const char *content
3206     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3207        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3208     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3209   lexer_test test (case_, content, NULL);
3210 
3211   auto_vec <cpp_string> input_strings;
3212   location_t input_locs[4];
3213 
3214   /* Verify that we get the expected tokens back.  */
3215   for (int i = 0; i < 4; i++)
3216     {
3217       const cpp_token *tok = test.get_token ();
3218       ASSERT_EQ (tok->type, CPP_STRING);
3219       input_strings.safe_push (tok->val.str);
3220       input_locs[i] = tok->src_loc;
3221     }
3222 
3223   /* Verify that cpp_interpret_string works.  */
3224   cpp_string dst_string;
3225   const enum cpp_ttype type = CPP_STRING;
3226   bool result = cpp_interpret_string (test.m_parser,
3227 				      input_strings.address (), 4,
3228 				      &dst_string, type);
3229   ASSERT_TRUE (result);
3230   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3231   free (const_cast <unsigned char *> (dst_string.text));
3232 
3233   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3234   test.m_concats.record_string_concatenation (4, input_locs);
3235 
3236   location_t initial_loc = input_locs[0];
3237 
3238   for (int i = 0; i <= 4; i++)
3239     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3240   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3241   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3242   for (int i = 7; i <= 9; i++)
3243     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3244 
3245   /* NUL-terminator should use the location of the final closing quote.  */
3246   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3247 
3248   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3249 }
3250 
3251 /* Test of string literal in a macro.  */
3252 
3253 static void
test_lexer_string_locations_macro(const line_table_case & case_)3254 test_lexer_string_locations_macro (const line_table_case &case_)
3255 {
3256   /* Digits 0-9.
3257      .....................0000000001111111111.22222222223.
3258      .....................1234567890123456789.01234567890.  */
3259   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3260 			 "  MACRO");
3261   lexer_test test (case_, content, NULL);
3262 
3263   /* Verify that we get the expected tokens back.  */
3264   const cpp_token *tok = test.get_token ();
3265   ASSERT_EQ (tok->type, CPP_PADDING);
3266 
3267   tok = test.get_token ();
3268   ASSERT_EQ (tok->type, CPP_STRING);
3269   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3270 
3271   /* Verify ranges of individual characters.  We ought to
3272      see columns within the macro definition.  */
3273   for (int i = 0; i <= 10; i++)
3274     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3275 			  i, 1, 20 + i, 20 + i);
3276 
3277   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3278 
3279   tok = test.get_token ();
3280   ASSERT_EQ (tok->type, CPP_PADDING);
3281 }
3282 
3283 /* Test of stringification of a macro argument.  */
3284 
3285 static void
test_lexer_string_locations_stringified_macro_argument(const line_table_case & case_)3286 test_lexer_string_locations_stringified_macro_argument
3287   (const line_table_case &case_)
3288 {
3289   /* .....................000000000111111111122222222223.
3290      .....................123456789012345678901234567890.  */
3291   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3292 			 "MACRO(foo)\n");
3293   lexer_test test (case_, content, NULL);
3294 
3295   /* Verify that we get the expected token back.  */
3296   const cpp_token *tok = test.get_token ();
3297   ASSERT_EQ (tok->type, CPP_PADDING);
3298 
3299   tok = test.get_token ();
3300   ASSERT_EQ (tok->type, CPP_STRING);
3301   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3302 
3303   /* We don't support getting the location of a stringified macro
3304      argument.  Verify that it fails gracefully.  */
3305   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3306 				  "cpp_interpret_string_1 failed");
3307 
3308   tok = test.get_token ();
3309   ASSERT_EQ (tok->type, CPP_PADDING);
3310 
3311   tok = test.get_token ();
3312   ASSERT_EQ (tok->type, CPP_PADDING);
3313 }
3314 
/* Ensure that we fail gracefully if something attempts to pass
3316    in a location that isn't a string literal token.  Seen on this code:
3317 
3318      const char a[] = " %d ";
3319      __builtin_printf (a, 0.5);
3320                        ^
3321 
3322    when c-format.c erroneously used the indicated one-character
3323    location as the format string location, leading to a read past the
3324    end of a string buffer in cpp_interpret_string_1.  */
3325 
3326 static void
test_lexer_string_locations_non_string(const line_table_case & case_)3327 test_lexer_string_locations_non_string (const line_table_case &case_)
3328 {
3329   /* .....................000000000111111111122222222223.
3330      .....................123456789012345678901234567890.  */
3331   const char *content = ("         a\n");
3332   lexer_test test (case_, content, NULL);
3333 
3334   /* Verify that we get the expected token back.  */
3335   const cpp_token *tok = test.get_token ();
3336   ASSERT_EQ (tok->type, CPP_NAME);
3337   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3338 
3339   /* At this point, libcpp is attempting to interpret the name as a
3340      string literal, despite it not starting with a quote.  We don't detect
3341      that, but we should at least fail gracefully.  */
3342   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3343 				  "cpp_interpret_string_1 failed");
3344 }
3345 
3346 /* Ensure that we can read substring information for a token which
   starts in one linemap and ends in another.  Adapted from
3348    gcc.dg/cpp/pr69985.c.  */
3349 
3350 static void
test_lexer_string_locations_long_line(const line_table_case & case_)3351 test_lexer_string_locations_long_line (const line_table_case &case_)
3352 {
3353   /* .....................000000.000111111111
3354      .....................123456.789012346789.  */
3355   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3356 			 "     \"0123456789012345678901234567890123456789"
3357 			 "0123456789012345678901234567890123456789"
3358 			 "0123456789012345678901234567890123456789"
3359 			 "0123456789\"\n");
3360 
3361   lexer_test test (case_, content, NULL);
3362 
3363   /* Verify that we get the expected token back.  */
3364   const cpp_token *tok = test.get_token ();
3365   ASSERT_EQ (tok->type, CPP_STRING);
3366 
3367   if (!should_have_column_data_p (line_table->highest_location))
3368     return;
3369 
3370   /* Verify ranges of individual characters.  */
3371   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3372   for (int i = 0; i < 131; i++)
3373     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3374 			  i, 2, 7 + i, 7 + i);
3375 }
3376 
3377 /* Test of locations within a raw string that doesn't contain a newline.  */
3378 
3379 static void
test_lexer_string_locations_raw_string_one_line(const line_table_case & case_)3380 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3381 {
3382   /* .....................00.0000000111111111122.
3383      .....................12.3456789012345678901.  */
3384   const char *content = ("R\"foo(0123456789)foo\"\n");
3385   lexer_test test (case_, content, NULL);
3386 
3387   /* Verify that we get the expected token back.  */
3388   const cpp_token *tok = test.get_token ();
3389   ASSERT_EQ (tok->type, CPP_STRING);
3390 
3391   /* Verify that cpp_interpret_string works.  */
3392   cpp_string dst_string;
3393   const enum cpp_ttype type = CPP_STRING;
3394   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3395 				      &dst_string, type);
3396   ASSERT_TRUE (result);
3397   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3398   free (const_cast <unsigned char *> (dst_string.text));
3399 
3400   if (!should_have_column_data_p (line_table->highest_location))
3401     return;
3402 
3403   /* 0-9, plus the nil terminator.  */
3404   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3405   for (int i = 0; i < 11; i++)
3406     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3407 			  i, 1, 7 + i, 7 + i);
3408 }
3409 
3410 /* Test of locations within a raw string that contains a newline.  */
3411 
3412 static void
test_lexer_string_locations_raw_string_multiline(const line_table_case & case_)3413 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3414 {
3415   /* .....................00.0000.
3416      .....................12.3456.  */
3417   const char *content = ("R\"foo(\n"
3418   /* .....................00000.
3419      .....................12345.  */
3420 			 "hello\n"
3421 			 "world\n"
3422   /* .....................00000.
3423      .....................12345.  */
3424 			 ")foo\"\n");
3425   lexer_test test (case_, content, NULL);
3426 
3427   /* Verify that we get the expected token back.  */
3428   const cpp_token *tok = test.get_token ();
3429   ASSERT_EQ (tok->type, CPP_STRING);
3430 
3431   /* Verify that cpp_interpret_string works.  */
3432   cpp_string dst_string;
3433   const enum cpp_ttype type = CPP_STRING;
3434   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3435 				      &dst_string, type);
3436   ASSERT_TRUE (result);
3437   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3438   free (const_cast <unsigned char *> (dst_string.text));
3439 
3440   if (!should_have_column_data_p (line_table->highest_location))
3441     return;
3442 
3443   /* Currently we don't support locations within raw strings that
3444      contain newlines.  */
3445   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3446 				  "range endpoints are on different lines");
3447 }
3448 
3449 /* Test of parsing an unterminated raw string.  */
3450 
3451 static void
test_lexer_string_locations_raw_string_unterminated(const line_table_case & case_)3452 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3453 {
3454   const char *content = "R\"ouch()ouCh\" /* etc */";
3455 
3456   lexer_diagnostic_sink diagnostics;
3457   lexer_test test (case_, content, &diagnostics);
3458   test.m_implicitly_expect_EOF = false;
3459 
3460   /* Attempt to parse the raw string.  */
3461   const cpp_token *tok = test.get_token ();
3462   ASSERT_EQ (tok->type, CPP_EOF);
3463 
3464   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3465   /* We expect the message "unterminated raw string"
3466      in the "cpplib" translation domain.
3467      It's not clear that dgettext is available on all supported hosts,
3468      so this assertion is commented-out for now.
3469        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3470                      diagnostics.m_diagnostics[0]);
3471   */
3472 }
3473 
3474 /* Test of lexing char constants.  */
3475 
3476 static void
test_lexer_char_constants(const line_table_case & case_)3477 test_lexer_char_constants (const line_table_case &case_)
3478 {
3479   /* Various char constants.
3480      .....................0000000001111111111.22222222223.
3481      .....................1234567890123456789.01234567890.  */
3482   const char *content = ("         'a'\n"
3483 			 "        u'a'\n"
3484 			 "        U'a'\n"
3485 			 "        L'a'\n"
3486 			 "         'abc'\n");
3487   lexer_test test (case_, content, NULL);
3488 
3489   /* Verify that we get the expected tokens back.  */
3490   /* 'a'.  */
3491   const cpp_token *tok = test.get_token ();
3492   ASSERT_EQ (tok->type, CPP_CHAR);
3493   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3494 
3495   unsigned int chars_seen;
3496   int unsignedp;
3497   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3498 					  &chars_seen, &unsignedp);
3499   ASSERT_EQ (cc, 'a');
3500   ASSERT_EQ (chars_seen, 1);
3501 
3502   /* u'a'.  */
3503   tok = test.get_token ();
3504   ASSERT_EQ (tok->type, CPP_CHAR16);
3505   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3506 
3507   /* U'a'.  */
3508   tok = test.get_token ();
3509   ASSERT_EQ (tok->type, CPP_CHAR32);
3510   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3511 
3512   /* L'a'.  */
3513   tok = test.get_token ();
3514   ASSERT_EQ (tok->type, CPP_WCHAR);
3515   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3516 
3517   /* 'abc' (c-char-sequence).  */
3518   tok = test.get_token ();
3519   ASSERT_EQ (tok->type, CPP_CHAR);
3520   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3521 }
/* A table of interesting location_t values, giving one axis of our test
   matrix.  for_each_line_table_case passes each entry in turn as the
   second argument when constructing a line_table_case, and afterwards
   asserts that 2 * 12 cases were run, i.e. that this table has exactly
   12 entries; that assertion needs updating whenever an entry is added
   or removed here.  */

static const location_t boundary_locations[] = {
  /* Zero means "don't override the default values for a new line_table".  */
  0,

  /* An arbitrary non-zero value that isn't close to one of
     the boundary values below.  */
  0x10000,

  /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES: a little
     below it, immediately either side of it, exactly on it, and a
     little above it.  */
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,

  /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS, using the same
     offsets as above.  */
  LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
  LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
  LINE_MAP_MAX_LOCATION_WITH_COLS,
  LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
  LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
};
3547 
3548 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3549 
3550 void
for_each_line_table_case(void (* testcase)(const line_table_case &))3551 for_each_line_table_case (void (*testcase) (const line_table_case &))
3552 {
3553   /* As noted above in the description of struct line_table_case,
3554      we want to explore a test matrix of interesting line_table
3555      situations, running various selftests for each case within the
3556      matrix.  */
3557 
3558   /* Run all tests with:
3559      (a) line_table->default_range_bits == 0, and
3560      (b) line_table->default_range_bits == 5.  */
3561   int num_cases_tested = 0;
3562   for (int default_range_bits = 0; default_range_bits <= 5;
3563        default_range_bits += 5)
3564     {
3565       /* ...and use each of the "interesting" location values as
3566 	 the starting location within line_table.  */
3567       const int num_boundary_locations
3568 	= sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3569       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3570 	{
3571 	  line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3572 
3573 	  testcase (c);
3574 
3575 	  num_cases_tested++;
3576 	}
3577     }
3578 
3579   /* Verify that we fully covered the test matrix.  */
3580   ASSERT_EQ (num_cases_tested, 2 * 12);
3581 }
3582 
3583 /* Verify that when presented with a consecutive pair of locations with
3584    a very large line offset, we don't attempt to consolidate them into
3585    a single ordinary linemap where the line offsets within the line map
3586    would lead to overflow (PR lto/88147).  */
3587 
3588 static void
test_line_offset_overflow()3589 test_line_offset_overflow ()
3590 {
3591   line_table_test ltt (line_table_case (5, 0));
3592 
3593   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3594   linemap_line_start (line_table, 1, 100);
3595   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3596   assert_loceq ("foo.c", 2578, 0, loc_a);
3597 
3598   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3599   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3600   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3601 
3602   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3603   assert_loceq ("foo.c", 404198, 0, loc_b);
3604 
3605   /* We should have started a new linemap, rather than attempting to store
3606      a very large line offset.  */
3607   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3608   ASSERT_NE (ordmap_a, ordmap_b);
3609 }
3610 
test_cpp_utf8()3611 void test_cpp_utf8 ()
3612 {
3613   const int def_tabstop = 8;
3614   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3615   {
3616     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop);
3617     ASSERT_EQ (8, w_bad);
3618     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop);
3619     ASSERT_EQ (5, w_ctrl);
3620   }
3621 
3622   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3623   {
3624     const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop);
3625     ASSERT_EQ (1, w_pi);
3626     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop);
3627     ASSERT_EQ (2, w_emoji);
3628     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3629 							def_tabstop);
3630     ASSERT_EQ (1, w_umlaut_precomposed);
3631     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3632 						      def_tabstop);
3633     ASSERT_EQ (1, w_umlaut_combining);
3634     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop);
3635     ASSERT_EQ (2, w_han);
3636     const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop);
3637     ASSERT_EQ (3, w_ascii);
3638     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3639 					   "\x9f! \xe4\xb8\xba y\xcc\x88",
3640 					   24, def_tabstop);
3641     ASSERT_EQ (18, w_mixed);
3642   }
3643 
3644   /* Verify that display width properly expands tabs.  */
3645   {
3646     const char *tstr = "\tabc\td";
3647     ASSERT_EQ (6, cpp_display_width (tstr, 6, 1));
3648     ASSERT_EQ (10, cpp_display_width (tstr, 6, 3));
3649     ASSERT_EQ (17, cpp_display_width (tstr, 6, 8));
3650     ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8));
3651   }
3652 
3653   /* Verify that cpp_byte_column_to_display_column can go past the end,
3654      and similar edge cases.  */
3655   {
3656     const char *str
3657       /* Display columns.
3658          111111112345  */
3659       = "\xcf\x80 abc";
3660       /* 111122223456
3661 	 Byte columns.  */
3662 
3663     ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop));
3664     ASSERT_EQ (105,
3665 	       cpp_byte_column_to_display_column (str, 6, 106, def_tabstop));
3666     ASSERT_EQ (10000,
3667 	       cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop));
3668     ASSERT_EQ (0,
3669 	       cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop));
3670   }
3671 
3672   /* Verify that cpp_display_column_to_byte_column can go past the end,
3673      and similar edge cases, and check invertibility.  */
3674   {
3675     const char *str
3676       /* Display columns.
3677 	 000000000000000000000000000000000000011
3678 	 111111112222222234444444455555555678901  */
3679       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3680       /* 000000000000000000000000000000000111111
3681 	 111122223333444456666777788889999012345
3682 	 Byte columns.  */
3683     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop));
3684     ASSERT_EQ (15,
3685 	       cpp_display_column_to_byte_column (str, 15, 11, def_tabstop));
3686     ASSERT_EQ (115,
3687 	       cpp_display_column_to_byte_column (str, 15, 111, def_tabstop));
3688     ASSERT_EQ (10000,
3689 	       cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop));
3690     ASSERT_EQ (0,
3691 	       cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop));
3692 
3693     /* Verify that we do not interrupt a UTF-8 sequence.  */
3694     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop));
3695 
3696     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3697       {
3698 	const int disp_col
3699 	  = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop);
3700 	const int byte_col2
3701 	  = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop);
3702 
3703 	/* If we ask for the display column in the middle of a UTF-8
3704 	   sequence, it will return the length of the partial sequence,
3705 	   matching the behavior of GCC before display column support.
3706 	   Otherwise check the round trip was successful.  */
3707 	if (byte_col < 4)
3708 	  ASSERT_EQ (byte_col, disp_col);
3709 	else if (byte_col >= 6 && byte_col < 9)
3710 	  ASSERT_EQ (3 + (byte_col - 5), disp_col);
3711 	else
3712 	  ASSERT_EQ (byte_col2, byte_col);
3713       }
3714   }
3715 
3716 }
3717 
3718 /* Run all of the selftests within this file.  */
3719 
3720 void
input_c_tests()3721 input_c_tests ()
3722 {
3723   test_linenum_comparisons ();
3724   test_should_have_column_data_p ();
3725   test_unknown_location ();
3726   test_builtins ();
3727   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3728 
3729   for_each_line_table_case (test_accessing_ordinary_linemaps);
3730   for_each_line_table_case (test_lexer);
3731   for_each_line_table_case (test_lexer_string_locations_simple);
3732   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3733   for_each_line_table_case (test_lexer_string_locations_hex);
3734   for_each_line_table_case (test_lexer_string_locations_oct);
3735   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3736   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3737   for_each_line_table_case (test_lexer_string_locations_ucn4);
3738   for_each_line_table_case (test_lexer_string_locations_ucn8);
3739   for_each_line_table_case (test_lexer_string_locations_wide_string);
3740   for_each_line_table_case (test_lexer_string_locations_string16);
3741   for_each_line_table_case (test_lexer_string_locations_string32);
3742   for_each_line_table_case (test_lexer_string_locations_u8);
3743   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3744   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3745   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3746   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3747   for_each_line_table_case (test_lexer_string_locations_macro);
3748   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3749   for_each_line_table_case (test_lexer_string_locations_non_string);
3750   for_each_line_table_case (test_lexer_string_locations_long_line);
3751   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3752   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3753   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3754   for_each_line_table_case (test_lexer_char_constants);
3755 
3756   test_reading_source_line ();
3757 
3758   test_line_offset_overflow ();
3759 
3760   test_cpp_utf8 ();
3761 }
3762 
3763 } // namespace selftest
3764 
3765 #endif /* CHECKING_P */
3766