1 /* Data and functions related to line maps and input files.
2    Copyright (C) 2004-2020 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "diagnostic-core.h"
26 #include "selftest.h"
27 #include "cpplib.h"
28 
29 #ifndef HAVE_ICONV
30 #define HAVE_ICONV 0
31 #endif
32 
33 /* This is a cache used by get_next_line to store the content of a
34    file to be searched for file lines.  */
35 class fcache
36 {
37 public:
38   /* These are information used to store a line boundary.  */
39   class line_info
40   {
41   public:
42     /* The line number.  It starts from 1.  */
43     size_t line_num;
44 
45     /* The position (byte count) of the beginning of the line,
46        relative to the file data pointer.  This starts at zero.  */
47     size_t start_pos;
48 
49     /* The position (byte count) of the last byte of the line.  This
50        normally points to the '\n' character, or to one byte after the
51        last byte of the file, if the file doesn't contain a '\n'
52        character.  */
53     size_t end_pos;
54 
line_info(size_t l,size_t s,size_t e)55     line_info (size_t l, size_t s, size_t e)
56       : line_num (l), start_pos (s), end_pos (e)
57     {}
58 
line_info()59     line_info ()
60       :line_num (0), start_pos (0), end_pos (0)
61     {}
62   };
63 
64   /* The number of time this file has been accessed.  This is used
65      to designate which file cache to evict from the cache
66      array.  */
67   unsigned use_count;
68 
69   /* The file_path is the key for identifying a particular file in
70      the cache.
71      For libcpp-using code, the underlying buffer for this field is
72      owned by the corresponding _cpp_file within the cpp_reader.  */
73   const char *file_path;
74 
75   FILE *fp;
76 
77   /* This points to the content of the file that we've read so
78      far.  */
79   char *data;
80 
81   /*  The size of the DATA array above.*/
82   size_t size;
83 
84   /* The number of bytes read from the underlying file so far.  This
85      must be less (or equal) than SIZE above.  */
86   size_t nb_read;
87 
88   /* The index of the beginning of the current line.  */
89   size_t line_start_idx;
90 
91   /* The number of the previous line read.  This starts at 1.  Zero
92      means we've read no line so far.  */
93   size_t line_num;
94 
95   /* This is the total number of lines of the current file.  At the
96      moment, we try to get this information from the line map
97      subsystem.  Note that this is just a hint.  When using the C++
98      front-end, this hint is correct because the input file is then
99      completely tokenized before parsing starts; so the line map knows
100      the number of lines before compilation really starts.  For e.g,
101      the C front-end, it can happen that we start emitting diagnostics
102      before the line map has seen the end of the file.  */
103   size_t total_lines;
104 
105   /* Could this file be missing a trailing newline on its final line?
106      Initially true (to cope with empty files), set to true/false
107      as each line is read.  */
108   bool missing_trailing_newline;
109 
110   /* This is a record of the beginning and end of the lines we've seen
111      while reading the file.  This is useful to avoid walking the data
112      from the beginning when we are asked to read a line that is
113      before LINE_START_IDX above.  Note that the maximum size of this
114      record is fcache_line_record_size, so that the memory consumption
115      doesn't explode.  We thus scale total_lines down to
116      fcache_line_record_size.  */
117   vec<line_info, va_heap> line_record;
118 
119   fcache ();
120   ~fcache ();
121 };
122 
/* Current position in real source file.  */

location_t input_location = UNKNOWN_LOCATION;

/* The set of line maps that the location-expansion functions in this
   file pass to the linemap_* routines.  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;

/* The array of per-file caches.  It is allocated lazily by
   diagnostic_file_cache_init and released by
   diagnostic_file_cache_fini.  */
static fcache *fcache_tab;
/* Number of entries in FCACHE_TAB, i.e. the maximum number of files
   cached at the same time.  */
static const size_t fcache_tab_size = 16;
/* Initial size, in bytes, of the data buffer of a cache entry; see
   maybe_grow, which doubles the buffer when it is full.  */
static const size_t fcache_buffer_size = 4 * 1024;
/* Maximum number of line boundaries kept in fcache::line_record.  */
static const size_t fcache_line_record_size = 100;
140 
141 /* Expand the source location LOC into a human readable location.  If
142    LOC resolves to a builtin location, the file name of the readable
143    location is set to the string "<built-in>". If EXPANSION_POINT_P is
144    TRUE and LOC is virtual, then it is resolved to the expansion
145    point of the involved macro.  Otherwise, it is resolved to the
146    spelling location of the token.
147 
148    When resolving to the spelling location of the token, if the
149    resulting location is for a built-in location (that is, it has no
150    associated line/column) in the context of a macro expansion, the
151    returned location is the first one (while unwinding the macro
152    location towards its expansion point) that is in real source
153    code.
154 
155    ASPECT controls which part of the location to use.  */
156 
157 static expanded_location
expand_location_1(location_t loc,bool expansion_point_p,enum location_aspect aspect)158 expand_location_1 (location_t loc,
159 		   bool expansion_point_p,
160 		   enum location_aspect aspect)
161 {
162   expanded_location xloc;
163   const line_map_ordinary *map;
164   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
165   tree block = NULL;
166 
167   if (IS_ADHOC_LOC (loc))
168     {
169       block = LOCATION_BLOCK (loc);
170       loc = LOCATION_LOCUS (loc);
171     }
172 
173   memset (&xloc, 0, sizeof (xloc));
174 
175   if (loc >= RESERVED_LOCATION_COUNT)
176     {
177       if (!expansion_point_p)
178 	{
179 	  /* We want to resolve LOC to its spelling location.
180 
181 	     But if that spelling location is a reserved location that
182 	     appears in the context of a macro expansion (like for a
183 	     location for a built-in token), let's consider the first
184 	     location (toward the expansion point) that is not reserved;
185 	     that is, the first location that is in real source code.  */
186 	  loc = linemap_unwind_to_first_non_reserved_loc (line_table,
187 							  loc, NULL);
188 	  lrk = LRK_SPELLING_LOCATION;
189 	}
190       loc = linemap_resolve_location (line_table, loc, lrk, &map);
191 
192       /* loc is now either in an ordinary map, or is a reserved location.
193 	 If it is a compound location, the caret is in a spelling location,
194 	 but the start/finish might still be a virtual location.
195 	 Depending of what the caller asked for, we may need to recurse
196 	 one level in order to resolve any virtual locations in the
197 	 end-points.  */
198       switch (aspect)
199 	{
200 	default:
201 	  gcc_unreachable ();
202 	  /* Fall through.  */
203 	case LOCATION_ASPECT_CARET:
204 	  break;
205 	case LOCATION_ASPECT_START:
206 	  {
207 	    location_t start = get_start (loc);
208 	    if (start != loc)
209 	      return expand_location_1 (start, expansion_point_p, aspect);
210 	  }
211 	  break;
212 	case LOCATION_ASPECT_FINISH:
213 	  {
214 	    location_t finish = get_finish (loc);
215 	    if (finish != loc)
216 	      return expand_location_1 (finish, expansion_point_p, aspect);
217 	  }
218 	  break;
219 	}
220       xloc = linemap_expand_location (line_table, map, loc);
221     }
222 
223   xloc.data = block;
224   if (loc <= BUILTINS_LOCATION)
225     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
226 
227   return xloc;
228 }
229 
230 /* Initialize the set of cache used for files accessed by caret
231    diagnostic.  */
232 
233 static void
diagnostic_file_cache_init(void)234 diagnostic_file_cache_init (void)
235 {
236   if (fcache_tab == NULL)
237     fcache_tab = new fcache[fcache_tab_size];
238 }
239 
240 /* Free the resources used by the set of cache used for files accessed
241    by caret diagnostic.  */
242 
243 void
diagnostic_file_cache_fini(void)244 diagnostic_file_cache_fini (void)
245 {
246   if (fcache_tab)
247     {
248       delete [] (fcache_tab);
249       fcache_tab = NULL;
250     }
251 }
252 
253 /* Return the total lines number that have been read so far by the
254    line map (in the preprocessor) so far.  For languages like C++ that
255    entirely preprocess the input file before starting to parse, this
256    equals the actual number of lines of the file.  */
257 
258 static size_t
total_lines_num(const char * file_path)259 total_lines_num (const char *file_path)
260 {
261   size_t r = 0;
262   location_t l = 0;
263   if (linemap_get_file_highest_location (line_table, file_path, &l))
264     {
265       gcc_assert (l >= RESERVED_LOCATION_COUNT);
266       expanded_location xloc = expand_location (l);
267       r = xloc.line;
268     }
269   return r;
270 }
271 
272 /* Lookup the cache used for the content of a given file accessed by
273    caret diagnostic.  Return the found cached file, or NULL if no
274    cached file was found.  */
275 
276 static fcache*
lookup_file_in_cache_tab(const char * file_path)277 lookup_file_in_cache_tab (const char *file_path)
278 {
279   if (file_path == NULL)
280     return NULL;
281 
282   diagnostic_file_cache_init ();
283 
284   /* This will contain the found cached file.  */
285   fcache *r = NULL;
286   for (unsigned i = 0; i < fcache_tab_size; ++i)
287     {
288       fcache *c = &fcache_tab[i];
289       if (c->file_path && !strcmp (c->file_path, file_path))
290 	{
291 	  ++c->use_count;
292 	  r = c;
293 	}
294     }
295 
296   if (r)
297     ++r->use_count;
298 
299   return r;
300 }
301 
302 /* Purge any mention of FILENAME from the cache of files used for
303    printing source code.  For use in selftests when working
304    with tempfiles.  */
305 
306 void
diagnostics_file_cache_forcibly_evict_file(const char * file_path)307 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
308 {
309   gcc_assert (file_path);
310 
311   fcache *r = lookup_file_in_cache_tab (file_path);
312   if (!r)
313     /* Not found.  */
314     return;
315 
316   r->file_path = NULL;
317   if (r->fp)
318     fclose (r->fp);
319   r->fp = NULL;
320   r->nb_read = 0;
321   r->line_start_idx = 0;
322   r->line_num = 0;
323   r->line_record.truncate (0);
324   r->use_count = 0;
325   r->total_lines = 0;
326   r->missing_trailing_newline = true;
327 }
328 
329 /* Return the file cache that has been less used, recently, or the
330    first empty one.  If HIGHEST_USE_COUNT is non-null,
331    *HIGHEST_USE_COUNT is set to the highest use count of the entries
332    in the cache table.  */
333 
334 static fcache*
evicted_cache_tab_entry(unsigned * highest_use_count)335 evicted_cache_tab_entry (unsigned *highest_use_count)
336 {
337   diagnostic_file_cache_init ();
338 
339   fcache *to_evict = &fcache_tab[0];
340   unsigned huc = to_evict->use_count;
341   for (unsigned i = 1; i < fcache_tab_size; ++i)
342     {
343       fcache *c = &fcache_tab[i];
344       bool c_is_empty = (c->file_path == NULL);
345 
346       if (c->use_count < to_evict->use_count
347 	  || (to_evict->file_path && c_is_empty))
348 	/* We evict C because it's either an entry with a lower use
349 	   count or one that is empty.  */
350 	to_evict = c;
351 
352       if (huc < c->use_count)
353 	huc = c->use_count;
354 
355       if (c_is_empty)
356 	/* We've reached the end of the cache; subsequent elements are
357 	   all empty.  */
358 	break;
359     }
360 
361   if (highest_use_count)
362     *highest_use_count = huc;
363 
364   return to_evict;
365 }
366 
367 /* Create the cache used for the content of a given file to be
368    accessed by caret diagnostic.  This cache is added to an array of
369    cache and can be retrieved by lookup_file_in_cache_tab.  This
370    function returns the created cache.  Note that only the last
371    fcache_tab_size files are cached.  */
372 
373 static fcache*
add_file_to_cache_tab(const char * file_path)374 add_file_to_cache_tab (const char *file_path)
375 {
376 
377   FILE *fp = fopen (file_path, "r");
378   if (fp == NULL)
379     return NULL;
380 
381   unsigned highest_use_count = 0;
382   fcache *r = evicted_cache_tab_entry (&highest_use_count);
383   r->file_path = file_path;
384   if (r->fp)
385     fclose (r->fp);
386   r->fp = fp;
387   r->nb_read = 0;
388   r->line_start_idx = 0;
389   r->line_num = 0;
390   r->line_record.truncate (0);
391   /* Ensure that this cache entry doesn't get evicted next time
392      add_file_to_cache_tab is called.  */
393   r->use_count = ++highest_use_count;
394   r->total_lines = total_lines_num (file_path);
395   r->missing_trailing_newline = true;
396 
397   return r;
398 }
399 
400 /* Lookup the cache used for the content of a given file accessed by
401    caret diagnostic.  If no cached file was found, create a new cache
402    for this file, add it to the array of cached file and return
403    it.  */
404 
405 static fcache*
lookup_or_add_file_to_cache_tab(const char * file_path)406 lookup_or_add_file_to_cache_tab (const char *file_path)
407 {
408   fcache *r = lookup_file_in_cache_tab (file_path);
409   if (r == NULL)
410     r = add_file_to_cache_tab (file_path);
411   return r;
412 }
413 
414 /* Default constructor for a cache of file used by caret
415    diagnostic.  */
416 
fcache()417 fcache::fcache ()
418 : use_count (0), file_path (NULL), fp (NULL), data (0),
419   size (0), nb_read (0), line_start_idx (0), line_num (0),
420   total_lines (0), missing_trailing_newline (true)
421 {
422   line_record.create (0);
423 }
424 
425 /* Destructor for a cache of file used by caret diagnostic.  */
426 
~fcache()427 fcache::~fcache ()
428 {
429   if (fp)
430     {
431       fclose (fp);
432       fp = NULL;
433     }
434   if (data)
435     {
436       XDELETEVEC (data);
437       data = 0;
438     }
439   line_record.release ();
440 }
441 
442 /* Returns TRUE iff the cache would need to be filled with data coming
443    from the file.  That is, either the cache is empty or full or the
444    current line is empty.  Note that if the cache is full, it would
445    need to be extended and filled again.  */
446 
447 static bool
needs_read(fcache * c)448 needs_read (fcache *c)
449 {
450   return (c->nb_read == 0
451 	  || c->nb_read == c->size
452 	  || (c->line_start_idx >= c->nb_read - 1));
453 }
454 
455 /*  Return TRUE iff the cache is full and thus needs to be
456     extended.  */
457 
458 static bool
needs_grow(fcache * c)459 needs_grow (fcache *c)
460 {
461   return c->nb_read == c->size;
462 }
463 
464 /* Grow the cache if it needs to be extended.  */
465 
466 static void
maybe_grow(fcache * c)467 maybe_grow (fcache *c)
468 {
469   if (!needs_grow (c))
470     return;
471 
472   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
473   c->data = XRESIZEVEC (char, c->data, size);
474   c->size = size;
475 }
476 
477 /*  Read more data into the cache.  Extends the cache if need be.
478     Returns TRUE iff new data could be read.  */
479 
480 static bool
read_data(fcache * c)481 read_data (fcache *c)
482 {
483   if (feof (c->fp) || ferror (c->fp))
484     return false;
485 
486   maybe_grow (c);
487 
488   char * from = c->data + c->nb_read;
489   size_t to_read = c->size - c->nb_read;
490   size_t nb_read = fread (from, 1, to_read, c->fp);
491 
492   if (ferror (c->fp))
493     return false;
494 
495   c->nb_read += nb_read;
496   return !!nb_read;
497 }
498 
499 /* Read new data iff the cache needs to be filled with more data
500    coming from the file FP.  Return TRUE iff the cache was filled with
501    mode data.  */
502 
503 static bool
maybe_read_data(fcache * c)504 maybe_read_data (fcache *c)
505 {
506   if (!needs_read (c))
507     return false;
508   return read_data (c);
509 }
510 
/* Read the next line of the file cached in C, reading more data from
   the underlying file into the cache when needed.  Upon successful
   completion, *LINE is set to the beginning of the line found and
   *LINE_LEN to its length.  *LINE points directly into the cache
   data; it is not nul-terminated, does not include the terminating
   '\n', and is only valid until the next call of get_next_line.

   The function also advances C->line_num and C->line_start_idx,
   updates C->missing_trailing_newline and, while the total_lines
   hint holds, records the boundaries of the line in
   C->line_record.

   Return true if a line was read or processed from the cache, false
   if there is no more data or if the stream is in error.  */

static bool
get_next_line (fcache *c, char **line, ssize_t *line_len)
{
  /* Fill the cache with data to process.  */
  maybe_read_data (c);

  size_t remaining_size = c->nb_read - c->line_start_idx;
  if (remaining_size == 0)
    /* There is no more data to process.  */
    return false;

  char *line_start = c->data + c->line_start_idx;

  char *next_line_start = NULL;
  size_t len = 0;
  char *line_end = (char *) memchr (line_start, '\n', remaining_size);
  if (line_end == NULL)
    {
      /* We haven't found the end-of-line delimiter in the cache.
	 Fill the cache with more data from the file and look for the
	 '\n'.  LINE_START is recomputed after each read because
	 maybe_grow may have moved c->data.  */
      while (maybe_read_data (c))
	{
	  line_start = c->data + c->line_start_idx;
	  remaining_size = c->nb_read - c->line_start_idx;
	  line_end = (char *) memchr (line_start, '\n', remaining_size);
	  if (line_end != NULL)
	    {
	      next_line_start = line_end + 1;
	      break;
	    }
	}
      if (line_end == NULL)
	{
	  /* We've loaded all the file into the cache and still no
	     '\n'.  Let's say the line ends one byte past the end of
	     the file.  This is to stay consistent with the case where
	     the line ends with a '\n' and line_end points to that
	     terminal '\n'.  That consistency is useful below in the
	     len calculation.  */
	  line_end = c->data + c->nb_read ;
	  c->missing_trailing_newline = true;
	}
      else
	c->missing_trailing_newline = false;
    }
  else
    {
      next_line_start = line_end + 1;
      c->missing_trailing_newline = false;
    }

  /* Don't hand out a line if reading the file failed.  */
  if (ferror (c->fp))
    return false;

  /* At this point, we've found the end of the line.  It either
     points to the '\n' or to one byte after the last byte of the
     file.  */
  gcc_assert (line_end != NULL);

  len = line_end - line_start;

  if (c->line_start_idx < c->nb_read)
    *line = line_start;

  ++c->line_num;

  /* Before we update our line record, make sure the hint about the
     total number of lines of the file is correct.  If it's not, then
     we give up recording line boundaries from now on.  */
  bool update_line_record = true;
  if (c->line_num > c->total_lines)
    update_line_record = false;

  /* Now update our line record so that re-reading lines located
     before c->line_start_idx is faster.  */
  if (update_line_record
      && c->line_record.length () < fcache_line_record_size)
    {
      /* If the lines of the file fit in the line record, we just
	 record all of them; the length check avoids recording a line
	 twice when it is read again ...  */
      if (c->total_lines <= fcache_line_record_size
	  && c->line_num > c->line_record.length ())
	c->line_record.safe_push (fcache::line_info (c->line_num,
						 c->line_start_idx,
						 line_end - c->data));
      else if (c->total_lines > fcache_line_record_size)
	{
	  /* ... otherwise, we scale total_lines down to
	     fcache_line_record_size lines: N is the slot of the
	     record this line maps to, and the line is recorded only
	     if that slot has not been filled yet.  */
	  size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
	  if (c->line_record.length () == 0
	      || n >= c->line_record.length ())
	    c->line_record.safe_push (fcache::line_info (c->line_num,
						     c->line_start_idx,
						     line_end - c->data));
	}
    }

  /* Update c->line_start_idx so that it points to the next line to be
     read.  */
  if (next_line_start)
    c->line_start_idx = next_line_start - c->data;
  else
    /* We didn't find any terminal '\n'.  Let's consider that the end
       of line is the end of the data in the cache.  The next
       invocation of get_next_line will either read more data from the
       underlying file or return false early because we've reached the
       end of the file.  */
    c->line_start_idx = c->nb_read;

  *line_len = len;

  return true;
}
636 
637 /* Consume the next bytes coming from the cache (or from its
638    underlying file if there are remaining unread bytes in the file)
639    until we reach the next end-of-line (or end-of-file).  There is no
640    copying from the cache involved.  Return TRUE upon successful
641    completion.  */
642 
643 static bool
goto_next_line(fcache * cache)644 goto_next_line (fcache *cache)
645 {
646   char *l;
647   ssize_t len;
648 
649   return get_next_line (cache, &l, &len);
650 }
651 
/* Read an arbitrary line number LINE_NUM from the file cached in C.
   LINE_NUM is 1-based and must be greater than zero.
   If the line was read successfully, *LINE points to the beginning
   of the line in the file cache and *LINE_LEN is the length of the
   line.  *LINE is not nul-terminated, but may contain zero bytes.
   *LINE is only valid until the next call of read_line_num.
   This function returns true if a line was read, false otherwise.  */

static bool
read_line_num (fcache *c, size_t line_num,
	       char **line, ssize_t *line_len)
{
  gcc_assert (line_num > 0);

  if (line_num <= c->line_num)
    {
      /* We've been asked to read lines that are before c->line_num.
	 So let's use our line record (if it's not empty) to try to
	 avoid re-reading the file from the beginning again.  */

      if (c->line_record.is_empty ())
	{
	  /* Nothing recorded: restart from the first line.  */
	  c->line_start_idx = 0;
	  c->line_num = 0;
	}
      else
	{
	  /* The recorded line to restart from, if any.  */
	  fcache::line_info *i = NULL;
	  if (c->total_lines <= fcache_line_record_size)
	    {
	      /* In languages where the input file is not totally
		 preprocessed up front, the c->total_lines hint
		 can be smaller than the number of lines of the
		 file.  In that case, only the first
		 c->total_lines have been recorded.

		 Otherwise, the first c->total_lines we've read have
		 their start/end recorded here.  */
	      i = (line_num <= c->total_lines)
		? &c->line_record[line_num - 1]
		: &c->line_record[c->total_lines - 1];
	      gcc_assert (i->line_num <= line_num);
	    }
	  else
	    {
	      /*  So the file had more lines than our line record
		  size.  Thus the number of lines we've recorded has
		  been scaled down to fcache_line_record_size.  Let's
		  pick the start/end of the recorded line that is
		  closest to line_num.  */
	      size_t n = (line_num <= c->total_lines)
		? line_num * fcache_line_record_size / c->total_lines
		: c ->line_record.length () - 1;
	      if (n < c->line_record.length ())
		{
		  i = &c->line_record[n];
		  gcc_assert (i->line_num <= line_num);
		}
	    }

	  if (i && i->line_num == line_num)
	    {
	      /* We have the start/end of the line.  */
	      *line = c->data + i->start_pos;
	      *line_len = i->end_pos - i->start_pos;
	      return true;
	    }

	  if (i)
	    {
	      /* Restart reading at the recorded line, which is at or
		 before the one we want.  */
	      c->line_start_idx = i->start_pos;
	      c->line_num = i->line_num - 1;
	    }
	  else
	    {
	      /* No usable record: restart from the first line.  */
	      c->line_start_idx = 0;
	      c->line_num = 0;
	    }
	}
    }

  /*  Let's walk from line c->line_num up to line_num - 1, without
      copying any line.  */
  while (c->line_num < line_num - 1)
    if (!goto_next_line (c))
      return false;

  /* The line we want is the next one.  Let's read it and hand it
     back to the caller.  */
  return get_next_line (c, line, line_len);
}
742 
743 /* Return the physical source line that corresponds to FILE_PATH/LINE.
744    The line is not nul-terminated.  The returned pointer is only
745    valid until the next call of location_get_source_line.
746    Note that the line can contain several null characters,
747    so the returned value's length has the actual length of the line.
748    If the function fails, a NULL char_span is returned.  */
749 
750 char_span
location_get_source_line(const char * file_path,int line)751 location_get_source_line (const char *file_path, int line)
752 {
753   char *buffer = NULL;
754   ssize_t len;
755 
756   if (line == 0)
757     return char_span (NULL, 0);
758 
759   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
760   if (c == NULL)
761     return char_span (NULL, 0);
762 
763   bool read = read_line_num (c, line, &buffer, &len);
764   if (!read)
765     return char_span (NULL, 0);
766 
767   return char_span (buffer, len);
768 }
769 
770 /* Determine if FILE_PATH missing a trailing newline on its final line.
771    Only valid to call once all of the file has been loaded, by
772    requesting a line number beyond the end of the file.  */
773 
774 bool
location_missing_trailing_newline(const char * file_path)775 location_missing_trailing_newline (const char *file_path)
776 {
777   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
778   if (c == NULL)
779     return false;
780 
781   return c->missing_trailing_newline;
782 }
783 
784 /* Test if the location originates from the spelling location of a
785    builtin-tokens.  That is, return TRUE if LOC is a (possibly
786    virtual) location of a built-in token that appears in the expansion
787    list of a macro.  Please note that this function also works on
788    tokens that result from built-in tokens.  For instance, the
789    function would return true if passed a token "4" that is the result
790    of the expansion of the built-in __LINE__ macro.  */
791 bool
is_location_from_builtin_token(location_t loc)792 is_location_from_builtin_token (location_t loc)
793 {
794   const line_map_ordinary *map = NULL;
795   loc = linemap_resolve_location (line_table, loc,
796 				  LRK_SPELLING_LOCATION, &map);
797   return loc == BUILTINS_LOCATION;
798 }
799 
800 /* Expand the source location LOC into a human readable location.  If
801    LOC is virtual, it resolves to the expansion point of the involved
802    macro.  If LOC resolves to a builtin location, the file name of the
803    readable location is set to the string "<built-in>".  */
804 
805 expanded_location
expand_location(location_t loc)806 expand_location (location_t loc)
807 {
808   return expand_location_1 (loc, /*expansion_point_p=*/true,
809 			    LOCATION_ASPECT_CARET);
810 }
811 
812 /* Expand the source location LOC into a human readable location.  If
813    LOC is virtual, it resolves to the expansion location of the
814    relevant macro.  If LOC resolves to a builtin location, the file
815    name of the readable location is set to the string
816    "<built-in>".  */
817 
818 expanded_location
expand_location_to_spelling_point(location_t loc,enum location_aspect aspect)819 expand_location_to_spelling_point (location_t loc,
820 				   enum location_aspect aspect)
821 {
822   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
823 }
824 
825 /* The rich_location class within libcpp requires a way to expand
826    location_t instances, and relies on the client code
827    providing a symbol named
828      linemap_client_expand_location_to_spelling_point
829    to do this.
830 
831    This is the implementation for libcommon.a (all host binaries),
832    which simply calls into expand_location_1.  */
833 
834 expanded_location
linemap_client_expand_location_to_spelling_point(location_t loc,enum location_aspect aspect)835 linemap_client_expand_location_to_spelling_point (location_t loc,
836 						  enum location_aspect aspect)
837 {
838   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
839 }
840 
841 
842 /* If LOCATION is in a system header and if it is a virtual location for
843    a token coming from the expansion of a macro, unwind it to the
844    location of the expansion point of the macro.  Otherwise, just return
845    LOCATION.
846 
847    This is used for instance when we want to emit diagnostics about a
848    token that may be located in a macro that is itself defined in a
849    system header, for example, for the NULL macro.  In such a case, if
850    LOCATION were passed directly to diagnostic functions such as
851    warning_at, the diagnostic would be suppressed (unless
852    -Wsystem-headers).  */
853 
854 location_t
expansion_point_location_if_in_system_header(location_t location)855 expansion_point_location_if_in_system_header (location_t location)
856 {
857   if (in_system_header_at (location))
858     location = linemap_resolve_location (line_table, location,
859 					 LRK_MACRO_EXPANSION_POINT,
860 					 NULL);
861   return location;
862 }
863 
864 /* If LOCATION is a virtual location for a token coming from the expansion
865    of a macro, unwind to the location of the expansion point of the macro.  */
866 
867 location_t
expansion_point_location(location_t location)868 expansion_point_location (location_t location)
869 {
870   return linemap_resolve_location (line_table, location,
871 				   LRK_MACRO_EXPANSION_POINT, NULL);
872 }
873 
874 /* Construct a location with caret at CARET, ranging from START to
875    finish e.g.
876 
877                  11111111112
878         12345678901234567890
879      522
880      523   return foo + bar;
881                   ~~~~^~~~~
882      524
883 
884    The location's caret is at the "+", line 523 column 15, but starts
885    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
886    of "bar" at column 19.  */
887 
888 location_t
make_location(location_t caret,location_t start,location_t finish)889 make_location (location_t caret, location_t start, location_t finish)
890 {
891   location_t pure_loc = get_pure_location (caret);
892   source_range src_range;
893   src_range.m_start = get_start (start);
894   src_range.m_finish = get_finish (finish);
895   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
896 						   pure_loc,
897 						   src_range,
898 						   NULL);
899   return combined_loc;
900 }
901 
902 /* Same as above, but taking a source range rather than two locations.  */
903 
904 location_t
make_location(location_t caret,source_range src_range)905 make_location (location_t caret, source_range src_range)
906 {
907   location_t pure_loc = get_pure_location (caret);
908   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
909 }
910 
911 /* An expanded_location stores the column in byte units.  This function
912    converts that column to display units.  That requires reading the associated
913    source line in order to calculate the display width.  If that cannot be done
914    for any reason, then returns the byte column as a fallback.  */
915 int
location_compute_display_column(expanded_location exploc)916 location_compute_display_column (expanded_location exploc)
917 {
918   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
919     return exploc.column;
920   char_span line = location_get_source_line (exploc.file, exploc.line);
921   /* If line is NULL, this function returns exploc.column which is the
922      desired fallback.  */
923   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
924 					    exploc.column);
925 }
926 
927 /* Dump statistics to stderr about the memory usage of the line_table
928    set of line maps.  This also displays some statistics about macro
929    expansion.  */
930 
931 void
dump_line_table_statistics(void)932 dump_line_table_statistics (void)
933 {
934   struct linemap_stats s;
935   long total_used_map_size,
936     macro_maps_size,
937     total_allocated_map_size;
938 
939   memset (&s, 0, sizeof (s));
940 
941   linemap_get_statistics (line_table, &s);
942 
943   macro_maps_size = s.macro_maps_used_size
944     + s.macro_maps_locations_size;
945 
946   total_allocated_map_size = s.ordinary_maps_allocated_size
947     + s.macro_maps_allocated_size
948     + s.macro_maps_locations_size;
949 
950   total_used_map_size = s.ordinary_maps_used_size
951     + s.macro_maps_used_size
952     + s.macro_maps_locations_size;
953 
954   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
955            s.num_expanded_macros);
956   if (s.num_expanded_macros != 0)
957     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
958              s.num_macro_tokens / s.num_expanded_macros);
959   fprintf (stderr,
960            "\nLine Table allocations during the "
961 	   "compilation process\n");
962   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
963 	   SIZE_AMOUNT (s.num_ordinary_maps_used));
964   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
965 	   SIZE_AMOUNT (s.ordinary_maps_used_size));
966   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
967 	   SIZE_AMOUNT (s.num_ordinary_maps_allocated));
968   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
969 	   SIZE_AMOUNT (s.ordinary_maps_allocated_size));
970   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
971 	   SIZE_AMOUNT (s.num_macro_maps_used));
972   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
973 	   SIZE_AMOUNT (s.macro_maps_used_size));
974   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
975 	   SIZE_AMOUNT (s.macro_maps_locations_size));
976   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
977 	   SIZE_AMOUNT (macro_maps_size));
978   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
979 	   SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
980   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
981 	   SIZE_AMOUNT (total_allocated_map_size));
982   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
983 	   SIZE_AMOUNT (total_used_map_size));
984   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
985 	   SIZE_AMOUNT (s.adhoc_table_size));
986   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
987 	   SIZE_AMOUNT (s.adhoc_table_entries_used));
988   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
989 	   SIZE_AMOUNT (line_table->num_optimized_ranges));
990   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
991 	   SIZE_AMOUNT (line_table->num_unoptimized_ranges));
992 
993   fprintf (stderr, "\n");
994 }
995 
996 /* Get location one beyond the final location in ordinary map IDX.  */
997 
998 static location_t
get_end_location(class line_maps * set,unsigned int idx)999 get_end_location (class line_maps *set, unsigned int idx)
1000 {
1001   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1002     return set->highest_location;
1003 
1004   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1005   return MAP_START_LOCATION (next_map);
1006 }
1007 
/* Helper function for write_digit_row.
   Write to STREAM the character '0' + (DIGIT modulo 10), i.e. the last
   decimal digit of a non-negative DIGIT.  */

static void
write_digit (FILE *stream, int digit)
{
  const int last_digit = digit % 10;
  fputc ('0' + last_digit, stream);
}
1015 
1016 /* Helper function for dump_location_info.
1017    Write a row of numbers to STREAM, numbering a source line,
1018    giving the units, tens, hundreds etc of the column number.  */
1019 
1020 static void
write_digit_row(FILE * stream,int indent,const line_map_ordinary * map,location_t loc,int max_col,int divisor)1021 write_digit_row (FILE *stream, int indent,
1022 		 const line_map_ordinary *map,
1023 		 location_t loc, int max_col, int divisor)
1024 {
1025   fprintf (stream, "%*c", indent, ' ');
1026   fprintf (stream, "|");
1027   for (int column = 1; column < max_col; column++)
1028     {
1029       location_t column_loc = loc + (column << map->m_range_bits);
1030       write_digit (stream, column_loc / divisor);
1031     }
1032   fprintf (stream, "\n");
1033 }
1034 
1035 /* Write a half-closed (START) / half-open (END) interval of
1036    location_t to STREAM.  */
1037 
1038 static void
dump_location_range(FILE * stream,location_t start,location_t end)1039 dump_location_range (FILE *stream,
1040 		     location_t start, location_t end)
1041 {
1042   fprintf (stream,
1043 	   "  location_t interval: %u <= loc < %u\n",
1044 	   start, end);
1045 }
1046 
1047 /* Write a labelled description of a half-closed (START) / half-open (END)
1048    interval of location_t to STREAM.  */
1049 
1050 static void
dump_labelled_location_range(FILE * stream,const char * name,location_t start,location_t end)1051 dump_labelled_location_range (FILE *stream,
1052 			      const char *name,
1053 			      location_t start, location_t end)
1054 {
1055   fprintf (stream, "%s\n", name);
1056   dump_location_range (stream, start, end);
1057   fprintf (stream, "\n");
1058 }
1059 
1060 /* Write a visualization of the locations in the line_table to STREAM.  */
1061 
1062 void
dump_location_info(FILE * stream)1063 dump_location_info (FILE *stream)
1064 {
1065   /* Visualize the reserved locations.  */
1066   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1067 				0, RESERVED_LOCATION_COUNT);
1068 
1069   /* Visualize the ordinary line_map instances, rendering the sources. */
1070   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1071     {
1072       location_t end_location = get_end_location (line_table, idx);
1073       /* half-closed: doesn't include this one. */
1074 
1075       const line_map_ordinary *map
1076 	= LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1077       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1078       dump_location_range (stream,
1079 			   MAP_START_LOCATION (map), end_location);
1080       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1081       fprintf (stream, "  starting at line: %i\n",
1082 	       ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1083       fprintf (stream, "  column and range bits: %i\n",
1084 	       map->m_column_and_range_bits);
1085       fprintf (stream, "  column bits: %i\n",
1086 	       map->m_column_and_range_bits - map->m_range_bits);
1087       fprintf (stream, "  range bits: %i\n",
1088 	       map->m_range_bits);
1089       const char * reason;
1090       switch (map->reason) {
1091       case LC_ENTER:
1092 	reason = "LC_ENTER";
1093 	break;
1094       case LC_LEAVE:
1095 	reason = "LC_LEAVE";
1096 	break;
1097       case LC_RENAME:
1098 	reason = "LC_RENAME";
1099 	break;
1100       case LC_RENAME_VERBATIM:
1101 	reason = "LC_RENAME_VERBATIM";
1102 	break;
1103       case LC_ENTER_MACRO:
1104 	reason = "LC_RENAME_MACRO";
1105 	break;
1106       default:
1107 	reason = "Unknown";
1108       }
1109       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1110 
1111       const line_map_ordinary *includer_map
1112 	= linemap_included_from_linemap (line_table, map);
1113       fprintf (stream, "  included from location: %d",
1114 	       linemap_included_from (map));
1115       if (includer_map) {
1116 	fprintf (stream, " (in ordinary map %d)",
1117 		 int (includer_map - line_table->info_ordinary.maps));
1118       }
1119       fprintf (stream, "\n");
1120 
1121       /* Render the span of source lines that this "map" covers.  */
1122       for (location_t loc = MAP_START_LOCATION (map);
1123 	   loc < end_location;
1124 	   loc += (1 << map->m_range_bits) )
1125 	{
1126 	  gcc_assert (pure_location_p (line_table, loc) );
1127 
1128 	  expanded_location exploc
1129 	    = linemap_expand_location (line_table, map, loc);
1130 
1131 	  if (exploc.column == 0)
1132 	    {
1133 	      /* Beginning of a new source line: draw the line.  */
1134 
1135 	      char_span line_text = location_get_source_line (exploc.file,
1136 							      exploc.line);
1137 	      if (!line_text)
1138 		break;
1139 	      fprintf (stream,
1140 		       "%s:%3i|loc:%5i|%.*s\n",
1141 		       exploc.file, exploc.line,
1142 		       loc,
1143 		       (int)line_text.length (), line_text.get_buffer ());
1144 
1145 	      /* "loc" is at column 0, which means "the whole line".
1146 		 Render the locations *within* the line, by underlining
1147 		 it, showing the location_t numeric values
1148 		 at each column.  */
1149 	      size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1150 	      if (max_col > line_text.length ())
1151 		max_col = line_text.length () + 1;
1152 
1153 	      int len_lnum = num_digits (exploc.line);
1154 	      if (len_lnum < 3)
1155 		len_lnum = 3;
1156 	      int len_loc = num_digits (loc);
1157 	      if (len_loc < 5)
1158 		len_loc = 5;
1159 
1160 	      int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1161 
1162 	      /* Thousands.  */
1163 	      if (end_location > 999)
1164 		write_digit_row (stream, indent, map, loc, max_col, 1000);
1165 
1166 	      /* Hundreds.  */
1167 	      if (end_location > 99)
1168 		write_digit_row (stream, indent, map, loc, max_col, 100);
1169 
1170 	      /* Tens.  */
1171 	      write_digit_row (stream, indent, map, loc, max_col, 10);
1172 
1173 	      /* Units.  */
1174 	      write_digit_row (stream, indent, map, loc, max_col, 1);
1175 	    }
1176 	}
1177       fprintf (stream, "\n");
1178     }
1179 
1180   /* Visualize unallocated values.  */
1181   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1182 				line_table->highest_location,
1183 				LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1184 
1185   /* Visualize the macro line_map instances, rendering the sources. */
1186   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1187     {
1188       /* Each macro map that is allocated owns location_t values
1189 	 that are *lower* that the one before them.
1190 	 Hence it's meaningful to view them either in order of ascending
1191 	 source locations, or in order of ascending macro map index.  */
1192       const bool ascending_location_ts = true;
1193       unsigned int idx = (ascending_location_ts
1194 			  ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1195 			  : i);
1196       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1197       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1198 	       idx,
1199 	       linemap_map_get_macro_name (map),
1200 	       MACRO_MAP_NUM_MACRO_TOKENS (map));
1201       dump_location_range (stream,
1202 			   map->start_location,
1203 			   (map->start_location
1204 			    + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1205       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1206 	      "expansion point is location %i",
1207 	      MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1208       fprintf (stream, "  map->start_location: %u\n",
1209 	       map->start_location);
1210 
1211       fprintf (stream, "  macro_locations:\n");
1212       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1213 	{
1214 	  location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1215 	  location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1216 
1217 	  /* linemap_add_macro_token encodes token numbers in an expansion
1218 	     by putting them after MAP_START_LOCATION. */
1219 
1220 	  /* I'm typically seeing 4 uninitialized entries at the end of
1221 	     0xafafafaf.
1222 	     This appears to be due to macro.c:replace_args
1223 	     adding 2 extra args for padding tokens; presumably there may
1224 	     be a leading and/or trailing padding token injected,
1225 	     each for 2 more location slots.
1226 	     This would explain there being up to 4 location_ts slots
1227 	     that may be uninitialized.  */
1228 
1229 	  fprintf (stream, "    %u: %u, %u\n",
1230 		   i,
1231 		   x,
1232 		   y);
1233 	  if (x == y)
1234 	    {
1235 	      if (x < MAP_START_LOCATION (map))
1236 		inform (x, "token %u has %<x-location == y-location == %u%>",
1237 			i, x);
1238 	      else
1239 		fprintf (stream,
1240 			 "x-location == y-location == %u encodes token # %u\n",
1241 			 x, x - MAP_START_LOCATION (map));
1242 		}
1243 	  else
1244 	    {
1245 	      inform (x, "token %u has %<x-location == %u%>", i, x);
1246 	      inform (x, "token %u has %<y-location == %u%>", i, y);
1247 	    }
1248 	}
1249       fprintf (stream, "\n");
1250     }
1251 
1252   /* It appears that MAX_LOCATION_T itself is never assigned to a
1253      macro map, presumably due to an off-by-one error somewhere
1254      between the logic in linemap_enter_macro and
1255      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1256   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1257 				MAX_LOCATION_T,
1258 				MAX_LOCATION_T + 1);
1259 
1260   /* Visualize ad-hoc values.  */
1261   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1262 				MAX_LOCATION_T + 1, UINT_MAX);
1263 }
1264 
1265 /* string_concat's constructor.  */
1266 
string_concat(int num,location_t * locs)1267 string_concat::string_concat (int num, location_t *locs)
1268   : m_num (num)
1269 {
1270   m_locs = ggc_vec_alloc <location_t> (num);
1271   for (int i = 0; i < num; i++)
1272     m_locs[i] = locs[i];
1273 }
1274 
1275 /* string_concat_db's constructor.  */
1276 
string_concat_db()1277 string_concat_db::string_concat_db ()
1278 {
1279   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1280 }
1281 
1282 /* Record that a string concatenation occurred, covering NUM
1283    string literal tokens.  LOCS is an array of size NUM, containing the
1284    locations of the tokens.  A copy of LOCS is taken.  */
1285 
1286 void
record_string_concatenation(int num,location_t * locs)1287 string_concat_db::record_string_concatenation (int num, location_t *locs)
1288 {
1289   gcc_assert (num > 1);
1290   gcc_assert (locs);
1291 
1292   location_t key_loc = get_key_loc (locs[0]);
1293 
1294   string_concat *concat
1295     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1296   m_table->put (key_loc, concat);
1297 }
1298 
1299 /* Determine if LOC was the location of the initial token of a
1300    concatenation of string literal tokens.
1301    If so, *OUT_NUM is written to with the number of tokens, and
1302    *OUT_LOCS with the location of an array of locations of the
1303    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1304    storage owned by the string_concat_db.
1305    Otherwise, return false.  */
1306 
1307 bool
get_string_concatenation(location_t loc,int * out_num,location_t ** out_locs)1308 string_concat_db::get_string_concatenation (location_t loc,
1309 					    int *out_num,
1310 					    location_t **out_locs)
1311 {
1312   gcc_assert (out_num);
1313   gcc_assert (out_locs);
1314 
1315   location_t key_loc = get_key_loc (loc);
1316 
1317   string_concat **concat = m_table->get (key_loc);
1318   if (!concat)
1319     return false;
1320 
1321   *out_num = (*concat)->m_num;
1322   *out_locs =(*concat)->m_locs;
1323   return true;
1324 }
1325 
1326 /* Internal function.  Canonicalize LOC into a form suitable for
1327    use as a key within the database, stripping away macro expansion,
1328    ad-hoc information, and range information, using the location of
1329    the start of LOC within an ordinary linemap.  */
1330 
1331 location_t
get_key_loc(location_t loc)1332 string_concat_db::get_key_loc (location_t loc)
1333 {
1334   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1335 				  NULL);
1336 
1337   loc = get_range_from_loc (line_table, loc).m_start;
1338 
1339   return loc;
1340 }
1341 
1342 /* Helper class for use within get_substring_ranges_for_loc.
1343    An vec of cpp_string with responsibility for releasing all of the
1344    str->text for each str in the vector.  */
1345 
1346 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1347 {
1348  public:
auto_cpp_string_vec(int alloc)1349   auto_cpp_string_vec (int alloc)
1350     : auto_vec <cpp_string> (alloc) {}
1351 
~auto_cpp_string_vec()1352   ~auto_cpp_string_vec ()
1353   {
1354     /* Clean up the copies within this vec.  */
1355     int i;
1356     cpp_string *str;
1357     FOR_EACH_VEC_ELT (*this, i, str)
1358       free (const_cast <unsigned char *> (str->text));
1359   }
1360 };
1361 
1362 /* Attempt to populate RANGES with source location information on the
1363    individual characters within the string literal found at STRLOC.
1364    If CONCATS is non-NULL, then any string literals that the token at
1365    STRLOC  was concatenated with are also added to RANGES.
1366 
1367    Return NULL if successful, or an error message if any errors occurred (in
1368    which case RANGES may be only partially populated and should not
1369    be used).
1370 
1371    This is implemented by re-parsing the relevant source line(s).  */
1372 
1373 static const char *
get_substring_ranges_for_loc(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,cpp_substring_ranges & ranges)1374 get_substring_ranges_for_loc (cpp_reader *pfile,
1375 			      string_concat_db *concats,
1376 			      location_t strloc,
1377 			      enum cpp_ttype type,
1378 			      cpp_substring_ranges &ranges)
1379 {
1380   gcc_assert (pfile);
1381 
1382   if (strloc == UNKNOWN_LOCATION)
1383     return "unknown location";
1384 
1385   /* Reparsing the strings requires accurate location information.
1386      If -ftrack-macro-expansion has been overridden from its default
1387      of 2, then we might have a location of a macro expansion point,
1388      rather than the location of the literal itself.
1389      Avoid this by requiring that we have full macro expansion tracking
1390      for substring locations to be available.  */
1391   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1392     return "track_macro_expansion != 2";
1393 
1394   /* If #line or # 44 "file"-style directives are present, then there's
1395      no guarantee that the line numbers we have can be used to locate
1396      the strings.  For example, we might have a .i file with # directives
1397      pointing back to lines within a .c file, but the .c file might
1398      have been edited since the .i file was created.
1399      In such a case, the safest course is to disable on-demand substring
1400      locations.  */
1401   if (line_table->seen_line_directive)
1402     return "seen line directive";
1403 
1404   /* If string concatenation has occurred at STRLOC, get the locations
1405      of all of the literal tokens making up the compound string.
1406      Otherwise, just use STRLOC.  */
1407   int num_locs = 1;
1408   location_t *strlocs = &strloc;
1409   if (concats)
1410     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1411 
1412   auto_cpp_string_vec strs (num_locs);
1413   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1414   for (int i = 0; i < num_locs; i++)
1415     {
1416       /* Get range of strloc.  We will use it to locate the start and finish
1417 	 of the literal token within the line.  */
1418       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1419 
1420       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1421 	{
1422 	  /* If the string token was within a macro expansion, then we can
1423 	     cope with it for the simple case where we have a single token.
1424 	     Otherwise, bail out.  */
1425 	  if (src_range.m_start != src_range.m_finish)
1426 	    return "macro expansion";
1427 	}
1428       else
1429 	{
1430 	  if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1431 	    /* If so, we can't reliably determine where the token started within
1432 	       its line.  */
1433 	    return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1434 
1435 	  if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1436 	    /* If so, we can't reliably determine where the token finished
1437 	       within its line.  */
1438 	    return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1439 	}
1440 
1441       expanded_location start
1442 	= expand_location_to_spelling_point (src_range.m_start,
1443 					     LOCATION_ASPECT_START);
1444       expanded_location finish
1445 	= expand_location_to_spelling_point (src_range.m_finish,
1446 					     LOCATION_ASPECT_FINISH);
1447       if (start.file != finish.file)
1448 	return "range endpoints are in different files";
1449       if (start.line != finish.line)
1450 	return "range endpoints are on different lines";
1451       if (start.column > finish.column)
1452 	return "range endpoints are reversed";
1453 
1454       char_span line = location_get_source_line (start.file, start.line);
1455       if (!line)
1456 	return "unable to read source line";
1457 
1458       /* Determine the location of the literal (including quotes
1459 	 and leading prefix chars, such as the 'u' in a u""
1460 	 token).  */
1461       size_t literal_length = finish.column - start.column + 1;
1462 
1463       /* Ensure that we don't crash if we got the wrong location.  */
1464       if (line.length () < (start.column - 1 + literal_length))
1465 	return "line is not wide enough";
1466 
1467       char_span literal = line.subspan (start.column - 1, literal_length);
1468 
1469       cpp_string from;
1470       from.len = literal_length;
1471       /* Make a copy of the literal, to avoid having to rely on
1472 	 the lifetime of the copy of the line within the cache.
1473 	 This will be released by the auto_cpp_string_vec dtor.  */
1474       from.text = (unsigned char *)literal.xstrdup ();
1475       strs.safe_push (from);
1476 
1477       /* For very long lines, a new linemap could have started
1478 	 halfway through the token.
1479 	 Ensure that the loc_reader uses the linemap of the
1480 	 *end* of the token for its start location.  */
1481       const line_map_ordinary *start_ord_map;
1482       linemap_resolve_location (line_table, src_range.m_start,
1483 				LRK_SPELLING_LOCATION, &start_ord_map);
1484       const line_map_ordinary *final_ord_map;
1485       linemap_resolve_location (line_table, src_range.m_finish,
1486 				LRK_SPELLING_LOCATION, &final_ord_map);
1487       if (start_ord_map == NULL || final_ord_map == NULL)
1488 	return "failed to get ordinary maps";
1489       /* Bulletproofing.  We ought to only have different ordinary maps
1490 	 for start vs finish due to line-length jumps.  */
1491       if (start_ord_map != final_ord_map
1492 	  && start_ord_map->to_file != final_ord_map->to_file)
1493 	return "start and finish are spelled in different ordinary maps";
1494       /* The file from linemap_resolve_location ought to match that from
1495 	 expand_location_to_spelling_point.  */
1496       if (start_ord_map->to_file != start.file)
1497 	return "mismatching file after resolving linemap";
1498 
1499       location_t start_loc
1500 	= linemap_position_for_line_and_column (line_table, final_ord_map,
1501 						start.line, start.column);
1502 
1503       cpp_string_location_reader loc_reader (start_loc, line_table);
1504       loc_readers.safe_push (loc_reader);
1505     }
1506 
1507   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1508   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1509 						 loc_readers.address (),
1510 						 num_locs, &ranges, type);
1511   if (err)
1512     return err;
1513 
1514   /* Success: "ranges" should now contain information on the string.  */
1515   return NULL;
1516 }
1517 
1518 /* Attempt to populate *OUT_LOC with source location information on the
1519    given characters within the string literal found at STRLOC.
1520    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1521    character set.
1522 
1523    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1524    and string literal "012345\n789"
1525    *OUT_LOC is written to with:
1526      "012345\n789"
1527          ~^~~~~
1528 
1529    If CONCATS is non-NULL, then any string literals that the token at
1530    STRLOC was concatenated with are also considered.
1531 
1532    This is implemented by re-parsing the relevant source line(s).
1533 
1534    Return NULL if successful, or an error message if any errors occurred.
1535    Error messages are intended for GCC developers (to help debugging) rather
1536    than for end-users.  */
1537 
1538 const char *
get_location_within_string(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int caret_idx,int start_idx,int end_idx,location_t * out_loc)1539 get_location_within_string (cpp_reader *pfile,
1540 			    string_concat_db *concats,
1541 			    location_t strloc,
1542 			    enum cpp_ttype type,
1543 			    int caret_idx, int start_idx, int end_idx,
1544 			    location_t *out_loc)
1545 {
1546   gcc_checking_assert (caret_idx >= 0);
1547   gcc_checking_assert (start_idx >= 0);
1548   gcc_checking_assert (end_idx >= 0);
1549   gcc_assert (out_loc);
1550 
1551   cpp_substring_ranges ranges;
1552   const char *err
1553     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1554   if (err)
1555     return err;
1556 
1557   if (caret_idx >= ranges.get_num_ranges ())
1558     return "caret_idx out of range";
1559   if (start_idx >= ranges.get_num_ranges ())
1560     return "start_idx out of range";
1561   if (end_idx >= ranges.get_num_ranges ())
1562     return "end_idx out of range";
1563 
1564   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1565 			    ranges.get_range (start_idx).m_start,
1566 			    ranges.get_range (end_idx).m_finish);
1567   return NULL;
1568 }
1569 
1570 #if CHECKING_P
1571 
1572 namespace selftest {
1573 
1574 /* Selftests of location handling.  */
1575 
1576 /* Attempt to populate *OUT_RANGE with source location information on the
1577    given character within the string literal found at STRLOC.
1578    CHAR_IDX refers to an offset within the execution character set.
1579    If CONCATS is non-NULL, then any string literals that the token at
1580    STRLOC was concatenated with are also considered.
1581 
1582    This is implemented by re-parsing the relevant source line(s).
1583 
1584    Return NULL if successful, or an error message if any errors occurred.
1585    Error messages are intended for GCC developers (to help debugging) rather
1586    than for end-users.  */
1587 
1588 static const char *
get_source_range_for_char(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int char_idx,source_range * out_range)1589 get_source_range_for_char (cpp_reader *pfile,
1590 			   string_concat_db *concats,
1591 			   location_t strloc,
1592 			   enum cpp_ttype type,
1593 			   int char_idx,
1594 			   source_range *out_range)
1595 {
1596   gcc_checking_assert (char_idx >= 0);
1597   gcc_assert (out_range);
1598 
1599   cpp_substring_ranges ranges;
1600   const char *err
1601     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1602   if (err)
1603     return err;
1604 
1605   if (char_idx >= ranges.get_num_ranges ())
1606     return "char_idx out of range";
1607 
1608   *out_range = ranges.get_range (char_idx);
1609   return NULL;
1610 }
1611 
1612 /* As get_source_range_for_char, but write to *OUT the number
1613    of ranges that are available.  */
1614 
1615 static const char *
get_num_source_ranges_for_substring(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int * out)1616 get_num_source_ranges_for_substring (cpp_reader *pfile,
1617 				     string_concat_db *concats,
1618 				     location_t strloc,
1619 				     enum cpp_ttype type,
1620 				     int *out)
1621 {
1622   gcc_assert (out);
1623 
1624   cpp_substring_ranges ranges;
1625   const char *err
1626     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1627 
1628   if (err)
1629     return err;
1630 
1631   *out = ranges.get_num_ranges ();
1632   return NULL;
1633 }
1634 
1635 /* Selftests of location handling.  */
1636 
1637 /* Verify that compare() on linenum_type handles comparisons over the full
1638    range of the type.  */
1639 
1640 static void
test_linenum_comparisons()1641 test_linenum_comparisons ()
1642 {
1643   linenum_type min_line (0);
1644   linenum_type max_line (0xffffffff);
1645   ASSERT_EQ (0, compare (min_line, min_line));
1646   ASSERT_EQ (0, compare (max_line, max_line));
1647 
1648   ASSERT_GT (compare (max_line, min_line), 0);
1649   ASSERT_LT (compare (min_line, max_line), 0);
1650 }
1651 
1652 /* Helper function for verifying location data: when location_t
1653    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1654    as having column 0.  */
1655 
1656 static bool
should_have_column_data_p(location_t loc)1657 should_have_column_data_p (location_t loc)
1658 {
1659   if (IS_ADHOC_LOC (loc))
1660     loc = get_location_from_adhoc_loc (line_table, loc);
1661   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1662     return false;
1663   return true;
1664 }
1665 
1666 /* Selftest for should_have_column_data_p.  */
1667 
1668 static void
test_should_have_column_data_p()1669 test_should_have_column_data_p ()
1670 {
1671   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1672   ASSERT_TRUE
1673     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1674   ASSERT_FALSE
1675     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1676 }
1677 
1678 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1679    on LOC.  */
1680 
1681 static void
assert_loceq(const char * exp_filename,int exp_linenum,int exp_colnum,location_t loc)1682 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1683 	      location_t loc)
1684 {
1685   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1686   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1687   /* If location_t values are sufficiently high, then column numbers
1688      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1689      When close to the threshold, column numbers *may* be present: if
1690      the final linemap before the threshold contains a line that straddles
1691      the threshold, locations in that line have column information.  */
1692   if (should_have_column_data_p (loc))
1693     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1694 }
1695 
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests in a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following class describes one particular case within our test
   matrix.  */

class line_table_case
{
public:
  /* Describe a case using DEFAULT_RANGE_BITS and BASE_LOCATION.  */
  line_table_case (int default_range_bits, int base_location)
  : m_default_range_bits (default_range_bits),
    m_base_location (base_location)
  {
  }

  /* The value to give line_table->default_range_bits.  */
  int m_default_range_bits;

  /* Presumably the location_t value at which the line_table should
     start; its use is not visible from here.  */
  int m_base_location;
};
1723 
1724 /* Constructor.  Store the old value of line_table, and create a new
1725    one, using sane defaults.  */
1726 
line_table_test()1727 line_table_test::line_table_test ()
1728 {
1729   gcc_assert (saved_line_table == NULL);
1730   saved_line_table = line_table;
1731   line_table = ggc_alloc<line_maps> ();
1732   linemap_init (line_table, BUILTINS_LOCATION);
1733   gcc_assert (saved_line_table->reallocator);
1734   line_table->reallocator = saved_line_table->reallocator;
1735   gcc_assert (saved_line_table->round_alloc_size);
1736   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1737   line_table->default_range_bits = 0;
1738 }
1739 
1740 /* Constructor.  Store the old value of line_table, and create a new
1741    one, using the sitation described in CASE_.  */
1742 
line_table_test(const line_table_case & case_)1743 line_table_test::line_table_test (const line_table_case &case_)
1744 {
1745   gcc_assert (saved_line_table == NULL);
1746   saved_line_table = line_table;
1747   line_table = ggc_alloc<line_maps> ();
1748   linemap_init (line_table, BUILTINS_LOCATION);
1749   gcc_assert (saved_line_table->reallocator);
1750   line_table->reallocator = saved_line_table->reallocator;
1751   gcc_assert (saved_line_table->round_alloc_size);
1752   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1753   line_table->default_range_bits = case_.m_default_range_bits;
1754   if (case_.m_base_location)
1755     {
1756       line_table->highest_location = case_.m_base_location;
1757       line_table->highest_line = case_.m_base_location;
1758     }
1759 }
1760 
/* Destructor.  Restore the old value of line_table, and reset
   saved_line_table to NULL so that a later line_table_test can be
   constructed (the constructors assert that it is NULL).  */

line_table_test::~line_table_test ()
{
  /* A constructor must have stashed the original table.  */
  gcc_assert (saved_line_table != NULL);
  line_table = saved_line_table;
  saved_line_table = NULL;
}
1769 
/* Verify basic operation of ordinary linemaps.

   Within a temporary line table set up as described by CASE_, record
   locations on several lines of "foo.c" and on one line of "bar.c",
   then check via assert_loceq that file, line and (where column data
   is expected) column can be recovered from each of them.  Finally
   check that make_location builds a location from which the caret,
   start and finish can be extracted again.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  /* Swaps in a fresh line_table for the duration of this function.  */
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations.  */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  /* Two locations on line 1...  */
  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  /* ...and two on line 2.  */
  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  /* Transitioning back to a short line.  */
  linemap_line_start (line_table, 4, 0);
  location_t loc_back_to_short = linemap_position_for_column (line_table, 100);

  /* The map layout can only be checked while columns are still being
     tracked.  */
  if (should_have_column_data_p (loc_back_to_short))
    {
      /* Verify that we switched to short lines in the linemap.  */
      line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
      ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
    }

  /* Example of a line that will eventually be seen to be longer
     than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
     below that.  */
  linemap_line_start (line_table, 5, 2000);

  /* NOTE(review): 4097 and 4098 are presumably the first two columns
     beyond LINE_MAP_MAX_COLUMN_NUMBER -- confirm against line-map.h.  */
  location_t loc_start_of_very_long_line
    = linemap_position_for_column (line_table, 2000);
  location_t loc_too_wide
    = linemap_position_for_column (line_table, 4097);
  location_t loc_too_wide_2
    = linemap_position_for_column (line_table, 4098);

  /* ...and back to a sane line length.  */
  linemap_line_start (line_table, 6, 100);
  location_t loc_sane_again = linemap_position_for_column (line_table, 10);

  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("foo.c", 4, 100, loc_back_to_short);

  /* In the very wide line, the initial location should be fully tracked.  */
  assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
  /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
     be disabled.  */
  assert_loceq ("foo.c", 5, 0, loc_too_wide);
  assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
  /*...and column-tracking should be re-enabled for subsequent lines.  */
  assert_loceq ("foo.c", 6, 10, loc_sane_again);

  assert_loceq ("bar.c", 1, 150, loc_f);

  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it.  The caret is loc_c, the range runs from loc_b to
     loc_d.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
1858 
1859 /* Verify various properties of UNKNOWN_LOCATION.  */
1860 
1861 static void
test_unknown_location()1862 test_unknown_location ()
1863 {
1864   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1865   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1866   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1867 }
1868 
1869 /* Verify various properties of BUILTINS_LOCATION.  */
1870 
1871 static void
test_builtins()1872 test_builtins ()
1873 {
1874   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1875   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1876 }
1877 
/* Regression test for make_location.
   Ensure that we use pure locations for the start/finish of the range,
   rather than storing a packed or ad-hoc range as the start/finish.

   The test builds locations for columns of a one-line temporary source
   file, within a temporary line table set up as described by CASE_.  */

static void
test_make_location_nonpure_range_endpoints (const line_table_case &case_)
{
  /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
     with C++ frontend.
     ....................0000000001111111111222.
     ....................1234567890123456789012.  */
  const char *content = "     r += !aaa == bbb;\n";
  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
  line_table_test ltt (case_);
  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);

  /* Per the ruler above: column 11 is the '!', columns 12-14 are
     "aaa", and column 21 is the final 'b' of "bbb".  */
  const location_t c11 = linemap_position_for_column (line_table, 11);
  const location_t c12 = linemap_position_for_column (line_table, 12);
  const location_t c13 = linemap_position_for_column (line_table, 13);
  const location_t c14 = linemap_position_for_column (line_table, 14);
  const location_t c21 = linemap_position_for_column (line_table, 21);

  /* Skip the remaining checks when the locations lie beyond
     LINE_MAP_MAX_LOCATION_WITH_COLS.  */
  if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
    return;

  /* Use column 13 for the caret location, arbitrarily, to verify that we
     handle start != caret.  */
  const location_t aaa = make_location (c13, c12, c14);
  ASSERT_EQ (c13, get_pure_location (aaa));
  ASSERT_EQ (c12, get_start (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
  ASSERT_EQ (c14, get_finish (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));

  /* Make a location using a location with a range as the start-point.  */
  const location_t not_aaa = make_location (c11, aaa, c14);
  ASSERT_EQ (c11, get_pure_location (not_aaa));
  /* It should use the start location of the range, not store the range
     itself.  */
  ASSERT_EQ (c12, get_start (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
  ASSERT_EQ (c14, get_finish (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));

  /* Similarly for the end-point.  First build a location with a range
     ("aaa == bbb", columns 12-21) from pure endpoints, and check it.  */
  const location_t aaa_eq_bbb = make_location (c12, c12, c21);
  ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
  /* Then make a location using that location as the end-point.  */
  const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
  /* It should use the finish location of the range, not store the range
     itself.  */
  ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
}
1938 
1939 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1940 
1941 static void
test_reading_source_line()1942 test_reading_source_line ()
1943 {
1944   /* Create a tempfile and write some text to it.  */
1945   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1946 			"01234567890123456789\n"
1947 			"This is the test text\n"
1948 			"This is the 3rd line");
1949 
1950   /* Read back a specific line from the tempfile.  */
1951   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
1952   ASSERT_TRUE (source_line);
1953   ASSERT_TRUE (source_line.get_buffer () != NULL);
1954   ASSERT_EQ (20, source_line.length ());
1955   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1956 			 source_line.get_buffer (), source_line.length ()));
1957 
1958   source_line = location_get_source_line (tmp.get_filename (), 2);
1959   ASSERT_TRUE (source_line);
1960   ASSERT_TRUE (source_line.get_buffer () != NULL);
1961   ASSERT_EQ (21, source_line.length ());
1962   ASSERT_TRUE (!strncmp ("This is the test text",
1963 			 source_line.get_buffer (), source_line.length ()));
1964 
1965   source_line = location_get_source_line (tmp.get_filename (), 4);
1966   ASSERT_FALSE (source_line);
1967   ASSERT_TRUE (source_line.get_buffer () == NULL);
1968 }
1969 
1970 /* Tests of lexing.  */
1971 
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.

   The expansion is a single statement (SELFTEST_BEGIN_STMT ...
   SELFTEST_END_STMT) that declares a local named actual_txt, so the
   arguments should not themselves refer to a variable of that name.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)		\
  SELFTEST_BEGIN_STMT							\
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));	\
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);		\
  SELFTEST_END_STMT
1980 
1981 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1982    and ranges from EXP_START_COL to EXP_FINISH_COL.
1983    Use LOC as the effective location of the selftest.  */
1984 
1985 static void
assert_token_loc_eq(const location & loc,const cpp_token * tok,const char * exp_filename,int exp_linenum,int exp_start_col,int exp_finish_col)1986 assert_token_loc_eq (const location &loc,
1987 		     const cpp_token *tok,
1988 		     const char *exp_filename, int exp_linenum,
1989 		     int exp_start_col, int exp_finish_col)
1990 {
1991   location_t tok_loc = tok->src_loc;
1992   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1993   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1994 
1995   /* If location_t values are sufficiently high, then column numbers
1996      will be unavailable.  */
1997   if (!should_have_column_data_p (tok_loc))
1998     return;
1999 
2000   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2001   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2002   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2003   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2004 }
2005 
/* Use assert_token_loc_eq to verify TOK->src_loc against EXP_FILENAME,
   EXP_LINENUM, EXP_START_COL and EXP_FINISH_COL, using
   SELFTEST_LOCATION (i.e. the point of use of this macro) as the
   effective location of the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
			    EXP_START_COL, EXP_FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
		       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2013 
2014 /* Test of lexing a file using libcpp, verifying tokens and their
2015    location information.  */
2016 
2017 static void
test_lexer(const line_table_case & case_)2018 test_lexer (const line_table_case &case_)
2019 {
2020   /* Create a tempfile and write some text to it.  */
2021   const char *content =
2022     /*00000000011111111112222222222333333.3333444444444.455555555556
2023       12345678901234567890123456789012345.6789012345678.901234567890.  */
2024     ("test_name /* c-style comment */\n"
2025      "                                  \"test literal\"\n"
2026      " // test c++-style comment\n"
2027      "   42\n");
2028   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2029 
2030   line_table_test ltt (case_);
2031 
2032   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2033 
2034   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2035   ASSERT_NE (fname, NULL);
2036 
2037   /* Verify that we get the expected tokens back, with the correct
2038      location information.  */
2039 
2040   location_t loc;
2041   const cpp_token *tok;
2042   tok = cpp_get_token_with_location (parser, &loc);
2043   ASSERT_NE (tok, NULL);
2044   ASSERT_EQ (tok->type, CPP_NAME);
2045   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2046   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2047 
2048   tok = cpp_get_token_with_location (parser, &loc);
2049   ASSERT_NE (tok, NULL);
2050   ASSERT_EQ (tok->type, CPP_STRING);
2051   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2052   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2053 
2054   tok = cpp_get_token_with_location (parser, &loc);
2055   ASSERT_NE (tok, NULL);
2056   ASSERT_EQ (tok->type, CPP_NUMBER);
2057   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2058   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2059 
2060   tok = cpp_get_token_with_location (parser, &loc);
2061   ASSERT_NE (tok, NULL);
2062   ASSERT_EQ (tok->type, CPP_EOF);
2063 
2064   cpp_finish (parser, NULL);
2065   cpp_destroy (parser);
2066 }
2067 
/* Forward decls.  lexer_test_options::apply takes a lexer_test, and
   the lexer_test constructor takes a lexer_test_options.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor,
   after the cpp_reader has been created but before the file is
   read.  */

class lexer_test_options
{
 public:
  /* Hook for adjusting the lexer_test that is being constructed.  */
  virtual void apply (lexer_test &) = 0;
};
2081 
2082 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2083    in its dtor.
2084 
2085    This is needed by struct lexer_test to ensure that the cleanup of the
2086    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2087 
2088 class cpp_reader_ptr
2089 {
2090  public:
cpp_reader_ptr(cpp_reader * ptr)2091   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2092 
~cpp_reader_ptr()2093   ~cpp_reader_ptr ()
2094   {
2095     cpp_finish (m_ptr, NULL);
2096     cpp_destroy (m_ptr);
2097   }
2098 
2099   operator cpp_reader * () const { return m_ptr; }
2100 
2101  private:
2102   cpp_reader *m_ptr;
2103 };
2104 
/* A class for writing lexer tests.

   The constructor sets up a temporary line table, a cpp_reader and a
   tempfile holding the text to lex, and starts reading that file;
   tokens are then fetched one at a time with get_token.  Unless
   m_implicitly_expect_EOF is cleared, the destructor asserts that the
   next token is EOF.  */

class lexer_test
{
public:
  lexer_test (const line_table_case &case_, const char *content,
	      lexer_test_options *options);
  ~lexer_test ();

  const cpp_token *get_token ();

  /* The ordering of these fields matters.
     The line_table_test must be first, since the cpp_reader_ptr
     uses it.
     The cpp_reader must be cleaned up *after* the temp_source_file
     since the filenames in input.c's input cache are owned by the
     cpp_reader; in particular, when ~temp_source_file evicts the
     filename the filenames must still be alive.  */

  /* Temporarily replaces the global line_table.  */
  line_table_test m_ltt;

  /* The reader doing the lexing.  */
  cpp_reader_ptr m_parser;

  /* The file holding the text being lexed.  */
  temp_source_file m_tempfile;

  /* String concatenation database, used by the substring-location
     assertions.  */
  string_concat_db m_concats;

  /* If true, the destructor asserts that the next token is EOF.  */
  bool m_implicitly_expect_EOF;
};
2129 
2130 /* Use an EBCDIC encoding for the execution charset, specifically
2131    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2132 
2133    This exercises iconv integration within libcpp.
2134    Not every build of iconv supports the given charset,
2135    so we need to flag this error and handle it gracefully.  */
2136 
2137 class ebcdic_execution_charset : public lexer_test_options
2138 {
2139  public:
ebcdic_execution_charset()2140   ebcdic_execution_charset () : m_num_iconv_errors (0)
2141     {
2142       gcc_assert (s_singleton == NULL);
2143       s_singleton = this;
2144     }
~ebcdic_execution_charset()2145   ~ebcdic_execution_charset ()
2146     {
2147       gcc_assert (s_singleton == this);
2148       s_singleton = NULL;
2149     }
2150 
apply(lexer_test & test)2151   void apply (lexer_test &test) FINAL OVERRIDE
2152   {
2153     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2154     cpp_opts->narrow_charset = "IBM1047";
2155 
2156     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2157     callbacks->diagnostic = on_diagnostic;
2158   }
2159 
on_diagnostic(cpp_reader * pfile ATTRIBUTE_UNUSED,enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,enum cpp_warning_reason reason ATTRIBUTE_UNUSED,rich_location * richloc ATTRIBUTE_UNUSED,const char * msgid,va_list * ap ATTRIBUTE_UNUSED)2160   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2161 			     enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2162 			     enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2163 			     rich_location *richloc ATTRIBUTE_UNUSED,
2164 			     const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2165     ATTRIBUTE_FPTR_PRINTF(5,0)
2166   {
2167     gcc_assert (s_singleton);
2168     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2169     const char *msg = "conversion from %s to %s not supported by iconv";
2170 #ifdef ENABLE_NLS
2171     msg = dgettext ("cpplib", msg);
2172 #endif
2173     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2174        when the local iconv build doesn't support the conversion.  */
2175     if (strcmp (msgid, msg) == 0)
2176       {
2177 	s_singleton->m_num_iconv_errors++;
2178 	return true;
2179       }
2180 
2181     /* Otherwise, we have an unexpected error.  */
2182     abort ();
2183   }
2184 
iconv_errors_occurred_p()2185   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2186 
2187  private:
2188   static ebcdic_execution_charset *s_singleton;
2189   int m_num_iconv_errors;
2190 };
2191 
2192 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2193 
2194 /* A lexer_test_options subclass that records a list of diagnostic
2195    messages emitted by the lexer.  */
2196 
2197 class lexer_diagnostic_sink : public lexer_test_options
2198 {
2199  public:
lexer_diagnostic_sink()2200   lexer_diagnostic_sink ()
2201   {
2202     gcc_assert (s_singleton == NULL);
2203     s_singleton = this;
2204   }
~lexer_diagnostic_sink()2205   ~lexer_diagnostic_sink ()
2206   {
2207     gcc_assert (s_singleton == this);
2208     s_singleton = NULL;
2209 
2210     int i;
2211     char *str;
2212     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2213       free (str);
2214   }
2215 
apply(lexer_test & test)2216   void apply (lexer_test &test) FINAL OVERRIDE
2217   {
2218     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2219     callbacks->diagnostic = on_diagnostic;
2220   }
2221 
on_diagnostic(cpp_reader * pfile ATTRIBUTE_UNUSED,enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,enum cpp_warning_reason reason ATTRIBUTE_UNUSED,rich_location * richloc ATTRIBUTE_UNUSED,const char * msgid,va_list * ap)2222   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2223 			     enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2224 			     enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2225 			     rich_location *richloc ATTRIBUTE_UNUSED,
2226 			     const char *msgid, va_list *ap)
2227     ATTRIBUTE_FPTR_PRINTF(5,0)
2228   {
2229     char *msg = xvasprintf (msgid, *ap);
2230     s_singleton->m_diagnostics.safe_push (msg);
2231     return true;
2232   }
2233 
2234   auto_vec<char *> m_diagnostics;
2235 
2236  private:
2237   static lexer_diagnostic_sink *s_singleton;
2238 };
2239 
2240 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2241 
2242 /* Constructor.  Override line_table with a new instance based on CASE_,
2243    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2244    start parsing the tempfile.  */
2245 
lexer_test(const line_table_case & case_,const char * content,lexer_test_options * options)2246 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2247 			lexer_test_options *options)
2248 : m_ltt (case_),
2249   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2250   /* Create a tempfile and write the text to it.  */
2251   m_tempfile (SELFTEST_LOCATION, ".c", content),
2252   m_concats (),
2253   m_implicitly_expect_EOF (true)
2254 {
2255   if (options)
2256     options->apply (*this);
2257 
2258   cpp_init_iconv (m_parser);
2259 
2260   /* Parse the file.  */
2261   const char *fname = cpp_read_main_file (m_parser,
2262 					  m_tempfile.get_filename ());
2263   ASSERT_NE (fname, NULL);
2264 }
2265 
2266 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2267 
~lexer_test()2268 lexer_test::~lexer_test ()
2269 {
2270   location_t loc;
2271   const cpp_token *tok;
2272 
2273   if (m_implicitly_expect_EOF)
2274     {
2275       tok = cpp_get_token_with_location (m_parser, &loc);
2276       ASSERT_NE (tok, NULL);
2277       ASSERT_EQ (tok->type, CPP_EOF);
2278     }
2279 }
2280 
2281 /* Get the next token from m_parser.  */
2282 
2283 const cpp_token *
get_token()2284 lexer_test::get_token ()
2285 {
2286   location_t loc;
2287   const cpp_token *tok;
2288 
2289   tok = cpp_get_token_with_location (m_parser, &loc);
2290   ASSERT_NE (tok, NULL);
2291   return tok;
2292 }
2293 
2294 /* Verify that locations within string literals are correctly handled.  */
2295 
2296 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2297    using the string concatenation database for TEST.
2298 
2299    Assert that the character at index IDX is on EXPECTED_LINE,
2300    and that it begins at column EXPECTED_START_COL and ends at
2301    EXPECTED_FINISH_COL (unless the locations are beyond
2302    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2303    columns).  */
2304 
2305 static void
assert_char_at_range(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,int idx,int expected_line,int expected_start_col,int expected_finish_col)2306 assert_char_at_range (const location &loc,
2307 		      lexer_test& test,
2308 		      location_t strloc, enum cpp_ttype type, int idx,
2309 		      int expected_line, int expected_start_col,
2310 		      int expected_finish_col)
2311 {
2312   cpp_reader *pfile = test.m_parser;
2313   string_concat_db *concats = &test.m_concats;
2314 
2315   source_range actual_range = source_range();
2316   const char *err
2317     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2318 				 &actual_range);
2319   if (should_have_column_data_p (strloc))
2320     ASSERT_EQ_AT (loc, NULL, err);
2321   else
2322     {
2323       ASSERT_STREQ_AT (loc,
2324 		       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2325 		       err);
2326       return;
2327     }
2328 
2329   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2330   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2331   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2332   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2333 
2334   if (should_have_column_data_p (actual_range.m_start))
2335     {
2336       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2337       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2338     }
2339   if (should_have_column_data_p (actual_range.m_finish))
2340     {
2341       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2342       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2343     }
2344 }
2345 
/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION
   (i.e. the point of use of this macro) for the effective location of
   any errors.  The remaining arguments are passed through unchanged.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
			     EXPECTED_START_COL, EXPECTED_FINISH_COL)	\
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
			(IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
			(EXPECTED_FINISH_COL))
2354 
2355 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2356    using the string concatenation database for TEST.
2357 
2358    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2359 
2360 static void
assert_num_substring_ranges(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,int expected_num_ranges)2361 assert_num_substring_ranges (const location &loc,
2362 			     lexer_test& test,
2363 			     location_t strloc,
2364 			     enum cpp_ttype type,
2365 			     int expected_num_ranges)
2366 {
2367   cpp_reader *pfile = test.m_parser;
2368   string_concat_db *concats = &test.m_concats;
2369 
2370   int actual_num_ranges = -1;
2371   const char *err
2372     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2373 					   &actual_num_ranges);
2374   if (should_have_column_data_p (strloc))
2375     ASSERT_EQ_AT (loc, NULL, err);
2376   else
2377     {
2378       ASSERT_STREQ_AT (loc,
2379 		       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2380 		       err);
2381       return;
2382     }
2383   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2384 }
2385 
/* Macro for calling assert_num_substring_ranges, supplying
   SELFTEST_LOCATION (i.e. the point of use of this macro) for the
   effective location of any errors.  The remaining arguments are
   passed through unchanged.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
				    EXPECTED_NUM_RANGES)		\
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
			       (TYPE), (EXPECTED_NUM_RANGES))
2393 
2394 
2395 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2396    returns an error (using the string concatenation database for TEST).  */
2397 
2398 static void
assert_has_no_substring_ranges(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,const char * expected_err)2399 assert_has_no_substring_ranges (const location &loc,
2400 				lexer_test& test,
2401 				location_t strloc,
2402 				enum cpp_ttype type,
2403 				const char *expected_err)
2404 {
2405   cpp_reader *pfile = test.m_parser;
2406   string_concat_db *concats = &test.m_concats;
2407   cpp_substring_ranges ranges;
2408   const char *actual_err
2409     = get_substring_ranges_for_loc (pfile, concats, strloc,
2410 				    type, ranges);
2411   if (should_have_column_data_p (strloc))
2412     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2413   else
2414     ASSERT_STREQ_AT (loc,
2415 		     "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2416 		     actual_err);
2417 }
2418 
/* Macro for calling assert_has_no_substring_ranges, supplying
   SELFTEST_LOCATION (i.e. the point of use of this macro) for the
   effective location of any errors.  ERR is the expected error
   message.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
				    (STRLOC), (TYPE), (ERR))
2422 
2423 /* Lex a simple string literal.  Verify the substring location data, before
2424    and after running cpp_interpret_string on it.  */
2425 
2426 static void
test_lexer_string_locations_simple(const line_table_case & case_)2427 test_lexer_string_locations_simple (const line_table_case &case_)
2428 {
2429   /* Digits 0-9 (with 0 at column 10), the simple way.
2430      ....................000000000.11111111112.2222222223333333333
2431      ....................123456789.01234567890.1234567890123456789
2432      We add a trailing comment to ensure that we correctly locate
2433      the end of the string literal token.  */
2434   const char *content = "        \"0123456789\" /* not a string */\n";
2435   lexer_test test (case_, content, NULL);
2436 
2437   /* Verify that we get the expected token back, with the correct
2438      location information.  */
2439   const cpp_token *tok = test.get_token ();
2440   ASSERT_EQ (tok->type, CPP_STRING);
2441   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2442   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2443 
2444   /* At this point in lexing, the quote characters are treated as part of
2445      the string (they are stripped off by cpp_interpret_string).  */
2446 
2447   ASSERT_EQ (tok->val.str.len, 12);
2448 
2449   /* Verify that cpp_interpret_string works.  */
2450   cpp_string dst_string;
2451   const enum cpp_ttype type = CPP_STRING;
2452   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2453 				      &dst_string, type);
2454   ASSERT_TRUE (result);
2455   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2456   free (const_cast <unsigned char *> (dst_string.text));
2457 
2458   /* Verify ranges of individual characters.  This no longer includes the
2459      opening quote, but does include the closing quote.  */
2460   for (int i = 0; i <= 10; i++)
2461     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2462 			  10 + i, 10 + i);
2463 
2464   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2465 }
2466 
2467 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2468    encoding.  */
2469 
2470 static void
test_lexer_string_locations_ebcdic(const line_table_case & case_)2471 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2472 {
2473   /* EBCDIC support requires iconv.  */
2474   if (!HAVE_ICONV)
2475     return;
2476 
2477   /* Digits 0-9 (with 0 at column 10), the simple way.
2478      ....................000000000.11111111112.2222222223333333333
2479      ....................123456789.01234567890.1234567890123456789
2480      We add a trailing comment to ensure that we correctly locate
2481      the end of the string literal token.  */
2482   const char *content = "        \"0123456789\" /* not a string */\n";
2483   ebcdic_execution_charset use_ebcdic;
2484   lexer_test test (case_, content, &use_ebcdic);
2485 
2486   /* Verify that we get the expected token back, with the correct
2487      location information.  */
2488   const cpp_token *tok = test.get_token ();
2489   ASSERT_EQ (tok->type, CPP_STRING);
2490   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2491   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2492 
2493   /* At this point in lexing, the quote characters are treated as part of
2494      the string (they are stripped off by cpp_interpret_string).  */
2495 
2496   ASSERT_EQ (tok->val.str.len, 12);
2497 
2498   /* The remainder of the test requires an iconv implementation that
2499      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2500   if (use_ebcdic.iconv_errors_occurred_p ())
2501     return;
2502 
2503   /* Verify that cpp_interpret_string works.  */
2504   cpp_string dst_string;
2505   const enum cpp_ttype type = CPP_STRING;
2506   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2507 				      &dst_string, type);
2508   ASSERT_TRUE (result);
2509   /* We should now have EBCDIC-encoded text, specifically
2510      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2511      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2512   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2513 		(const char *)dst_string.text);
2514   free (const_cast <unsigned char *> (dst_string.text));
2515 
2516   /* Verify that we don't attempt to record substring location information
2517      for such cases.  */
2518   ASSERT_HAS_NO_SUBSTRING_RANGES
2519     (test, tok->src_loc, type,
2520      "execution character set != source character set");
2521 }
2522 
2523 /* Lex a string literal containing a hex-escaped character.
2524    Verify the substring location data, before and after running
2525    cpp_interpret_string on it.  */
2526 
2527 static void
test_lexer_string_locations_hex(const line_table_case & case_)2528 test_lexer_string_locations_hex (const line_table_case &case_)
2529 {
2530   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2531      and with a space in place of digit 6, to terminate the escaped
2532      hex code.
2533      ....................000000000.111111.11112222.
2534      ....................123456789.012345.67890123.  */
2535   const char *content = "        \"01234\\x35 789\"\n";
2536   lexer_test test (case_, content, NULL);
2537 
2538   /* Verify that we get the expected token back, with the correct
2539      location information.  */
2540   const cpp_token *tok = test.get_token ();
2541   ASSERT_EQ (tok->type, CPP_STRING);
2542   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2543   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2544 
2545   /* At this point in lexing, the quote characters are treated as part of
2546      the string (they are stripped off by cpp_interpret_string).  */
2547   ASSERT_EQ (tok->val.str.len, 15);
2548 
2549   /* Verify that cpp_interpret_string works.  */
2550   cpp_string dst_string;
2551   const enum cpp_ttype type = CPP_STRING;
2552   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2553 				      &dst_string, type);
2554   ASSERT_TRUE (result);
2555   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2556   free (const_cast <unsigned char *> (dst_string.text));
2557 
2558   /* Verify ranges of individual characters.  This no longer includes the
2559      opening quote, but does include the closing quote.  */
2560   for (int i = 0; i <= 4; i++)
2561     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2562   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2563   for (int i = 6; i <= 10; i++)
2564     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2565 
2566   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2567 }
2568 
2569 /* Lex a string literal containing an octal-escaped character.
2570    Verify the substring location data after running cpp_interpret_string
2571    on it.  */
2572 
2573 static void
test_lexer_string_locations_oct(const line_table_case & case_)2574 test_lexer_string_locations_oct (const line_table_case &case_)
2575 {
2576   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2577      and with a space in place of digit 6, to terminate the escaped
2578      octal code.
2579      ....................000000000.111111.11112222.2222223333333333444
2580      ....................123456789.012345.67890123.4567890123456789012  */
2581   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2582   lexer_test test (case_, content, NULL);
2583 
2584   /* Verify that we get the expected token back, with the correct
2585      location information.  */
2586   const cpp_token *tok = test.get_token ();
2587   ASSERT_EQ (tok->type, CPP_STRING);
2588   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2589 
2590   /* Verify that cpp_interpret_string works.  */
2591   cpp_string dst_string;
2592   const enum cpp_ttype type = CPP_STRING;
2593   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2594 				      &dst_string, type);
2595   ASSERT_TRUE (result);
2596   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2597   free (const_cast <unsigned char *> (dst_string.text));
2598 
2599   /* Verify ranges of individual characters.  This no longer includes the
2600      opening quote, but does include the closing quote.  */
2601   for (int i = 0; i < 5; i++)
2602     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2603   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2604   for (int i = 6; i <= 10; i++)
2605     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2606 
2607   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2608 }
2609 
2610 /* Test of string literal containing letter escapes.  */
2611 
2612 static void
test_lexer_string_locations_letter_escape_1(const line_table_case & case_)2613 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2614 {
2615   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2616      .....................000000000.1.11111.1.1.11222.22222223333333
2617      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2618   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2619   lexer_test test (case_, content, NULL);
2620 
2621   /* Verify that we get the expected tokens back.  */
2622   const cpp_token *tok = test.get_token ();
2623   ASSERT_EQ (tok->type, CPP_STRING);
2624   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2625 
2626   /* Verify ranges of individual characters. */
2627   /* "\t".  */
2628   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2629 			0, 1, 10, 11);
2630   /* "foo". */
2631   for (int i = 1; i <= 3; i++)
2632     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2633 			  i, 1, 11 + i, 11 + i);
2634   /* "\\" and "\n".  */
2635   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2636 			4, 1, 15, 16);
2637   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2638 			5, 1, 17, 18);
2639 
2640   /* "bar" and closing quote for nul-terminator.  */
2641   for (int i = 6; i <= 9; i++)
2642     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2643 			  i, 1, 13 + i, 13 + i);
2644 
2645   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2646 }
2647 
2648 /* Another test of a string literal containing a letter escape.
2649    Based on string seen in
2650      printf ("%-%\n");
2651    in gcc.dg/format/c90-printf-1.c.  */
2652 
2653 static void
test_lexer_string_locations_letter_escape_2(const line_table_case & case_)2654 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2655 {
2656   /* .....................000000000.1111.11.1111.22222222223.
2657      .....................123456789.0123.45.6789.01234567890.  */
2658   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2659   lexer_test test (case_, content, NULL);
2660 
2661   /* Verify that we get the expected tokens back.  */
2662   const cpp_token *tok = test.get_token ();
2663   ASSERT_EQ (tok->type, CPP_STRING);
2664   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2665 
2666   /* Verify ranges of individual characters. */
2667   /* "%-%".  */
2668   for (int i = 0; i < 3; i++)
2669     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2670 			  i, 1, 10 + i, 10 + i);
2671   /* "\n".  */
2672   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2673 			3, 1, 13, 14);
2674 
2675   /* Closing quote for nul-terminator.  */
2676   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2677 			4, 1, 15, 15);
2678 
2679   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2680 }
2681 
2682 /* Lex a string literal containing UCN 4 characters.
2683    Verify the substring location data after running cpp_interpret_string
2684    on it.  */
2685 
2686 static void
test_lexer_string_locations_ucn4(const line_table_case & case_)2687 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2688 {
2689   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2690      as UCN 4.
2691      ....................000000000.111111.111122.222222223.33333333344444
2692      ....................123456789.012345.678901.234567890.12345678901234  */
2693   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2694   lexer_test test (case_, content, NULL);
2695 
2696   /* Verify that we get the expected token back, with the correct
2697      location information.  */
2698   const cpp_token *tok = test.get_token ();
2699   ASSERT_EQ (tok->type, CPP_STRING);
2700   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2701 
2702   /* Verify that cpp_interpret_string works.
2703      The string should be encoded in the execution character
2704      set.  Assuming that is UTF-8, we should have the following:
2705      -----------  ----  -----  -------  ----------------
2706      Byte offset  Byte  Octal  Unicode  Source Column(s)
2707      -----------  ----  -----  -------  ----------------
2708      0            0x30         '0'      10
2709      1            0x31         '1'      11
2710      2            0x32         '2'      12
2711      3            0x33         '3'      13
2712      4            0x34         '4'      14
2713      5            0xE2  \342   U+2174   15-20
2714      6            0x85  \205    (cont)  15-20
2715      7            0xB4  \264    (cont)  15-20
2716      8            0xE2  \342   U+2175   21-26
2717      9            0x85  \205    (cont)  21-26
2718      10           0xB5  \265    (cont)  21-26
2719      11           0x37         '7'      27
2720      12           0x38         '8'      28
2721      13           0x39         '9'      29
2722      14           0x00                  30 (closing quote)
2723      -----------  ----  -----  -------  ---------------.  */
2724 
2725   cpp_string dst_string;
2726   const enum cpp_ttype type = CPP_STRING;
2727   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2728 				      &dst_string, type);
2729   ASSERT_TRUE (result);
2730   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2731 		(const char *)dst_string.text);
2732   free (const_cast <unsigned char *> (dst_string.text));
2733 
2734   /* Verify ranges of individual characters.  This no longer includes the
2735      opening quote, but does include the closing quote.
2736      '01234'.  */
2737   for (int i = 0; i <= 4; i++)
2738     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2739   /* U+2174.  */
2740   for (int i = 5; i <= 7; i++)
2741     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2742   /* U+2175.  */
2743   for (int i = 8; i <= 10; i++)
2744     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2745   /* '789' and nul terminator  */
2746   for (int i = 11; i <= 14; i++)
2747     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2748 
2749   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2750 }
2751 
2752 /* Lex a string literal containing UCN 8 characters.
2753    Verify the substring location data after running cpp_interpret_string
2754    on it.  */
2755 
2756 static void
test_lexer_string_locations_ucn8(const line_table_case & case_)2757 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2758 {
2759   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2760      ....................000000000.111111.1111222222.2222333333333.344444
2761      ....................123456789.012345.6789012345.6789012345678.901234  */
2762   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2763   lexer_test test (case_, content, NULL);
2764 
2765   /* Verify that we get the expected token back, with the correct
2766      location information.  */
2767   const cpp_token *tok = test.get_token ();
2768   ASSERT_EQ (tok->type, CPP_STRING);
2769   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2770 			   "\"01234\\U00002174\\U00002175789\"");
2771 
2772   /* Verify that cpp_interpret_string works.
2773      The UTF-8 encoding of the string is identical to that from
2774      the ucn4 testcase above; the only difference is the column
2775      locations.  */
2776   cpp_string dst_string;
2777   const enum cpp_ttype type = CPP_STRING;
2778   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2779 				      &dst_string, type);
2780   ASSERT_TRUE (result);
2781   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2782 		(const char *)dst_string.text);
2783   free (const_cast <unsigned char *> (dst_string.text));
2784 
2785   /* Verify ranges of individual characters.  This no longer includes the
2786      opening quote, but does include the closing quote.
2787      '01234'.  */
2788   for (int i = 0; i <= 4; i++)
2789     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2790   /* U+2174.  */
2791   for (int i = 5; i <= 7; i++)
2792     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2793   /* U+2175.  */
2794   for (int i = 8; i <= 10; i++)
2795     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2796   /* '789' at columns 35-37  */
2797   for (int i = 11; i <= 13; i++)
2798     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2799   /* Closing quote/nul-terminator at column 38.  */
2800   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2801 
2802   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2803 }
2804 
/* Read the 32-bit big-endian value stored at PTR_BE_VALUE and return
   it in host byte order.  The storage is read one byte at a time,
   most significant byte first.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  uint32_t value = 0;

  /* Shift in each byte below the ones already accumulated.  */
  for (int pos = 0; pos < 4; pos++)
    value = (value << 8) | (uint32_t) bytes[pos];

  return value;
}
2816 
2817 /* Lex a wide string literal and verify that attempts to read substring
2818    location data from it fail gracefully.  */
2819 
2820 static void
test_lexer_string_locations_wide_string(const line_table_case & case_)2821 test_lexer_string_locations_wide_string (const line_table_case &case_)
2822 {
2823   /* Digits 0-9.
2824      ....................000000000.11111111112.22222222233333
2825      ....................123456789.01234567890.12345678901234  */
2826   const char *content = "       L\"0123456789\" /* non-str */\n";
2827   lexer_test test (case_, content, NULL);
2828 
2829   /* Verify that we get the expected token back, with the correct
2830      location information.  */
2831   const cpp_token *tok = test.get_token ();
2832   ASSERT_EQ (tok->type, CPP_WSTRING);
2833   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2834 
2835   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2836   cpp_string dst_string;
2837   const enum cpp_ttype type = CPP_WSTRING;
2838   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2839 				      &dst_string, type);
2840   ASSERT_TRUE (result);
2841   /* The cpp_reader defaults to big-endian with
2842      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2843      now be encoded as UTF-32BE.  */
2844   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2845   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2846   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2847   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2848   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2849   free (const_cast <unsigned char *> (dst_string.text));
2850 
2851   /* We don't yet support generating substring location information
2852      for L"" strings.  */
2853   ASSERT_HAS_NO_SUBSTRING_RANGES
2854     (test, tok->src_loc, type,
2855      "execution character set != source character set");
2856 }
2857 
/* Read the 16-bit big-endian value stored at PTR_BE_VALUE and return
   it in host byte order.  The storage is read one byte at a time.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  const unsigned int high_byte = bytes[0];
  const unsigned int low_byte = bytes[1];

  return (uint16_t) ((high_byte << 8) | low_byte);
}
2866 
2867 /* Lex a u"" string literal and verify that attempts to read substring
2868    location data from it fail gracefully.  */
2869 
2870 static void
test_lexer_string_locations_string16(const line_table_case & case_)2871 test_lexer_string_locations_string16 (const line_table_case &case_)
2872 {
2873   /* Digits 0-9.
2874      ....................000000000.11111111112.22222222233333
2875      ....................123456789.01234567890.12345678901234  */
2876   const char *content = "       u\"0123456789\" /* non-str */\n";
2877   lexer_test test (case_, content, NULL);
2878 
2879   /* Verify that we get the expected token back, with the correct
2880      location information.  */
2881   const cpp_token *tok = test.get_token ();
2882   ASSERT_EQ (tok->type, CPP_STRING16);
2883   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2884 
2885   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2886   cpp_string dst_string;
2887   const enum cpp_ttype type = CPP_STRING16;
2888   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2889 				      &dst_string, type);
2890   ASSERT_TRUE (result);
2891 
2892   /* The cpp_reader defaults to big-endian, so dst_string should
2893      now be encoded as UTF-16BE.  */
2894   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2895   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2896   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2897   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2898   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2899   free (const_cast <unsigned char *> (dst_string.text));
2900 
2901   /* We don't yet support generating substring location information
2902      for L"" strings.  */
2903   ASSERT_HAS_NO_SUBSTRING_RANGES
2904     (test, tok->src_loc, type,
2905      "execution character set != source character set");
2906 }
2907 
2908 /* Lex a U"" string literal and verify that attempts to read substring
2909    location data from it fail gracefully.  */
2910 
2911 static void
test_lexer_string_locations_string32(const line_table_case & case_)2912 test_lexer_string_locations_string32 (const line_table_case &case_)
2913 {
2914   /* Digits 0-9.
2915      ....................000000000.11111111112.22222222233333
2916      ....................123456789.01234567890.12345678901234  */
2917   const char *content = "       U\"0123456789\" /* non-str */\n";
2918   lexer_test test (case_, content, NULL);
2919 
2920   /* Verify that we get the expected token back, with the correct
2921      location information.  */
2922   const cpp_token *tok = test.get_token ();
2923   ASSERT_EQ (tok->type, CPP_STRING32);
2924   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2925 
2926   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2927   cpp_string dst_string;
2928   const enum cpp_ttype type = CPP_STRING32;
2929   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2930 				      &dst_string, type);
2931   ASSERT_TRUE (result);
2932 
2933   /* The cpp_reader defaults to big-endian, so dst_string should
2934      now be encoded as UTF-32BE.  */
2935   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2936   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2937   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2938   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2939   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2940   free (const_cast <unsigned char *> (dst_string.text));
2941 
2942   /* We don't yet support generating substring location information
2943      for L"" strings.  */
2944   ASSERT_HAS_NO_SUBSTRING_RANGES
2945     (test, tok->src_loc, type,
2946      "execution character set != source character set");
2947 }
2948 
2949 /* Lex a u8-string literal.
2950    Verify the substring location data after running cpp_interpret_string
2951    on it.  */
2952 
2953 static void
test_lexer_string_locations_u8(const line_table_case & case_)2954 test_lexer_string_locations_u8 (const line_table_case &case_)
2955 {
2956   /* Digits 0-9.
2957      ....................000000000.11111111112.22222222233333
2958      ....................123456789.01234567890.12345678901234  */
2959   const char *content = "      u8\"0123456789\" /* non-str */\n";
2960   lexer_test test (case_, content, NULL);
2961 
2962   /* Verify that we get the expected token back, with the correct
2963      location information.  */
2964   const cpp_token *tok = test.get_token ();
2965   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2966   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2967 
2968   /* Verify that cpp_interpret_string works.  */
2969   cpp_string dst_string;
2970   const enum cpp_ttype type = CPP_STRING;
2971   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2972 				      &dst_string, type);
2973   ASSERT_TRUE (result);
2974   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2975   free (const_cast <unsigned char *> (dst_string.text));
2976 
2977   /* Verify ranges of individual characters.  This no longer includes the
2978      opening quote, but does include the closing quote.  */
2979   for (int i = 0; i <= 10; i++)
2980     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2981 }
2982 
2983 /* Lex a string literal containing UTF-8 source characters.
2984    Verify the substring location data after running cpp_interpret_string
2985    on it.  */
2986 
2987 static void
test_lexer_string_locations_utf8_source(const line_table_case & case_)2988 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2989 {
2990  /* This string literal is written out to the source file as UTF-8,
2991     and is of the form "before mojibake after", where "mojibake"
2992     is written as the following four unicode code points:
2993        U+6587 CJK UNIFIED IDEOGRAPH-6587
2994        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2995        U+5316 CJK UNIFIED IDEOGRAPH-5316
2996        U+3051 HIRAGANA LETTER KE.
2997      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2998      "before" and "after" are 1 byte per unicode character.
2999 
3000      The numbering shown are "columns", which are *byte* numbers within
3001      the line, rather than unicode character numbers.
3002 
3003      .................... 000000000.1111111.
3004      .................... 123456789.0123456.  */
3005   const char *content = ("        \"before "
3006 			 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3007 			      UTF-8: 0xE6 0x96 0x87
3008 			      C octal escaped UTF-8: \346\226\207
3009 			    "column" numbers: 17-19.  */
3010 			 "\346\226\207"
3011 
3012 			 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3013 			      UTF-8: 0xE5 0xAD 0x97
3014 			      C octal escaped UTF-8: \345\255\227
3015 			    "column" numbers: 20-22.  */
3016 			 "\345\255\227"
3017 
3018 			 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3019 			      UTF-8: 0xE5 0x8C 0x96
3020 			      C octal escaped UTF-8: \345\214\226
3021 			    "column" numbers: 23-25.  */
3022 			 "\345\214\226"
3023 
3024 			 /* U+3051 HIRAGANA LETTER KE
3025 			      UTF-8: 0xE3 0x81 0x91
3026 			      C octal escaped UTF-8: \343\201\221
3027 			    "column" numbers: 26-28.  */
3028 			 "\343\201\221"
3029 
3030 			 /* column numbers 29 onwards
3031 			  2333333.33334444444444
3032 			  9012345.67890123456789. */
3033 			 " after\" /* non-str */\n");
3034   lexer_test test (case_, content, NULL);
3035 
3036   /* Verify that we get the expected token back, with the correct
3037      location information.  */
3038   const cpp_token *tok = test.get_token ();
3039   ASSERT_EQ (tok->type, CPP_STRING);
3040   ASSERT_TOKEN_AS_TEXT_EQ
3041     (test.m_parser, tok,
3042      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3043 
3044   /* Verify that cpp_interpret_string works.  */
3045   cpp_string dst_string;
3046   const enum cpp_ttype type = CPP_STRING;
3047   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3048 				      &dst_string, type);
3049   ASSERT_TRUE (result);
3050   ASSERT_STREQ
3051     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3052      (const char *)dst_string.text);
3053   free (const_cast <unsigned char *> (dst_string.text));
3054 
3055   /* Verify ranges of individual characters.  This no longer includes the
3056      opening quote, but does include the closing quote.
3057      Assuming that both source and execution encodings are UTF-8, we have
3058      a run of 25 octets in each, plus the NUL terminator.  */
3059   for (int i = 0; i < 25; i++)
3060     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3061   /* NUL-terminator should use the closing quote at column 35.  */
3062   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3063 
3064   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3065 }
3066 
3067 /* Test of string literal concatenation.  */
3068 
3069 static void
test_lexer_string_locations_concatenation_1(const line_table_case & case_)3070 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3071 {
3072   /* Digits 0-9.
3073      .....................000000000.111111.11112222222222
3074      .....................123456789.012345.67890123456789.  */
3075   const char *content = ("        \"01234\" /* non-str */\n"
3076 			 "        \"56789\" /* non-str */\n");
3077   lexer_test test (case_, content, NULL);
3078 
3079   location_t input_locs[2];
3080 
3081   /* Verify that we get the expected tokens back.  */
3082   auto_vec <cpp_string> input_strings;
3083   const cpp_token *tok_a = test.get_token ();
3084   ASSERT_EQ (tok_a->type, CPP_STRING);
3085   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3086   input_strings.safe_push (tok_a->val.str);
3087   input_locs[0] = tok_a->src_loc;
3088 
3089   const cpp_token *tok_b = test.get_token ();
3090   ASSERT_EQ (tok_b->type, CPP_STRING);
3091   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3092   input_strings.safe_push (tok_b->val.str);
3093   input_locs[1] = tok_b->src_loc;
3094 
3095   /* Verify that cpp_interpret_string works.  */
3096   cpp_string dst_string;
3097   const enum cpp_ttype type = CPP_STRING;
3098   bool result = cpp_interpret_string (test.m_parser,
3099 				      input_strings.address (), 2,
3100 				      &dst_string, type);
3101   ASSERT_TRUE (result);
3102   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3103   free (const_cast <unsigned char *> (dst_string.text));
3104 
3105   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3106   test.m_concats.record_string_concatenation (2, input_locs);
3107 
3108   location_t initial_loc = input_locs[0];
3109 
3110   /* "01234" on line 1.  */
3111   for (int i = 0; i <= 4; i++)
3112     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3113   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3114   for (int i = 5; i <= 10; i++)
3115     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3116 
3117   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3118 }
3119 
3120 /* Another test of string literal concatenation.  */
3121 
3122 static void
test_lexer_string_locations_concatenation_2(const line_table_case & case_)3123 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3124 {
3125   /* Digits 0-9.
3126      .....................000000000.111.11111112222222
3127      .....................123456789.012.34567890123456.  */
3128   const char *content = ("        \"01\" /* non-str */\n"
3129 			 "        \"23\" /* non-str */\n"
3130 			 "        \"45\" /* non-str */\n"
3131 			 "        \"67\" /* non-str */\n"
3132 			 "        \"89\" /* non-str */\n");
3133   lexer_test test (case_, content, NULL);
3134 
3135   auto_vec <cpp_string> input_strings;
3136   location_t input_locs[5];
3137 
3138   /* Verify that we get the expected tokens back.  */
3139   for (int i = 0; i < 5; i++)
3140     {
3141       const cpp_token *tok = test.get_token ();
3142       ASSERT_EQ (tok->type, CPP_STRING);
3143       input_strings.safe_push (tok->val.str);
3144       input_locs[i] = tok->src_loc;
3145     }
3146 
3147   /* Verify that cpp_interpret_string works.  */
3148   cpp_string dst_string;
3149   const enum cpp_ttype type = CPP_STRING;
3150   bool result = cpp_interpret_string (test.m_parser,
3151 				      input_strings.address (), 5,
3152 				      &dst_string, type);
3153   ASSERT_TRUE (result);
3154   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3155   free (const_cast <unsigned char *> (dst_string.text));
3156 
3157   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3158   test.m_concats.record_string_concatenation (5, input_locs);
3159 
3160   location_t initial_loc = input_locs[0];
3161 
3162   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3163      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3164      and expect get_source_range_for_substring to fail.
3165      However, for a string concatenation test, we can have a case
3166      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3167      but subsequent strings can be after it.
3168      Attempting to detect this within assert_char_at_range
3169      would overcomplicate the logic for the common test cases, so
3170      we detect it here.  */
3171   if (should_have_column_data_p (input_locs[0])
3172       && !should_have_column_data_p (input_locs[4]))
3173     {
3174       /* Verify that get_source_range_for_substring gracefully rejects
3175 	 this case.  */
3176       source_range actual_range;
3177       const char *err
3178 	= get_source_range_for_char (test.m_parser, &test.m_concats,
3179 				     initial_loc, type, 0, &actual_range);
3180       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3181       return;
3182     }
3183 
3184   for (int i = 0; i < 5; i++)
3185     for (int j = 0; j < 2; j++)
3186       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3187 			    i + 1, 10 + j, 10 + j);
3188 
3189   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3190   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3191 
3192   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3193 }
3194 
3195 /* Another test of string literal concatenation, this time combined with
3196    various kinds of escaped characters.  */
3197 
3198 static void
test_lexer_string_locations_concatenation_3(const line_table_case & case_)3199 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3200 {
3201   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3202      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3203   const char *content
3204     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3205        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3206     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3207   lexer_test test (case_, content, NULL);
3208 
3209   auto_vec <cpp_string> input_strings;
3210   location_t input_locs[4];
3211 
3212   /* Verify that we get the expected tokens back.  */
3213   for (int i = 0; i < 4; i++)
3214     {
3215       const cpp_token *tok = test.get_token ();
3216       ASSERT_EQ (tok->type, CPP_STRING);
3217       input_strings.safe_push (tok->val.str);
3218       input_locs[i] = tok->src_loc;
3219     }
3220 
3221   /* Verify that cpp_interpret_string works.  */
3222   cpp_string dst_string;
3223   const enum cpp_ttype type = CPP_STRING;
3224   bool result = cpp_interpret_string (test.m_parser,
3225 				      input_strings.address (), 4,
3226 				      &dst_string, type);
3227   ASSERT_TRUE (result);
3228   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3229   free (const_cast <unsigned char *> (dst_string.text));
3230 
3231   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3232   test.m_concats.record_string_concatenation (4, input_locs);
3233 
3234   location_t initial_loc = input_locs[0];
3235 
3236   for (int i = 0; i <= 4; i++)
3237     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3238   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3239   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3240   for (int i = 7; i <= 9; i++)
3241     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3242 
3243   /* NUL-terminator should use the location of the final closing quote.  */
3244   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3245 
3246   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3247 }
3248 
3249 /* Test of string literal in a macro.  */
3250 
3251 static void
test_lexer_string_locations_macro(const line_table_case & case_)3252 test_lexer_string_locations_macro (const line_table_case &case_)
3253 {
3254   /* Digits 0-9.
3255      .....................0000000001111111111.22222222223.
3256      .....................1234567890123456789.01234567890.  */
3257   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3258 			 "  MACRO");
3259   lexer_test test (case_, content, NULL);
3260 
3261   /* Verify that we get the expected tokens back.  */
3262   const cpp_token *tok = test.get_token ();
3263   ASSERT_EQ (tok->type, CPP_PADDING);
3264 
3265   tok = test.get_token ();
3266   ASSERT_EQ (tok->type, CPP_STRING);
3267   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3268 
3269   /* Verify ranges of individual characters.  We ought to
3270      see columns within the macro definition.  */
3271   for (int i = 0; i <= 10; i++)
3272     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3273 			  i, 1, 20 + i, 20 + i);
3274 
3275   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3276 
3277   tok = test.get_token ();
3278   ASSERT_EQ (tok->type, CPP_PADDING);
3279 }
3280 
3281 /* Test of stringification of a macro argument.  */
3282 
3283 static void
test_lexer_string_locations_stringified_macro_argument(const line_table_case & case_)3284 test_lexer_string_locations_stringified_macro_argument
3285   (const line_table_case &case_)
3286 {
3287   /* .....................000000000111111111122222222223.
3288      .....................123456789012345678901234567890.  */
3289   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3290 			 "MACRO(foo)\n");
3291   lexer_test test (case_, content, NULL);
3292 
3293   /* Verify that we get the expected token back.  */
3294   const cpp_token *tok = test.get_token ();
3295   ASSERT_EQ (tok->type, CPP_PADDING);
3296 
3297   tok = test.get_token ();
3298   ASSERT_EQ (tok->type, CPP_STRING);
3299   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3300 
3301   /* We don't support getting the location of a stringified macro
3302      argument.  Verify that it fails gracefully.  */
3303   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3304 				  "cpp_interpret_string_1 failed");
3305 
3306   tok = test.get_token ();
3307   ASSERT_EQ (tok->type, CPP_PADDING);
3308 
3309   tok = test.get_token ();
3310   ASSERT_EQ (tok->type, CPP_PADDING);
3311 }
3312 
3313 /* Ensure that we are fail gracefully if something attempts to pass
3314    in a location that isn't a string literal token.  Seen on this code:
3315 
3316      const char a[] = " %d ";
3317      __builtin_printf (a, 0.5);
3318                        ^
3319 
3320    when c-format.c erroneously used the indicated one-character
3321    location as the format string location, leading to a read past the
3322    end of a string buffer in cpp_interpret_string_1.  */
3323 
3324 static void
test_lexer_string_locations_non_string(const line_table_case & case_)3325 test_lexer_string_locations_non_string (const line_table_case &case_)
3326 {
3327   /* .....................000000000111111111122222222223.
3328      .....................123456789012345678901234567890.  */
3329   const char *content = ("         a\n");
3330   lexer_test test (case_, content, NULL);
3331 
3332   /* Verify that we get the expected token back.  */
3333   const cpp_token *tok = test.get_token ();
3334   ASSERT_EQ (tok->type, CPP_NAME);
3335   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3336 
3337   /* At this point, libcpp is attempting to interpret the name as a
3338      string literal, despite it not starting with a quote.  We don't detect
3339      that, but we should at least fail gracefully.  */
3340   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3341 				  "cpp_interpret_string_1 failed");
3342 }
3343 
3344 /* Ensure that we can read substring information for a token which
3345    starts in one linemap and ends in another .  Adapted from
3346    gcc.dg/cpp/pr69985.c.  */
3347 
3348 static void
test_lexer_string_locations_long_line(const line_table_case & case_)3349 test_lexer_string_locations_long_line (const line_table_case &case_)
3350 {
3351   /* .....................000000.000111111111
3352      .....................123456.789012346789.  */
3353   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3354 			 "     \"0123456789012345678901234567890123456789"
3355 			 "0123456789012345678901234567890123456789"
3356 			 "0123456789012345678901234567890123456789"
3357 			 "0123456789\"\n");
3358 
3359   lexer_test test (case_, content, NULL);
3360 
3361   /* Verify that we get the expected token back.  */
3362   const cpp_token *tok = test.get_token ();
3363   ASSERT_EQ (tok->type, CPP_STRING);
3364 
3365   if (!should_have_column_data_p (line_table->highest_location))
3366     return;
3367 
3368   /* Verify ranges of individual characters.  */
3369   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3370   for (int i = 0; i < 131; i++)
3371     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3372 			  i, 2, 7 + i, 7 + i);
3373 }
3374 
3375 /* Test of locations within a raw string that doesn't contain a newline.  */
3376 
3377 static void
test_lexer_string_locations_raw_string_one_line(const line_table_case & case_)3378 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3379 {
3380   /* .....................00.0000000111111111122.
3381      .....................12.3456789012345678901.  */
3382   const char *content = ("R\"foo(0123456789)foo\"\n");
3383   lexer_test test (case_, content, NULL);
3384 
3385   /* Verify that we get the expected token back.  */
3386   const cpp_token *tok = test.get_token ();
3387   ASSERT_EQ (tok->type, CPP_STRING);
3388 
3389   /* Verify that cpp_interpret_string works.  */
3390   cpp_string dst_string;
3391   const enum cpp_ttype type = CPP_STRING;
3392   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3393 				      &dst_string, type);
3394   ASSERT_TRUE (result);
3395   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3396   free (const_cast <unsigned char *> (dst_string.text));
3397 
3398   if (!should_have_column_data_p (line_table->highest_location))
3399     return;
3400 
3401   /* 0-9, plus the nil terminator.  */
3402   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3403   for (int i = 0; i < 11; i++)
3404     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3405 			  i, 1, 7 + i, 7 + i);
3406 }
3407 
3408 /* Test of locations within a raw string that contains a newline.  */
3409 
3410 static void
test_lexer_string_locations_raw_string_multiline(const line_table_case & case_)3411 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3412 {
3413   /* .....................00.0000.
3414      .....................12.3456.  */
3415   const char *content = ("R\"foo(\n"
3416   /* .....................00000.
3417      .....................12345.  */
3418 			 "hello\n"
3419 			 "world\n"
3420   /* .....................00000.
3421      .....................12345.  */
3422 			 ")foo\"\n");
3423   lexer_test test (case_, content, NULL);
3424 
3425   /* Verify that we get the expected token back.  */
3426   const cpp_token *tok = test.get_token ();
3427   ASSERT_EQ (tok->type, CPP_STRING);
3428 
3429   /* Verify that cpp_interpret_string works.  */
3430   cpp_string dst_string;
3431   const enum cpp_ttype type = CPP_STRING;
3432   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3433 				      &dst_string, type);
3434   ASSERT_TRUE (result);
3435   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3436   free (const_cast <unsigned char *> (dst_string.text));
3437 
3438   if (!should_have_column_data_p (line_table->highest_location))
3439     return;
3440 
3441   /* Currently we don't support locations within raw strings that
3442      contain newlines.  */
3443   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3444 				  "range endpoints are on different lines");
3445 }
3446 
3447 /* Test of parsing an unterminated raw string.  */
3448 
3449 static void
test_lexer_string_locations_raw_string_unterminated(const line_table_case & case_)3450 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3451 {
3452   const char *content = "R\"ouch()ouCh\" /* etc */";
3453 
3454   lexer_diagnostic_sink diagnostics;
3455   lexer_test test (case_, content, &diagnostics);
3456   test.m_implicitly_expect_EOF = false;
3457 
3458   /* Attempt to parse the raw string.  */
3459   const cpp_token *tok = test.get_token ();
3460   ASSERT_EQ (tok->type, CPP_EOF);
3461 
3462   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3463   /* We expect the message "unterminated raw string"
3464      in the "cpplib" translation domain.
3465      It's not clear that dgettext is available on all supported hosts,
3466      so this assertion is commented-out for now.
3467        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3468                      diagnostics.m_diagnostics[0]);
3469   */
3470 }
3471 
3472 /* Test of lexing char constants.  */
3473 
3474 static void
test_lexer_char_constants(const line_table_case & case_)3475 test_lexer_char_constants (const line_table_case &case_)
3476 {
3477   /* Various char constants.
3478      .....................0000000001111111111.22222222223.
3479      .....................1234567890123456789.01234567890.  */
3480   const char *content = ("         'a'\n"
3481 			 "        u'a'\n"
3482 			 "        U'a'\n"
3483 			 "        L'a'\n"
3484 			 "         'abc'\n");
3485   lexer_test test (case_, content, NULL);
3486 
3487   /* Verify that we get the expected tokens back.  */
3488   /* 'a'.  */
3489   const cpp_token *tok = test.get_token ();
3490   ASSERT_EQ (tok->type, CPP_CHAR);
3491   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3492 
3493   unsigned int chars_seen;
3494   int unsignedp;
3495   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3496 					  &chars_seen, &unsignedp);
3497   ASSERT_EQ (cc, 'a');
3498   ASSERT_EQ (chars_seen, 1);
3499 
3500   /* u'a'.  */
3501   tok = test.get_token ();
3502   ASSERT_EQ (tok->type, CPP_CHAR16);
3503   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3504 
3505   /* U'a'.  */
3506   tok = test.get_token ();
3507   ASSERT_EQ (tok->type, CPP_CHAR32);
3508   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3509 
3510   /* L'a'.  */
3511   tok = test.get_token ();
3512   ASSERT_EQ (tok->type, CPP_WCHAR);
3513   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3514 
3515   /* 'abc' (c-char-sequence).  */
3516   tok = test.get_token ();
3517   ASSERT_EQ (tok->type, CPP_CHAR);
3518   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3519 }
3520 /* A table of interesting location_t values, giving one axis of our test
3521    matrix.  */
3522 
3523 static const location_t boundary_locations[] = {
3524   /* Zero means "don't override the default values for a new line_table".  */
3525   0,
3526 
3527   /* An arbitrary non-zero value that isn't close to one of
3528      the boundary values below.  */
3529   0x10000,
3530 
3531   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3532   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3533   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3534   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3535   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3536   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3537 
3538   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3539   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3540   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3541   LINE_MAP_MAX_LOCATION_WITH_COLS,
3542   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3543   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3544 };
3545 
3546 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3547 
3548 void
for_each_line_table_case(void (* testcase)(const line_table_case &))3549 for_each_line_table_case (void (*testcase) (const line_table_case &))
3550 {
3551   /* As noted above in the description of struct line_table_case,
3552      we want to explore a test matrix of interesting line_table
3553      situations, running various selftests for each case within the
3554      matrix.  */
3555 
3556   /* Run all tests with:
3557      (a) line_table->default_range_bits == 0, and
3558      (b) line_table->default_range_bits == 5.  */
3559   int num_cases_tested = 0;
3560   for (int default_range_bits = 0; default_range_bits <= 5;
3561        default_range_bits += 5)
3562     {
3563       /* ...and use each of the "interesting" location values as
3564 	 the starting location within line_table.  */
3565       const int num_boundary_locations
3566 	= sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3567       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3568 	{
3569 	  line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3570 
3571 	  testcase (c);
3572 
3573 	  num_cases_tested++;
3574 	}
3575     }
3576 
3577   /* Verify that we fully covered the test matrix.  */
3578   ASSERT_EQ (num_cases_tested, 2 * 12);
3579 }
3580 
3581 /* Verify that when presented with a consecutive pair of locations with
3582    a very large line offset, we don't attempt to consolidate them into
3583    a single ordinary linemap where the line offsets within the line map
3584    would lead to overflow (PR lto/88147).  */
3585 
3586 static void
test_line_offset_overflow()3587 test_line_offset_overflow ()
3588 {
3589   line_table_test ltt (line_table_case (5, 0));
3590 
3591   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3592   linemap_line_start (line_table, 1, 100);
3593   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3594   assert_loceq ("foo.c", 2578, 0, loc_a);
3595 
3596   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3597   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3598   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3599 
3600   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3601   assert_loceq ("foo.c", 404198, 0, loc_b);
3602 
3603   /* We should have started a new linemap, rather than attempting to store
3604      a very large line offset.  */
3605   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3606   ASSERT_NE (ordmap_a, ordmap_b);
3607 }
3608 
test_cpp_utf8()3609 void test_cpp_utf8 ()
3610 {
3611   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3612   {
3613     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8);
3614     ASSERT_EQ (8, w_bad);
3615     int w_ctrl = cpp_display_width ("\r\t\n\v\0\1", 6);
3616     ASSERT_EQ (6, w_ctrl);
3617   }
3618 
3619   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3620   {
3621     const int w_pi = cpp_display_width ("\xcf\x80", 2);
3622     ASSERT_EQ (1, w_pi);
3623     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4);
3624     ASSERT_EQ (2, w_emoji);
3625     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2);
3626     ASSERT_EQ (1, w_umlaut_precomposed);
3627     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3);
3628     ASSERT_EQ (1, w_umlaut_combining);
3629     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3);
3630     ASSERT_EQ (2, w_han);
3631     const int w_ascii = cpp_display_width ("GCC", 3);
3632     ASSERT_EQ (3, w_ascii);
3633     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3634 					   "\x9f! \xe4\xb8\xba y\xcc\x88", 24);
3635     ASSERT_EQ (18, w_mixed);
3636   }
3637 
3638   /* Verify that cpp_byte_column_to_display_column can go past the end,
3639      and similar edge cases.  */
3640   {
3641     const char *str
3642       /* Display columns.
3643          111111112345  */
3644       = "\xcf\x80 abc";
3645       /* 111122223456
3646 	 Byte columns.  */
3647 
3648     ASSERT_EQ (5, cpp_display_width (str, 6));
3649     ASSERT_EQ (105, cpp_byte_column_to_display_column (str, 6, 106));
3650     ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000));
3651     ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0));
3652   }
3653 
3654   /* Verify that cpp_display_column_to_byte_column can go past the end,
3655      and similar edge cases, and check invertibility.  */
3656   {
3657     const char *str
3658       /* Display columns.
3659 	 000000000000000000000000000000000000011
3660 	 111111112222222234444444455555555678901  */
3661       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3662       /* 000000000000000000000000000000000111111
3663 	 111122223333444456666777788889999012345
3664 	 Byte columns.  */
3665     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2));
3666     ASSERT_EQ (15, cpp_display_column_to_byte_column (str, 15, 11));
3667     ASSERT_EQ (115, cpp_display_column_to_byte_column (str, 15, 111));
3668     ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000));
3669     ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0));
3670 
3671     /* Verify that we do not interrupt a UTF-8 sequence.  */
3672     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1));
3673 
3674     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3675       {
3676 	const int disp_col = cpp_byte_column_to_display_column (str, 15,
3677 								byte_col);
3678 	const int byte_col2 = cpp_display_column_to_byte_column (str, 15,
3679 								 disp_col);
3680 
3681 	/* If we ask for the display column in the middle of a UTF-8
3682 	   sequence, it will return the length of the partial sequence,
3683 	   matching the behavior of GCC before display column support.
3684 	   Otherwise check the round trip was successful.  */
3685 	if (byte_col < 4)
3686 	  ASSERT_EQ (byte_col, disp_col);
3687 	else if (byte_col >= 6 && byte_col < 9)
3688 	  ASSERT_EQ (3 + (byte_col - 5), disp_col);
3689 	else
3690 	  ASSERT_EQ (byte_col2, byte_col);
3691       }
3692   }
3693 
3694 }
3695 
3696 /* Run all of the selftests within this file.  */
3697 
3698 void
input_c_tests()3699 input_c_tests ()
3700 {
3701   test_linenum_comparisons ();
3702   test_should_have_column_data_p ();
3703   test_unknown_location ();
3704   test_builtins ();
3705   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3706 
3707   for_each_line_table_case (test_accessing_ordinary_linemaps);
3708   for_each_line_table_case (test_lexer);
3709   for_each_line_table_case (test_lexer_string_locations_simple);
3710   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3711   for_each_line_table_case (test_lexer_string_locations_hex);
3712   for_each_line_table_case (test_lexer_string_locations_oct);
3713   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3714   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3715   for_each_line_table_case (test_lexer_string_locations_ucn4);
3716   for_each_line_table_case (test_lexer_string_locations_ucn8);
3717   for_each_line_table_case (test_lexer_string_locations_wide_string);
3718   for_each_line_table_case (test_lexer_string_locations_string16);
3719   for_each_line_table_case (test_lexer_string_locations_string32);
3720   for_each_line_table_case (test_lexer_string_locations_u8);
3721   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3722   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3723   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3724   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3725   for_each_line_table_case (test_lexer_string_locations_macro);
3726   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3727   for_each_line_table_case (test_lexer_string_locations_non_string);
3728   for_each_line_table_case (test_lexer_string_locations_long_line);
3729   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3730   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3731   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3732   for_each_line_table_case (test_lexer_char_constants);
3733 
3734   test_reading_source_line ();
3735 
3736   test_line_offset_overflow ();
3737 
3738   test_cpp_utf8 ();
3739 }
3740 
3741 } // namespace selftest
3742 
3743 #endif /* CHECKING_P */
3744