1 /**
2 * @file tokenize.cpp
3 * This file breaks up the text stream into tokens or chunks.
4 *
5 * Each routine needs to set pc.len and pc.type.
6 *
7 * @author Ben Gardner
8 * @license GPL v2+
9 */
10
11 #include "tokenize.h"
12
13 #include "keywords.h"
14 #include "prototypes.h"
15 #include "punctuators.h"
16 #include "unc_ctype.h"
17
18 #include <regex>
19 #include <stack>
20
21
//! Shorthand for the line-ending counter in cpd.le_counts selected by LE_<x>,
//! e.g. LE_COUNT(CRLF) names cpd.le_counts[static_cast<size_t>(LE_CRLF)].
#define LE_COUNT(x) cpd.le_counts[static_cast<size_t>(LE_ ## x)]

//! NOTE(review): presumably the default log channel picked up by the logging
//! macros used in this file - confirm against the logger headers.
constexpr static auto LCURRENT = LTOK;

using namespace std;
using namespace uncrustify;
28
29
//! Snapshot of a read position inside the input text.
struct tok_info
{
   //! starts at the first character: index 0, row 1, column 1
   tok_info()
   {
      last_ch = 0;
      idx     = 0;
      row     = 1;
      col     = 1;
   }

   size_t last_ch; //! character most recently handed out by the reader
   size_t idx;     //! index of the next character to read
   size_t row;     //! row of the next character, counted from 1
   size_t col;     //! column of the next character, counted from 1
};
45
46
47 struct tok_ctx
48 {
tok_ctxtok_ctx49 tok_ctx(const deque<int> &d)
50 : data(d)
51 {
52 }
53
54
55 //! save before trying to parse something that may fail
savetok_ctx56 void save()
57 {
58 save(s);
59 }
60
61
savetok_ctx62 void save(tok_info &info)
63 {
64 info = c;
65 }
66
67
68 //! restore previous saved state
restoretok_ctx69 void restore()
70 {
71 restore(s);
72 }
73
74
restoretok_ctx75 void restore(const tok_info &info)
76 {
77 c = info;
78 }
79
80
moretok_ctx81 bool more()
82 {
83 return(c.idx < data.size());
84 }
85
86
peektok_ctx87 size_t peek()
88 {
89 return(more() ? data[c.idx] : 0);
90 }
91
92
peektok_ctx93 size_t peek(size_t idx)
94 {
95 idx += c.idx;
96 return((idx < data.size()) ? data[idx] : 0);
97 }
98
99
gettok_ctx100 size_t get()
101 {
102 if (more())
103 {
104 size_t ch = data[c.idx++];
105
106 switch (ch)
107 {
108 case '\t':
109 log_rule_B("input_tab_size");
110 c.col = calc_next_tab_column(c.col, options::input_tab_size());
111 break;
112
113 case '\n':
114
115 if (c.last_ch != '\r')
116 {
117 c.row++;
118 c.col = 1;
119 }
120 break;
121
122 case '\r':
123 c.row++;
124 c.col = 1;
125 break;
126
127 default:
128 c.col++;
129 break;
130 }
131 c.last_ch = ch;
132 return(ch);
133 }
134 return(0);
135 }
136
137
expecttok_ctx138 bool expect(size_t ch)
139 {
140 if (peek() == ch)
141 {
142 get();
143 return(true);
144 }
145 return(false);
146 }
147
148
149 const deque<int> &data;
150 tok_info c; //! current
151 tok_info s; //! saved
152 };
153
154
155 /**
156 * Count the number of characters in a quoted string.
157 * The next bit of text starts with a quote char " or ' or <.
158 * Count the number of characters until the matching character.
159 *
160 * @param pc The structure to update, str is an input.
161 *
162 * @return Whether a string was parsed
163 */
164 static bool parse_string(tok_ctx &ctx, chunk_t &pc, size_t quote_idx, bool allow_escape);
165
166
167 /**
168 * Literal string, ends with single "
169 * Two "" don't end the string.
170 *
171 * @param pc The structure to update, str is an input.
172 *
173 * @return Whether a string was parsed
174 */
175 static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc);
176
177
178 /**
179 * VALA verbatim string, ends with three quotes (""")
180 *
181 * @param pc The structure to update, str is an input.
182 */
183 static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc);
184
185
186 static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len);
187
188
189 /**
190 * Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)"
191 * Newlines may be in the string.
192 *
193 * @param pc structure to update, str is an input.
194 */
195 static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, size_t q_idx);
196
197
198 /**
199 * Count the number of whitespace characters.
200 *
201 * @param pc The structure to update, str is an input.
202 *
203 * @return Whether whitespace was parsed
204 */
205 static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc);
206
207
208 /**
209 * Called when we hit a backslash.
210 * If there is nothing but whitespace until the newline, then this is a
211 * backslash newline
212 *
213 * @param pc structure to update, str is an input
214 */
215 static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc);
216
217
218 /**
219 * Parses any number of tab or space chars followed by a newline.
220 * Does not change pc.len if a newline isn't found.
221 * This is not the same as parse_whitespace() because it only consumes until
222 * a single newline is encountered.
223 */
224 static bool parse_newline(tok_ctx &ctx);
225
226
227 /**
228 * PAWN #define is different than C/C++.
229 * #define PATTERN REPLACEMENT_TEXT
230 * The PATTERN may not contain a space or '[' or ']'.
231 * A generic whitespace check should be good enough.
232 * Do not change the pattern.
233 *
234 * @param pc structure to update, str is an input
235 */
236 static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt);
237
238
239 static bool parse_ignored(tok_ctx &ctx, chunk_t &pc);
240
241
242 /**
243 * Skips the next bit of whatever and returns the type of block.
244 *
245 * pc.str is the input text.
 * pc.len is the output length.
247 * pc.type is the output type
248 * pc.column is output column
249 *
250 * @param pc The structure to update, str is an input.
251 * @param prev_pc The previous structure
252 *
253 * @return true/false - whether anything was parsed
254 */
255 static bool parse_next(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc);
256
257
258 /**
259 * Parses all legal D string constants.
260 *
261 * Quoted strings:
262 * r"Wysiwyg" # WYSIWYG string
263 * x"hexstring" # Hexadecimal array
264 * `Wysiwyg` # WYSIWYG string
265 * 'char' # single character
266 * "reg_string" # regular string
267 *
268 * Non-quoted strings:
269 * \x12 # 1-byte hex constant
270 * \u1234 # 2-byte hex constant
271 * \U12345678 # 4-byte hex constant
272 * \123 # octal constant
273 * \& # named entity
274 * \n # single character
275 *
276 * @param pc The structure to update, str is an input.
277 *
278 * @return Whether a string was parsed
279 */
280 static bool d_parse_string(tok_ctx &ctx, chunk_t &pc);
281
282
283 /**
 * Figure out the length of the comment at text.
285 * The next bit of text starts with a '/', so it might be a comment.
286 * There are three types of comments:
287 * - C comments that start with '/ *' and end with '* /'
288 * - C++ comments that start with //
289 * - D nestable comments '/+' '+/'
290 *
291 * @param pc The structure to update, str is an input.
292 *
293 * @return Whether a comment was parsed
294 */
295 static bool parse_comment(tok_ctx &ctx, chunk_t &pc);
296
297
298 /**
 * Figure out the length of the code placeholder at text, if present.
300 * This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>.
301 *
302 * @param pc The structure to update, str is an input.
303 *
304 * @return Whether a placeholder was parsed.
305 */
306 static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc);
307
308
309 /**
310 * Parse any attached suffix, which may be a user-defined literal suffix.
311 * If for a string, explicitly exclude common format and scan specifiers, ie,
312 * PRIx32 and SCNx64.
313 */
314 static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring);
315
316
//! check if a symbol is a binary digit (the '_' variant also accepts the separators '_' and ')
318 static bool is_bin(int ch);
319 static bool is_bin_(int ch);
320
321
//! check if a symbol holds an octal value
323 static bool is_oct(int ch);
324 static bool is_oct_(int ch);
325
326
//! check if a symbol holds a decimal value
328 static bool is_dec(int ch);
329 static bool is_dec_(int ch);
330
331
332 //! check if a symbol holds a hexadecimal value
333 static bool is_hex(int ch);
334 static bool is_hex_(int ch);
335
336
337 /**
338 * Count the number of characters in the number.
339 * The next bit of text starts with a number (0-9 or '.'), so it is a number.
340 * Count the number of characters in the number.
341 *
342 * This should cover all number formats for all languages.
343 * Note that this is not a strict parser. It will happily parse numbers in
344 * an invalid format.
345 *
346 * For example, only D allows underscores in the numbers, but they are
347 * allowed in all formats.
348 *
349 * @param[in,out] pc The structure to update, str is an input.
350 *
351 * @return Whether a number was parsed
352 */
353 static bool parse_number(tok_ctx &ctx, chunk_t &pc);
354
355
d_parse_string(tok_ctx & ctx,chunk_t & pc)356 static bool d_parse_string(tok_ctx &ctx, chunk_t &pc)
357 {
358 size_t ch = ctx.peek();
359
360 if ( ch == '"'
361 || ch == '\'')
362 {
363 return(parse_string(ctx, pc, 0, true));
364 }
365
366 if (ch == '`')
367 {
368 return(parse_string(ctx, pc, 0, false));
369 }
370
371 if ( ( ch == 'r'
372 || ch == 'x')
373 && ctx.peek(1) == '"')
374 {
375 return(parse_string(ctx, pc, 1, false));
376 }
377
378 if (ch != '\\')
379 {
380 return(false);
381 }
382 ctx.save();
383 int cnt;
384
385 pc.str.clear();
386
387 while (ctx.peek() == '\\')
388 {
389 pc.str.append(ctx.get());
390
391 // Check for end of file
392 switch (ctx.peek())
393 {
394 case 'x': // \x HexDigit HexDigit
395 cnt = 3;
396
397 while (cnt--)
398 {
399 pc.str.append(ctx.get());
400 }
401 break;
402
403 case 'u': // \u HexDigit (x4)
404 cnt = 5;
405
406 while (cnt--)
407 {
408 pc.str.append(ctx.get());
409 }
410 break;
411
412 case 'U': // \U HexDigit (x8)
413 cnt = 9;
414
415 while (cnt--)
416 {
417 pc.str.append(ctx.get());
418 }
419 break;
420
421 case '0':
422 case '1':
423 case '2':
424 case '3':
425 case '4':
426 case '5':
427 case '6':
428 case '7':
429 // handle up to 3 octal digits
430 pc.str.append(ctx.get());
431 ch = ctx.peek();
432
433 if ( (ch >= '0')
434 && (ch <= '7'))
435 {
436 pc.str.append(ctx.get());
437 ch = ctx.peek();
438
439 if ( (ch >= '0')
440 && (ch <= '7'))
441 {
442 pc.str.append(ctx.get());
443 }
444 }
445 break;
446
447 case '&':
448 // \& NamedCharacterEntity ;
449 pc.str.append(ctx.get());
450
451 while (unc_isalpha(ctx.peek()))
452 {
453 pc.str.append(ctx.get());
454 }
455
456 if (ctx.peek() == ';')
457 {
458 pc.str.append(ctx.get());
459 }
460 break;
461
462 default:
463 // Everything else is a single character
464 pc.str.append(ctx.get());
465 break;
466 } // switch
467 }
468
469 if (pc.str.size() < 1)
470 {
471 ctx.restore();
472 return(false);
473 }
474 set_chunk_type(&pc, CT_STRING);
475 return(true);
476 } // d_parse_string
477
478
#if 0


//! A string-in-string search. Like strstr() with a haystack length.
//! NOTE: this function is compiled out by the enclosing '#if 0'.
//! Returns a pointer to the first position in haystack where needle matches,
//! looking only at the first haystack_len bytes, or NULL when there is no match.
static const char *str_search(const char *needle, const char *haystack, int haystack_len)
{
   int needle_len = strlen(needle);

   // stop as soon as fewer than needle_len bytes are left to look at
   while (haystack_len-- >= needle_len)
   {
      if (memcmp(needle, haystack, needle_len) == 0)
      {
         return(haystack);
      }
      haystack++;
   }
   return(NULL);
}
#endif
498
499
parse_comment(tok_ctx & ctx,chunk_t & pc)500 static bool parse_comment(tok_ctx &ctx, chunk_t &pc)
501 {
502 bool is_d = language_is_set(LANG_D);
503 bool is_cs = language_is_set(LANG_CS);
504 size_t d_level = 0;
505
506 // does this start with '/ /' or '/ *' or '/ +' (d)
507 if ( (ctx.peek() != '/')
508 || ( (ctx.peek(1) != '*')
509 && (ctx.peek(1) != '/')
510 && ( (ctx.peek(1) != '+')
511 || !is_d)))
512 {
513 return(false);
514 }
515 ctx.save();
516
517 // account for opening two chars
518 pc.str = ctx.get(); // opening '/'
519 size_t ch = ctx.get();
520
521 pc.str.append(ch); // second char
522
523 if (ch == '/')
524 {
525 set_chunk_type(&pc, CT_COMMENT_CPP);
526
527 while (true)
528 {
529 int bs_cnt = 0;
530
531 while (ctx.more())
532 {
533 ch = ctx.peek();
534
535 if ( (ch == '\r')
536 || (ch == '\n'))
537 {
538 break;
539 }
540
541 if ( (ch == '\\')
542 && !is_cs) // backslashes aren't special in comments in C#
543 {
544 bs_cnt++;
545 }
546 else
547 {
548 bs_cnt = 0;
549 }
550 pc.str.append(ctx.get());
551 }
552
553 /*
554 * If we hit an odd number of backslashes right before the newline,
555 * then we keep going.
556 */
557 if ( ((bs_cnt & 1) == 0)
558 || !ctx.more())
559 {
560 break;
561 }
562
563 if (ctx.peek() == '\r')
564 {
565 pc.str.append(ctx.get());
566 }
567
568 if (ctx.peek() == '\n')
569 {
570 pc.str.append(ctx.get());
571 }
572 pc.nl_count++;
573 cpd.did_newline = true;
574 }
575 }
576 else if (!ctx.more())
577 {
578 // unexpected end of file
579 ctx.restore();
580 return(false);
581 }
582 else if (ch == '+')
583 {
584 set_chunk_type(&pc, CT_COMMENT);
585 d_level++;
586
587 while ( d_level > 0
588 && ctx.more())
589 {
590 if ( (ctx.peek() == '+')
591 && (ctx.peek(1) == '/'))
592 {
593 pc.str.append(ctx.get()); // store the '+'
594 pc.str.append(ctx.get()); // store the '/'
595 d_level--;
596 continue;
597 }
598
599 if ( (ctx.peek() == '/')
600 && (ctx.peek(1) == '+'))
601 {
602 pc.str.append(ctx.get()); // store the '/'
603 pc.str.append(ctx.get()); // store the '+'
604 d_level++;
605 continue;
606 }
607 ch = ctx.get();
608 pc.str.append(ch);
609
610 if ( (ch == '\n')
611 || (ch == '\r'))
612 {
613 set_chunk_type(&pc, CT_COMMENT_MULTI);
614 pc.nl_count++;
615
616 if (ch == '\r')
617 {
618 if (ctx.peek() == '\n')
619 {
620 ++LE_COUNT(CRLF);
621 pc.str.append(ctx.get()); // store the '\n'
622 }
623 else
624 {
625 ++LE_COUNT(CR);
626 }
627 }
628 else
629 {
630 ++LE_COUNT(LF);
631 }
632 }
633 }
634 }
635 else // must be '/ *'
636 {
637 set_chunk_type(&pc, CT_COMMENT);
638
639 while (ctx.more())
640 {
641 if ( (ctx.peek() == '*')
642 && (ctx.peek(1) == '/'))
643 {
644 pc.str.append(ctx.get()); // store the '*'
645 pc.str.append(ctx.get()); // store the '/'
646
647 tok_info ss;
648 ctx.save(ss);
649 size_t oldsize = pc.str.size();
650
651 // If there is another C comment right after this one, combine them
652 while ( (ctx.peek() == ' ')
653 || (ctx.peek() == '\t'))
654 {
655 pc.str.append(ctx.get());
656 }
657
658 if ( (ctx.peek() != '/')
659 || (ctx.peek(1) != '*'))
660 {
661 // undo the attempt to join
662 ctx.restore(ss);
663 pc.str.resize(oldsize);
664 break;
665 }
666 }
667 ch = ctx.get();
668 pc.str.append(ch);
669
670 if ( (ch == '\n')
671 || (ch == '\r'))
672 {
673 set_chunk_type(&pc, CT_COMMENT_MULTI);
674 pc.nl_count++;
675
676 if (ch == '\r')
677 {
678 if (ctx.peek() == '\n')
679 {
680 ++LE_COUNT(CRLF);
681 pc.str.append(ctx.get()); // store the '\n'
682 }
683 else
684 {
685 ++LE_COUNT(CR);
686 }
687 }
688 else
689 {
690 ++LE_COUNT(LF);
691 }
692 }
693 }
694 }
695
696 if (cpd.unc_off)
697 {
698 bool found_enable_marker = (find_enable_processing_comment_marker(pc.str) >= 0);
699
700 if (found_enable_marker)
701 {
702 const auto &ontext = options::enable_processing_cmt();
703
704 LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n",
705 __func__, __LINE__, ontext.c_str(), pc.orig_line);
706 cpd.unc_off = false;
707 }
708 }
709 else
710 {
711 auto position_disable_processing_cmt = find_disable_processing_comment_marker(pc.str);
712 bool found_disable_marker = (position_disable_processing_cmt >= 0);
713
714 if (found_disable_marker)
715 {
716 /**
717 * the user may wish to disable processing part of a multiline comment,
718 * in which case we'll handle at a late time. Check to see if processing
719 * is re-enabled elsewhere in this comment
720 */
721 auto position_enable_processing_cmt = find_enable_processing_comment_marker(pc.str);
722
723 if (position_enable_processing_cmt < position_disable_processing_cmt)
724 {
725 const auto &offtext = options::disable_processing_cmt();
726
727 LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n",
728 __func__, __LINE__, offtext.c_str(), pc.orig_line);
729 cpd.unc_off = true;
730 // Issue #842
731 cpd.unc_off_used = true;
732 }
733 }
734 }
735 return(true);
736 } // parse_comment
737
738
parse_code_placeholder(tok_ctx & ctx,chunk_t & pc)739 static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc)
740 {
741 if ( (ctx.peek() != '<')
742 || (ctx.peek(1) != '#'))
743 {
744 return(false);
745 }
746 ctx.save();
747
748 // account for opening two chars '<#'
749 pc.str = ctx.get();
750 pc.str.append(ctx.get());
751
752 // grab everything until '#>', fail if not found.
753 size_t last1 = 0;
754
755 while (ctx.more())
756 {
757 size_t last2 = last1;
758 last1 = ctx.get();
759 pc.str.append(last1);
760
761 if ( (last2 == '#')
762 && (last1 == '>'))
763 {
764 set_chunk_type(&pc, CT_WORD);
765 return(true);
766 }
767 }
768 ctx.restore();
769 return(false);
770 }
771
772
parse_suffix(tok_ctx & ctx,chunk_t & pc,bool forstring=false)773 static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring = false)
774 {
775 if (CharTable::IsKw1(ctx.peek()))
776 {
777 size_t slen = 0;
778 size_t oldsize = pc.str.size();
779
780 // don't add the suffix if we see L" or L' or S"
781 size_t p1 = ctx.peek();
782 size_t p2 = ctx.peek(1);
783
784 if ( forstring
785 && ( ( (p1 == 'L')
786 && ( (p2 == '"')
787 || (p2 == '\'')))
788 || ( (p1 == 'S')
789 && (p2 == '"'))))
790 {
791 return;
792 }
793 tok_info ss;
794 ctx.save(ss);
795
796 while ( ctx.more()
797 && CharTable::IsKw2(ctx.peek()))
798 {
799 slen++;
800 pc.str.append(ctx.get());
801 }
802
803 if ( forstring
804 && slen >= 4
805 && ( pc.str.startswith("PRI", oldsize)
806 || pc.str.startswith("SCN", oldsize)))
807 {
808 ctx.restore(ss);
809 pc.str.resize(oldsize);
810 }
811 }
812 }
813
814
//! true for the binary digits '0' and '1'
static bool is_bin(int ch)
{
   switch (ch)
   {
   case '0':
   case '1':
      return(true);

   default:
      return(false);
   }
}


//! true for a binary digit or one of the separators '_' and '\''
static bool is_bin_(int ch)
{
   if (  (ch == '_')
      || (ch == '\''))
   {
      return(true);
   }
   return(is_bin(ch));
}
828
829
//! true for the octal digits '0' through '7'
static bool is_oct(int ch)
{
   if (ch < '0')
   {
      return(false);
   }
   return(ch <= '7');
}


//! true for an octal digit or one of the separators '_' and '\''
static bool is_oct_(int ch)
{
   if (  (ch == '_')
      || (ch == '\''))
   {
      return(true);
   }
   return(is_oct(ch));
}
843
844
//! true for the decimal digits '0' through '9'
static bool is_dec(int ch)
{
   if (ch > '9')
   {
      return(false);
   }
   return(ch >= '0');
}


//! true for a decimal digit or a number separator (JAVA: "_", C++14: "'")
static bool is_dec_(int ch)
{
   switch (ch)
   {
   case '_':
   case '\'':
      return(true);

   default:
      return(is_dec(ch));
   }
}
859
860
//! true for the hexadecimal digits 0-9, a-f and A-F
static bool is_hex(int ch)
{
   if (  (ch >= '0')
      && (ch <= '9'))
   {
      return(true);
   }

   if (  (ch >= 'a')
      && (ch <= 'f'))
   {
      return(true);
   }
   return(  (ch >= 'A')
         && (ch <= 'F'));
}


//! true for a hexadecimal digit or one of the separators '_' and '\''
static bool is_hex_(int ch)
{
   if (  (ch == '_')
      || (ch == '\''))
   {
      return(true);
   }
   return(is_hex(ch));
}
878
879
parse_number(tok_ctx & ctx,chunk_t & pc)880 static bool parse_number(tok_ctx &ctx, chunk_t &pc)
881 {
882 /*
883 * A number must start with a digit or a dot, followed by a digit
884 * (signs handled elsewhere)
885 */
886 if ( !is_dec(ctx.peek())
887 && ( (ctx.peek() != '.')
888 || !is_dec(ctx.peek(1))))
889 {
890 return(false);
891 }
892 bool is_float = (ctx.peek() == '.');
893
894 if ( is_float
895 && (ctx.peek(1) == '.')) // make sure it isn't '..'
896 {
897 return(false);
898 }
899 /*
900 * Check for Hex, Octal, or Binary
901 * Note that only D, C++14 and Pawn support binary
902 * Fixes the issue # 1591
903 * In c# the numbers starting with 0 are not treated as octal numbers.
904 */
905 bool did_hex = false;
906
907 if ( ctx.peek() == '0'
908 && !language_is_set(LANG_CS))
909 {
910 size_t ch;
911 chunk_t pc_temp;
912
913 pc.str.append(ctx.get()); // store the '0'
914 pc_temp.str.append('0');
915
916 // MS constant might have an "h" at the end. Look for it
917 ctx.save();
918
919 while ( ctx.more()
920 && CharTable::IsKw2(ctx.peek()))
921 {
922 ch = ctx.get();
923 pc_temp.str.append(ch);
924 }
925 ch = pc_temp.str[pc_temp.len() - 1];
926 ctx.restore();
927 LOG_FMT(LGUY, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.text());
928
929 if (ch == 'h') // TODO can we combine this in analyze_character
930 {
931 // we have an MS hexadecimal number with "h" at the end
932 LOG_FMT(LGUY, "%s(%d): MS hexadecimal number\n", __func__, __LINE__);
933 did_hex = true;
934
935 do
936 {
937 pc.str.append(ctx.get()); // store the rest
938 } while (is_hex_(ctx.peek()));
939
940 pc.str.append(ctx.get()); // store the h
941 LOG_FMT(LGUY, "%s(%d): pc:%s\n", __func__, __LINE__, pc.text());
942 }
943 else
944 {
945 switch (unc_toupper(ctx.peek()))
946 {
947 case 'X': // hex
948 did_hex = true;
949
950 do
951 {
952 pc.str.append(ctx.get()); // store the 'x' and then the rest
953 } while (is_hex_(ctx.peek()));
954
955 break;
956
957 case 'B': // binary
958
959 do
960 {
961 pc.str.append(ctx.get()); // store the 'b' and then the rest
962 } while (is_bin_(ctx.peek()));
963
964 break;
965
966 case '0': // octal or decimal
967 case '1':
968 case '2':
969 case '3':
970 case '4':
971 case '5':
972 case '6':
973 case '7':
974 case '8':
975 case '9':
976
977 do
978 {
979 pc.str.append(ctx.get());
980 } while (is_oct_(ctx.peek()));
981
982 break;
983
984 default:
985 // either just 0 or 0.1 or 0UL, etc
986 break;
987 } // switch
988 }
989 }
990 else
991 {
992 // Regular int or float
993 while (is_dec_(ctx.peek()))
994 {
995 pc.str.append(ctx.get());
996 }
997 }
998
999 // Check if we stopped on a decimal point & make sure it isn't '..'
1000 if ( (ctx.peek() == '.')
1001 && (ctx.peek(1) != '.'))
1002 {
1003 // Issue #1265, 5.clamp()
1004 tok_info ss;
1005 ctx.save(ss);
1006
1007 while ( ctx.more()
1008 && CharTable::IsKw2(ctx.peek(1)))
1009 {
1010 // skip characters to check for paren open
1011 ctx.get();
1012 }
1013
1014 if (ctx.peek(1) == '(')
1015 {
1016 ctx.restore(ss);
1017 set_chunk_type(&pc, CT_NUMBER);
1018 return(true);
1019 }
1020 else
1021 {
1022 ctx.restore(ss);
1023 }
1024 pc.str.append(ctx.get());
1025 is_float = true;
1026
1027 if (did_hex)
1028 {
1029 while (is_hex_(ctx.peek()))
1030 {
1031 pc.str.append(ctx.get());
1032 }
1033 }
1034 else
1035 {
1036 while (is_dec_(ctx.peek()))
1037 {
1038 pc.str.append(ctx.get());
1039 }
1040 }
1041 }
1042 /*
1043 * Check exponent
1044 * Valid exponents per language (not that it matters):
1045 * C/C++/D/Java: eEpP
1046 * C#/Pawn: eE
1047 */
1048 size_t tmp = unc_toupper(ctx.peek());
1049
1050 if ( (tmp == 'E')
1051 || (tmp == 'P'))
1052 {
1053 is_float = true;
1054 pc.str.append(ctx.get());
1055
1056 if ( (ctx.peek() == '+')
1057 || (ctx.peek() == '-'))
1058 {
1059 pc.str.append(ctx.get());
1060 }
1061
1062 while (is_dec_(ctx.peek()))
1063 {
1064 pc.str.append(ctx.get());
1065 }
1066 }
1067
1068 /*
1069 * Check the suffixes
1070 * Valid suffixes per language (not that it matters):
1071 * Integer Float
1072 * C/C++: uUlL64 lLfF
1073 * C#: uUlL fFdDMm
1074 * D: uUL ifFL
1075 * Java: lL fFdD
1076 * Pawn: (none) (none)
1077 *
1078 * Note that i, f, d, and m only appear in floats.
1079 */
1080 while (1)
1081 {
1082 size_t tmp2 = unc_toupper(ctx.peek());
1083
1084 if ( (tmp2 == 'I')
1085 || (tmp2 == 'F')
1086 || (tmp2 == 'D')
1087 || (tmp2 == 'M'))
1088 {
1089 is_float = true;
1090 }
1091 else if ( (tmp2 != 'L')
1092 && (tmp2 != 'U'))
1093 {
1094 break;
1095 }
1096 pc.str.append(ctx.get());
1097 }
1098
1099 // skip the Microsoft-specific '32' and '64' suffix
1100 if ( ( (ctx.peek() == '3')
1101 && (ctx.peek(1) == '2'))
1102 || ( (ctx.peek() == '6')
1103 && (ctx.peek(1) == '4')))
1104 {
1105 pc.str.append(ctx.get());
1106 pc.str.append(ctx.get());
1107 }
1108 set_chunk_type(&pc, is_float ? CT_NUMBER_FP : CT_NUMBER);
1109
1110 /*
1111 * If there is anything left, then we are probably dealing with garbage or
1112 * some sick macro junk. Eat it.
1113 */
1114 parse_suffix(ctx, pc);
1115
1116 return(true);
1117 } // parse_number
1118
1119
parse_string(tok_ctx & ctx,chunk_t & pc,size_t quote_idx,bool allow_escape)1120 static bool parse_string(tok_ctx &ctx, chunk_t &pc, size_t quote_idx, bool allow_escape)
1121 {
1122 log_rule_B("string_escape_char");
1123 const size_t escape_char = options::string_escape_char();
1124
1125 log_rule_B("string_escape_char2");
1126 const size_t escape_char2 = options::string_escape_char2();
1127
1128 log_rule_B("string_replace_tab_chars");
1129 const bool should_escape_tabs = ( allow_escape
1130 && options::string_replace_tab_chars()
1131 && language_is_set(LANG_ALLC));
1132
1133 pc.str.clear();
1134
1135 while (quote_idx-- > 0)
1136 {
1137 pc.str.append(ctx.get());
1138 }
1139 set_chunk_type(&pc, CT_STRING);
1140 const size_t termination_character = CharTable::Get(ctx.peek()) & 0xff;
1141
1142 pc.str.append(ctx.get()); // store the "
1143
1144 bool escaped = false;
1145
1146 while (ctx.more())
1147 {
1148 const size_t ch = ctx.get();
1149
1150 // convert char 9 (\t) to chars \t
1151 if ( (ch == '\t')
1152 && should_escape_tabs)
1153 {
1154 const size_t lastcol = ctx.c.col - 1;
1155 ctx.c.col = lastcol + 2;
1156 pc.str.append(escape_char);
1157 pc.str.append('t');
1158 continue;
1159 }
1160 pc.str.append(ch);
1161
1162 if (ch == '\n')
1163 {
1164 pc.nl_count++;
1165 set_chunk_type(&pc, CT_STRING_MULTI);
1166 }
1167 else if ( ch == '\r'
1168 && ctx.peek() != '\n')
1169 {
1170 pc.str.append(ctx.get());
1171 pc.nl_count++;
1172 set_chunk_type(&pc, CT_STRING_MULTI);
1173 }
1174
1175 // if last char in prev loop was escaped the one in the current loop isn't
1176 if (escaped)
1177 {
1178 escaped = false;
1179 continue;
1180 }
1181
1182 // see if the current char is a escape char
1183 if (allow_escape)
1184 {
1185 if (ch == escape_char)
1186 {
1187 escaped = (escape_char != 0);
1188 continue;
1189 }
1190
1191 if ( ch == escape_char2
1192 && (ctx.peek() == termination_character))
1193 {
1194 escaped = allow_escape;
1195 continue;
1196 }
1197 }
1198
1199 if (ch == termination_character)
1200 {
1201 break;
1202 }
1203 }
1204 parse_suffix(ctx, pc, true);
1205 return(true);
1206 } // parse_string
1207
//! Bit flags describing the kind of C# string literal that was found
enum cs_string_t
{
   CS_STRING_NONE         = 0x00,
   CS_STRING_STRING       = 0x01, // is any kind of string
   CS_STRING_VERBATIM     = 0x02, // @"" style string
   CS_STRING_INTERPOLATED = 0x04, // $"" or $@"" style string
};
1215
operator |=(cs_string_t & value,cs_string_t other)1216 static cs_string_t operator|=(cs_string_t &value, cs_string_t other)
1217 {
1218 return(value = static_cast<cs_string_t>(value | other));
1219 }
1220
1221
parse_cs_string_start(tok_ctx & ctx,chunk_t & pc)1222 static cs_string_t parse_cs_string_start(tok_ctx &ctx, chunk_t &pc)
1223 {
1224 cs_string_t stringType = CS_STRING_NONE;
1225 int offset = 0;
1226
1227 if (ctx.peek(offset) == '$')
1228 {
1229 stringType |= CS_STRING_INTERPOLATED;
1230 ++offset;
1231 }
1232
1233 if (ctx.peek(offset) == '@')
1234 {
1235 stringType |= CS_STRING_VERBATIM;
1236 ++offset;
1237 }
1238
1239 if (ctx.peek(offset) == '"')
1240 {
1241 stringType |= CS_STRING_STRING;
1242
1243 set_chunk_type(&pc, CT_STRING);
1244
1245 for (int i = 0; i <= offset; ++i)
1246 {
1247 pc.str.append(ctx.get());
1248 }
1249 }
1250 else
1251 {
1252 stringType = CS_STRING_NONE;
1253 }
1254 return(stringType);
1255 } // parse_cs_string_start
1256
1257
1258 struct CsStringParseState
1259 {
1260 cs_string_t type;
1261 int braceDepth;
1262
1263
CsStringParseStateCsStringParseState1264 CsStringParseState(cs_string_t stringType)
1265 {
1266 type = stringType;
1267 braceDepth = 0;
1268 }
1269 };
1270
1271
1272 /**
1273 * C# strings are complex enough (mostly due to interpolation and nesting) that they need a custom parser.
1274 */
parse_cs_string(tok_ctx & ctx,chunk_t & pc)1275 static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc)
1276 {
1277 cs_string_t stringType = parse_cs_string_start(ctx, pc);
1278
1279 if (stringType == 0)
1280 {
1281 return(false);
1282 }
1283 // an interpolated string can contain {expressions}, which can contain $"strings", which in turn
1284 // can contain {expressions}, so we must track both as they are interleaved, in order to properly
1285 // parse the outermost string.
1286
1287 std::stack<CsStringParseState> parseState; // each entry is a nested string
1288
1289 parseState.push(CsStringParseState(stringType));
1290
1291 log_rule_B("string_replace_tab_chars");
1292 bool should_escape_tabs = options::string_replace_tab_chars();
1293
1294 while (ctx.more())
1295 {
1296 if (parseState.top().braceDepth > 0)
1297 {
1298 // all we can do when in an expr is look for expr close with }, or a new string opening. must do this first
1299 // so we can peek and potentially consume chars for new string openings, before the ch=get() happens later,
1300 // which is needed for newline processing.
1301
1302 if (ctx.peek() == '}')
1303 {
1304 pc.str.append(ctx.get());
1305
1306 if (ctx.peek() == '}')
1307 {
1308 pc.str.append(ctx.get()); // in interpolated string, `}}` is escape'd `}`
1309 }
1310 else
1311 {
1312 --parseState.top().braceDepth;
1313 }
1314 continue;
1315 }
1316 stringType = parse_cs_string_start(ctx, pc);
1317
1318 if (stringType)
1319 {
1320 parseState.push(CsStringParseState(stringType));
1321 continue;
1322 }
1323 }
1324 int lastcol = ctx.c.col;
1325 int ch = ctx.get();
1326
1327 pc.str.append(ch);
1328
1329 if (ch == '\n')
1330 {
1331 set_chunk_type(&pc, CT_STRING_MULTI);
1332 pc.nl_count++;
1333 }
1334 else if (ch == '\r')
1335 {
1336 set_chunk_type(&pc, CT_STRING_MULTI);
1337 }
1338 else if (parseState.top().braceDepth > 0)
1339 {
1340 // do nothing. if we're in a brace, we only want the newline handling, and skip the rest.
1341 }
1342 else if ( (ch == '\t')
1343 && should_escape_tabs)
1344 {
1345 if (parseState.top().type & CS_STRING_VERBATIM)
1346 {
1347 if (!cpd.warned_unable_string_replace_tab_chars)
1348 {
1349 cpd.warned_unable_string_replace_tab_chars = true;
1350
1351 log_rule_B("warn_level_tabs_found_in_verbatim_string_literals");
1352 log_sev_t warnlevel = (log_sev_t)options::warn_level_tabs_found_in_verbatim_string_literals();
1353
1354 /*
1355 * a tab char can't be replaced with \\t because escapes don't
1356 * work in here-strings. best we can do is warn.
1357 */
1358 LOG_FMT(warnlevel, "%s(%d): %s: orig_line is %zu, orig_col is %zu, Detected non-replaceable tab char in literal string\n",
1359 __func__, __LINE__, cpd.filename.c_str(), pc.orig_line, pc.orig_col);
1360 LOG_FMT(warnlevel, "%s(%d): Warning is given if doing tab-to-\\t replacement and we have found one in a C# verbatim string literal.\n",
1361 __func__, __LINE__);
1362
1363 if (warnlevel < LWARN)
1364 {
1365 cpd.error_count++;
1366 }
1367 }
1368 }
1369 else
1370 {
1371 ctx.c.col = lastcol + 2;
1372 pc.str.pop_back(); // remove \t
1373 pc.str.append("\\t");
1374
1375 continue;
1376 }
1377 }
1378 else if ( ch == '\\'
1379 && !(parseState.top().type & CS_STRING_VERBATIM))
1380 {
1381 // catch escaped quote in order to avoid ending string (but also must handle \\ to avoid accidental 'escape' seq of `\\"`)
1382 if ( ctx.peek() == '"'
1383 || ctx.peek() == '\\')
1384 {
1385 pc.str.append(ctx.get());
1386 }
1387 }
1388 else if (ch == '"')
1389 {
1390 if ( (parseState.top().type & CS_STRING_VERBATIM)
1391 && (ctx.peek() == '"'))
1392 {
1393 // in verbatim string, `""` is escape'd `"`
1394 pc.str.append(ctx.get());
1395 }
1396 else
1397 {
1398 // end of string
1399 parseState.pop();
1400
1401 if (parseState.empty())
1402 {
1403 break;
1404 }
1405 }
1406 }
1407 else if (parseState.top().type & CS_STRING_INTERPOLATED)
1408 {
1409 if (ch == '{')
1410 {
1411 if (ctx.peek() == '{')
1412 {
1413 pc.str.append(ctx.get()); // in interpolated string, `{{` is escape'd `{`
1414 }
1415 else
1416 {
1417 ++parseState.top().braceDepth;
1418 }
1419 }
1420 }
1421 }
1422 return(true);
1423 } // parse_cs_string
1424
1425
parse_verbatim_string(tok_ctx & ctx,chunk_t & pc)1426 static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc)
1427 {
1428 set_chunk_type(&pc, CT_STRING);
1429
1430 // consume the initial """
1431 pc.str = ctx.get();
1432 pc.str.append(ctx.get());
1433 pc.str.append(ctx.get());
1434
1435 // go until we hit a zero (end of file) or a """
1436 while (ctx.more())
1437 {
1438 size_t ch = ctx.get();
1439 pc.str.append(ch);
1440
1441 if ( (ch == '"')
1442 && (ctx.peek() == '"')
1443 && (ctx.peek(1) == '"'))
1444 {
1445 pc.str.append(ctx.get());
1446 pc.str.append(ctx.get());
1447 break;
1448 }
1449
1450 if ( (ch == '\n')
1451 || (ch == '\r'))
1452 {
1453 set_chunk_type(&pc, CT_STRING_MULTI);
1454 pc.nl_count++;
1455 }
1456 }
1457 }
1458
1459
/**
 * Compares two runs of 'len' characters inside d.
 *
 * @param d      the character data
 * @param a_idx  index of the first character of the first run
 * @param b_idx  index of the first character of the second run
 * @param len    number of characters to compare
 *
 * @return true if both runs hold the same characters (always the case when
 *         the runs start at the same index or len is 0); false if they differ
 *         or if either run would reach past the end of d
 */
static bool tag_compare(const std::deque<int> &d, size_t a_idx, size_t b_idx, size_t len)
{
   if (  (a_idx == b_idx)
      || (len == 0))
   {
      return(true);
   }
   const size_t size = d.size();

   // a run that does not fit into the data cannot match
   if (  (len > size)
      || (a_idx > size - len)
      || (b_idx > size - len))
   {
      return(false);
   }

   // walk both runs in step; the offset has to advance so that every
   // character pair is looked at, not just the first one
   for (size_t offset = 0; offset < len; ++offset)
   {
      if (d[a_idx + offset] != d[b_idx + offset])
      {
         return(false);
      }
   }

   return(true);
}
1474
1475
parse_cr_string(tok_ctx & ctx,chunk_t & pc,size_t q_idx)1476 static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, size_t q_idx)
1477 {
1478 size_t tag_idx = ctx.c.idx + q_idx + 1;
1479 size_t tag_len = 0;
1480
1481 ctx.save();
1482
1483 // Copy the prefix + " to the string
1484 pc.str.clear();
1485 int cnt = q_idx + 1;
1486
1487 while (cnt--)
1488 {
1489 pc.str.append(ctx.get());
1490 }
1491
1492 // Add the tag and get the length of the tag
1493 while ( ctx.more()
1494 && (ctx.peek() != '('))
1495 {
1496 tag_len++;
1497 pc.str.append(ctx.get());
1498 }
1499
1500 if (ctx.peek() != '(')
1501 {
1502 ctx.restore();
1503 return(false);
1504 }
1505 set_chunk_type(&pc, CT_STRING);
1506
1507 while (ctx.more())
1508 {
1509 if ( (ctx.peek() == ')')
1510 && (ctx.peek(tag_len + 1) == '"')
1511 && tag_compare(ctx.data, tag_idx, ctx.c.idx + 1, tag_len))
1512 {
1513 cnt = tag_len + 2; // for the )"
1514
1515 while (cnt--)
1516 {
1517 pc.str.append(ctx.get());
1518 }
1519 parse_suffix(ctx, pc);
1520 return(true);
1521 }
1522
1523 if (ctx.peek() == '\n')
1524 {
1525 pc.str.append(ctx.get());
1526 pc.nl_count++;
1527 set_chunk_type(&pc, CT_STRING_MULTI);
1528 }
1529 else
1530 {
1531 pc.str.append(ctx.get());
1532 }
1533 }
1534 ctx.restore();
1535 return(false);
1536 } // parse_cr_string
1537
1538
1539 /**
1540 * Count the number of characters in a word.
1541 * The first character is already valid for a keyword
1542 *
1543 * @param pc The structure to update, str is an input.
1544 * @return Whether a word was parsed (always true)
1545 */
parse_word(tok_ctx & ctx,chunk_t & pc,bool skipcheck)1546 static bool parse_word(tok_ctx &ctx, chunk_t &pc, bool skipcheck)
1547 {
1548 static unc_text intr_txt("@interface");
1549
1550 // The first character is already valid
1551 pc.str.clear();
1552 pc.str.append(ctx.get());
1553
1554 while (ctx.more())
1555 {
1556 size_t ch = ctx.peek();
1557
1558 if (CharTable::IsKw2(ch))
1559 {
1560 pc.str.append(ctx.get());
1561 }
1562 else if ( (ch == '\\')
1563 && (unc_tolower(ctx.peek(1)) == 'u'))
1564 {
1565 pc.str.append(ctx.get());
1566 pc.str.append(ctx.get());
1567 skipcheck = true;
1568 }
1569 else
1570 {
1571 break;
1572 }
1573
1574 // HACK: Non-ASCII character are only allowed in identifiers
1575 if (ch > 0x7f)
1576 {
1577 skipcheck = true;
1578 }
1579 }
1580 set_chunk_type(&pc, CT_WORD);
1581
1582 if (skipcheck)
1583 {
1584 return(true);
1585 }
1586
1587 // Detect pre-processor functions now
1588 if ( cpd.in_preproc == CT_PP_DEFINE
1589 && cpd.preproc_ncnl_count == 1)
1590 {
1591 if (ctx.peek() == '(')
1592 {
1593 set_chunk_type(&pc, CT_MACRO_FUNC);
1594 }
1595 else
1596 {
1597 set_chunk_type(&pc, CT_MACRO);
1598
1599 log_rule_B("pp_ignore_define_body");
1600
1601 if (options::pp_ignore_define_body())
1602 {
1603 /*
1604 * We are setting the PP_IGNORE preproc state because the following
1605 * chunks are part of the macro body and will have to be ignored.
1606 */
1607 cpd.in_preproc = CT_PP_IGNORE;
1608 }
1609 }
1610 }
1611 else
1612 {
1613 // '@interface' is reserved, not an interface itself
1614 if ( language_is_set(LANG_JAVA)
1615 && pc.str.startswith("@")
1616 && !pc.str.equals(intr_txt))
1617 {
1618 set_chunk_type(&pc, CT_ANNOTATION);
1619 }
1620 else
1621 {
1622 // Turn it into a keyword now
1623 // Issue #1460 will return "COMMENT_CPP"
1624 set_chunk_type(&pc, find_keyword_type(pc.text(), pc.str.size()));
1625
1626 /* Special pattern: if we're trying to redirect a preprocessor directive to PP_IGNORE,
1627 * then ensure we're actually part of a preprocessor before doing the swap, or we'll
1628 * end up with a function named 'define' as PP_IGNORE. This is necessary because with
1629 * the config 'set' feature, there's no way to do a pair of tokens as a word
1630 * substitution. */
1631 if ( pc.type == CT_PP_IGNORE
1632 && !cpd.in_preproc)
1633 {
1634 set_chunk_type(&pc, find_keyword_type(pc.text(), pc.str.size()));
1635 }
1636 else if (pc.type == CT_COMMENT_CPP) // Issue #1460
1637 {
1638 size_t ch;
1639 bool is_cs = language_is_set(LANG_CS);
1640
1641 // read until EOL
1642 while (true)
1643 {
1644 int bs_cnt = 0;
1645
1646 while (ctx.more())
1647 {
1648 ch = ctx.peek();
1649
1650 if ( (ch == '\r')
1651 || (ch == '\n'))
1652 {
1653 break;
1654 }
1655
1656 if ( (ch == '\\')
1657 && !is_cs) // backslashes aren't special in comments in C#
1658 {
1659 bs_cnt++;
1660 }
1661 else
1662 {
1663 bs_cnt = 0;
1664 }
1665 pc.str.append(ctx.get());
1666 }
1667
1668 /*
1669 * If we hit an odd number of backslashes right before the newline,
1670 * then we keep going.
1671 */
1672 if ( ((bs_cnt & 1) == 0)
1673 || !ctx.more())
1674 {
1675 break;
1676 }
1677
1678 if (ctx.peek() == '\r')
1679 {
1680 pc.str.append(ctx.get());
1681 }
1682
1683 if (ctx.peek() == '\n')
1684 {
1685 pc.str.append(ctx.get());
1686 }
1687 pc.nl_count++;
1688 cpd.did_newline = true;
1689 }
1690 // Store off the end column
1691 pc.orig_col_end = ctx.c.col;
1692 }
1693 }
1694 }
1695 return(true);
1696 } // parse_word
1697
1698
parse_attribute_specifier_sequence(tok_ctx & ctx)1699 static size_t parse_attribute_specifier_sequence(tok_ctx &ctx)
1700 {
1701 size_t nested = 0;
1702 size_t offset = 0;
1703 size_t parens = 0;
1704 auto ch1 = ctx.peek(offset++);
1705
1706 while (ch1)
1707 {
1708 auto ch2 = ctx.peek(offset++);
1709
1710 while ( ch2 == ' '
1711 || ch2 == '\n'
1712 || ch2 == '\r'
1713 || ch2 == '\t')
1714 {
1715 ch2 = ctx.peek(offset++);
1716 }
1717
1718 if ( nested == 0
1719 && ch2 != '[')
1720 {
1721 break;
1722 }
1723
1724 if (ch1 == '(')
1725 {
1726 ++parens;
1727 ch1 = ch2;
1728 continue;
1729 }
1730
1731 if (ch1 == ')')
1732 {
1733 if (parens == 0)
1734 {
1735 break;
1736 }
1737 --parens;
1738 ch1 = ch2;
1739 continue;
1740 }
1741
1742 if ( ch1 != '['
1743 && ch1 != ']')
1744 {
1745 ch1 = ch2;
1746 continue;
1747 }
1748
1749 if (ch2 != ch1)
1750 {
1751 if (parens == 0)
1752 {
1753 break;
1754 }
1755 ch1 = ch2;
1756 continue;
1757 }
1758
1759 if (ch1 == '[')
1760 {
1761 if ( nested != 0
1762 && parens == 0)
1763 {
1764 break;
1765 }
1766 ++nested;
1767 }
1768 else if (--nested == 0)
1769 {
1770 return(offset);
1771 }
1772 ch1 = ctx.peek(offset++);
1773 }
1774 return(0);
1775 } // parse_attribute_specifier_sequence
1776
1777
extract_attribute_specifier_sequence(tok_ctx & ctx,chunk_t & pc,size_t length)1778 static bool extract_attribute_specifier_sequence(tok_ctx &ctx, chunk_t &pc, size_t length)
1779 {
1780 pc.str.clear();
1781
1782 while (length--)
1783 {
1784 pc.str.append(ctx.get());
1785 }
1786 set_chunk_type(&pc, CT_ATTRIBUTE);
1787 return(true);
1788 } // extract_attribute_specifier_sequence
1789
1790
parse_whitespace(tok_ctx & ctx,chunk_t & pc)1791 static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc)
1792 {
1793 size_t nl_count = 0;
1794 size_t ch = 0;
1795
1796 // REVISIT: use a better whitespace detector?
1797 while ( ctx.more()
1798 && unc_isspace(ctx.peek()))
1799 {
1800 ch = ctx.get(); // throw away the whitespace char
1801
1802 switch (ch)
1803 {
1804 case '\r':
1805
1806 if (ctx.expect('\n'))
1807 {
1808 // CRLF ending
1809 ++LE_COUNT(CRLF);
1810 }
1811 else
1812 {
1813 // CR ending
1814 ++LE_COUNT(CR);
1815 }
1816 nl_count++;
1817 pc.orig_prev_sp = 0;
1818 break;
1819
1820 case '\n':
1821 // LF ending
1822 ++LE_COUNT(LF);
1823 nl_count++;
1824 pc.orig_prev_sp = 0;
1825 break;
1826
1827 case '\t':
1828 log_rule_B("input_tab_size");
1829 pc.orig_prev_sp += calc_next_tab_column(cpd.column, options::input_tab_size()) - cpd.column;
1830 break;
1831
1832 case ' ':
1833 pc.orig_prev_sp++;
1834 break;
1835
1836 default:
1837 break;
1838 }
1839 }
1840
1841 if (ch != 0)
1842 {
1843 pc.str.clear();
1844 set_chunk_type(&pc, nl_count ? CT_NEWLINE : CT_WHITESPACE);
1845 pc.nl_count = nl_count;
1846 pc.after_tab = (ctx.c.last_ch == '\t');
1847 return(true);
1848 }
1849 return(false);
1850 } // parse_whitespace
1851
1852
parse_bs_newline(tok_ctx & ctx,chunk_t & pc)1853 static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc)
1854 {
1855 ctx.save();
1856 ctx.get(); // skip the '\'
1857
1858 size_t ch;
1859
1860 while ( ctx.more()
1861 && unc_isspace(ch = ctx.peek()))
1862 {
1863 ctx.get();
1864
1865 if ( (ch == '\r')
1866 || (ch == '\n'))
1867 {
1868 if (ch == '\r')
1869 {
1870 ctx.expect('\n');
1871 }
1872 set_chunk_type(&pc, CT_NL_CONT);
1873 pc.str = "\\";
1874 pc.nl_count = 1;
1875 return(true);
1876 }
1877 }
1878 ctx.restore();
1879 return(false);
1880 }
1881
1882
parse_newline(tok_ctx & ctx)1883 static bool parse_newline(tok_ctx &ctx)
1884 {
1885 ctx.save();
1886
1887 // Eat whitespace
1888 while ( (ctx.peek() == ' ')
1889 || (ctx.peek() == '\t'))
1890 {
1891 ctx.get();
1892 }
1893
1894 if ( (ctx.peek() == '\r')
1895 || (ctx.peek() == '\n'))
1896 {
1897 if (!ctx.expect('\n'))
1898 {
1899 ctx.get();
1900 ctx.expect('\n');
1901 }
1902 return(true);
1903 }
1904 ctx.restore();
1905 return(false);
1906 }
1907
1908
parse_pawn_pattern(tok_ctx & ctx,chunk_t & pc,c_token_t tt)1909 static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt)
1910 {
1911 pc.str.clear();
1912 set_chunk_type(&pc, tt);
1913
1914 while (!unc_isspace(ctx.peek()))
1915 {
1916 // end the pattern on an escaped newline
1917 if (ctx.peek() == '\\')
1918 {
1919 size_t ch = ctx.peek(1);
1920
1921 if ( (ch == '\n')
1922 || (ch == '\r'))
1923 {
1924 break;
1925 }
1926 }
1927 pc.str.append(ctx.get());
1928 }
1929 }
1930
1931
parse_off_newlines(tok_ctx & ctx,chunk_t & pc)1932 static bool parse_off_newlines(tok_ctx &ctx, chunk_t &pc)
1933 {
1934 size_t nl_count = 0;
1935
1936 // Parse off newlines/blank lines
1937 while (parse_newline(ctx))
1938 {
1939 nl_count++;
1940 }
1941
1942 if (nl_count > 0)
1943 {
1944 pc.nl_count = nl_count;
1945 set_chunk_type(&pc, CT_NEWLINE);
1946 return(true);
1947 }
1948 return(false);
1949 }
1950
1951
parse_macro(tok_ctx & ctx,chunk_t & pc,const chunk_t * prev_pc)1952 static bool parse_macro(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc)
1953 {
1954 if (parse_off_newlines(ctx, pc))
1955 {
1956 return(true);
1957 }
1958
1959 if (parse_comment(ctx, pc)) // allow CT_COMMENT_MULTI within macros
1960 {
1961 return(true);
1962 }
1963 ctx.save();
1964 pc.str.clear();
1965
1966 bool continued = ( chunk_is_token(prev_pc, CT_NL_CONT)
1967 || chunk_is_token(prev_pc, CT_COMMENT_MULTI));
1968
1969 while (ctx.more())
1970 {
1971 size_t pk = ctx.peek(), pk1 = ctx.peek(1);
1972 bool nl = ( pk == '\n'
1973 || pk == '\r');
1974 bool nl_cont = ( pk == '\\'
1975 && ( pk1 == '\n'
1976 || pk1 == '\r'));
1977
1978 if ( ( nl_cont
1979 || ( continued
1980 && nl))
1981 && pc.str.size() > 0)
1982 {
1983 set_chunk_type(&pc, CT_IGNORED);
1984 return(true);
1985 }
1986 else if (nl)
1987 {
1988 break;
1989 }
1990 pc.str.append(ctx.get());
1991 }
1992 pc.str.clear();
1993 ctx.restore();
1994 return(false);
1995 } // parse_macro
1996
1997
parse_ignored(tok_ctx & ctx,chunk_t & pc)1998 static bool parse_ignored(tok_ctx &ctx, chunk_t &pc)
1999 {
2000 if (parse_off_newlines(ctx, pc))
2001 {
2002 return(true);
2003 }
2004 // See if the UO_enable_processing_cmt or #pragma endasm / #endasm text is on this line
2005 ctx.save();
2006 pc.str.clear();
2007
2008 while ( ctx.more()
2009 && (ctx.peek() != '\r')
2010 && (ctx.peek() != '\n'))
2011 {
2012 pc.str.append(ctx.get());
2013 }
2014
2015 if (pc.str.size() == 0)
2016 {
2017 // end of file?
2018 return(false);
2019 }
2020
2021 // HACK: turn on if we find '#endasm' or '#pragma' and 'endasm' separated by blanks
2022 if ( ( ( (pc.str.find("#pragma ") >= 0)
2023 || (pc.str.find("#pragma ") >= 0))
2024 && ( (pc.str.find(" endasm") >= 0)
2025 || (pc.str.find(" endasm") >= 0)))
2026 || (pc.str.find("#endasm") >= 0))
2027 {
2028 cpd.unc_off = false;
2029 ctx.restore();
2030 pc.str.clear();
2031 return(false);
2032 }
2033 // Note that we aren't actually making sure this is in a comment, yet
2034 log_rule_B("enable_processing_cmt");
2035 const auto &ontext = options::enable_processing_cmt();
2036
2037 if (!ontext.empty())
2038 {
2039 bool found_enable_pattern = false;
2040
2041 if ( ontext != UNCRUSTIFY_ON_TEXT
2042 && options::processing_cmt_as_regex())
2043 {
2044 std::wstring pc_wstring(pc.str.get().cbegin(),
2045 pc.str.get().cend());
2046 std::wregex criteria(std::wstring(ontext.cbegin(),
2047 ontext.cend()));
2048
2049 found_enable_pattern = std::regex_search(pc_wstring.cbegin(),
2050 pc_wstring.cend(),
2051 criteria);
2052 }
2053 else
2054 {
2055 found_enable_pattern = (pc.str.find(ontext.c_str()) >= 0);
2056 }
2057
2058 if (!found_enable_pattern)
2059 {
2060 set_chunk_type(&pc, CT_IGNORED);
2061 return(true);
2062 }
2063 }
2064 ctx.restore();
2065
2066 // parse off whitespace leading to the comment
2067 if (parse_whitespace(ctx, pc))
2068 {
2069 set_chunk_type(&pc, CT_IGNORED);
2070 return(true);
2071 }
2072
2073 // Look for the ending comment and let it pass
2074 if ( parse_comment(ctx, pc)
2075 && !cpd.unc_off)
2076 {
2077 return(true);
2078 }
2079 // Reset the chunk & scan to until a newline
2080 pc.str.clear();
2081
2082 while ( ctx.more()
2083 && (ctx.peek() != '\r')
2084 && (ctx.peek() != '\n'))
2085 {
2086 pc.str.append(ctx.get());
2087 }
2088
2089 if (pc.str.size() > 0)
2090 {
2091 set_chunk_type(&pc, CT_IGNORED);
2092 return(true);
2093 }
2094 return(false);
2095 } // parse_ignored
2096
2097
parse_next(tok_ctx & ctx,chunk_t & pc,const chunk_t * prev_pc)2098 static bool parse_next(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc)
2099 {
2100 if (!ctx.more())
2101 {
2102 return(false);
2103 }
2104 // Save off the current column
2105 set_chunk_type(&pc, CT_NONE);
2106 pc.orig_line = ctx.c.row;
2107 pc.column = ctx.c.col;
2108 pc.orig_col = ctx.c.col;
2109 pc.nl_count = 0;
2110 pc.flags = PCF_NONE;
2111
2112 // If it is turned off, we put everything except newlines into CT_UNKNOWN
2113 if (cpd.unc_off)
2114 {
2115 if (parse_ignored(ctx, pc))
2116 {
2117 return(true);
2118 }
2119 }
2120 log_rule_B("disable_processing_nl_cont");
2121
2122 // Parse macro blocks
2123 if (options::disable_processing_nl_cont())
2124 {
2125 if (parse_macro(ctx, pc, prev_pc))
2126 {
2127 return(true);
2128 }
2129 }
2130
2131 // Parse whitespace
2132 if (parse_whitespace(ctx, pc))
2133 {
2134 return(true);
2135 }
2136
2137 // Handle unknown/unhandled preprocessors
2138 if ( cpd.in_preproc > CT_PP_BODYCHUNK
2139 && cpd.in_preproc <= CT_PP_OTHER)
2140 {
2141 pc.str.clear();
2142 tok_info ss;
2143 ctx.save(ss);
2144 // Chunk to a newline or comment
2145 set_chunk_type(&pc, CT_PREPROC_BODY);
2146 size_t last = 0;
2147
2148 while (ctx.more())
2149 {
2150 size_t ch = ctx.peek();
2151
2152 // Fix for issue #1752
2153 // Ignoring extra spaces after ' \ ' for preproc body continuations
2154 if ( last == '\\'
2155 && ch == ' ')
2156 {
2157 ctx.get();
2158 continue;
2159 }
2160
2161 if ( (ch == '\n')
2162 || (ch == '\r'))
2163 {
2164 // Back off if this is an escaped newline
2165 if (last == '\\')
2166 {
2167 ctx.restore(ss);
2168 pc.str.pop_back();
2169 }
2170 break;
2171 }
2172
2173 // Quit on a C or C++ comment start Issue #1966
2174 if ( (ch == '/')
2175 && ( (ctx.peek(1) == '/')
2176 || (ctx.peek(1) == '*')))
2177 {
2178 break;
2179 }
2180 last = ch;
2181 ctx.save(ss);
2182
2183 pc.str.append(ctx.get());
2184 }
2185
2186 if (pc.str.size() > 0)
2187 {
2188 return(true);
2189 }
2190 }
2191
2192 // Detect backslash-newline
2193 if ( (ctx.peek() == '\\')
2194 && parse_bs_newline(ctx, pc))
2195 {
2196 return(true);
2197 }
2198
2199 // Parse comments
2200 if (parse_comment(ctx, pc))
2201 {
2202 return(true);
2203 }
2204
2205 // Parse code placeholders
2206 if (parse_code_placeholder(ctx, pc))
2207 {
2208 return(true);
2209 }
2210
2211 if (language_is_set(LANG_CS))
2212 {
2213 if (parse_cs_string(ctx, pc))
2214 {
2215 return(true);
2216 }
2217
2218 // check for non-keyword identifiers such as @if @switch, etc
2219 if ( (ctx.peek() == '@')
2220 && CharTable::IsKw1(ctx.peek(1)))
2221 {
2222 parse_word(ctx, pc, true);
2223 return(true);
2224 }
2225 }
2226
2227 // handle VALA """ strings """
2228 if ( language_is_set(LANG_VALA)
2229 && (ctx.peek() == '"')
2230 && (ctx.peek(1) == '"')
2231 && (ctx.peek(2) == '"'))
2232 {
2233 parse_verbatim_string(ctx, pc);
2234 return(true);
2235 }
2236 /*
2237 * handle C++(11) string/char literal prefixes u8|u|U|L|R including all
2238 * possible combinations and optional R delimiters: R"delim(x)delim"
2239 */
2240 auto ch = ctx.peek();
2241
2242 if ( language_is_set(LANG_C | LANG_CPP)
2243 && ( ch == 'u'
2244 || ch == 'U'
2245 || ch == 'R'
2246 || ch == 'L'))
2247 {
2248 auto idx = size_t{};
2249 auto is_real = false;
2250
2251 if ( ch == 'u'
2252 && ctx.peek(1) == '8')
2253 {
2254 idx = 2;
2255 }
2256 else if ( unc_tolower(ch) == 'u'
2257 || ch == 'L')
2258 {
2259 idx++;
2260 }
2261
2262 if ( language_is_set(LANG_C | LANG_CPP)
2263 && ctx.peek(idx) == 'R')
2264 {
2265 idx++;
2266 is_real = true;
2267 }
2268 const auto quote = ctx.peek(idx);
2269
2270 if (is_real)
2271 {
2272 if ( quote == '"'
2273 && parse_cr_string(ctx, pc, idx))
2274 {
2275 return(true);
2276 }
2277 }
2278 else if ( ( quote == '"'
2279 || quote == '\'')
2280 && parse_string(ctx, pc, idx, true))
2281 {
2282 return(true);
2283 }
2284 }
2285
2286 // PAWN specific stuff
2287 if (language_is_set(LANG_PAWN))
2288 {
2289 if ( cpd.preproc_ncnl_count == 1
2290 && ( cpd.in_preproc == CT_PP_DEFINE
2291 || cpd.in_preproc == CT_PP_EMIT))
2292 {
2293 parse_pawn_pattern(ctx, pc, CT_MACRO);
2294 return(true);
2295 }
2296
2297 // Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi"
2298 if ( (ctx.peek() == '\\')
2299 || (ctx.peek() == '!'))
2300 {
2301 if (ctx.peek(1) == '"')
2302 {
2303 parse_string(ctx, pc, 1, (ctx.peek() == '!'));
2304 return(true);
2305 }
2306
2307 if ( ( (ctx.peek(1) == '\\')
2308 || (ctx.peek(1) == '!'))
2309 && (ctx.peek(2) == '"'))
2310 {
2311 parse_string(ctx, pc, 2, false);
2312 return(true);
2313 }
2314 }
2315
2316 // handle PAWN preprocessor args %0 .. %9
2317 if ( cpd.in_preproc == CT_PP_DEFINE
2318 && (ctx.peek() == '%')
2319 && unc_isdigit(ctx.peek(1)))
2320 {
2321 pc.str.clear();
2322 pc.str.append(ctx.get());
2323 pc.str.append(ctx.get());
2324 set_chunk_type(&pc, CT_WORD);
2325 return(true);
2326 }
2327 }
2328 // Parse strings and character constants
2329
2330 //parse_word(ctx, pc_temp, true);
2331 //ctx.restore(ctx.c);
2332 if (parse_number(ctx, pc))
2333 {
2334 return(true);
2335 }
2336
2337 if (language_is_set(LANG_D))
2338 {
2339 // D specific stuff
2340 if (d_parse_string(ctx, pc))
2341 {
2342 return(true);
2343 }
2344 }
2345 else
2346 {
2347 // Not D stuff
2348
2349 // Check for L'a', L"abc", 'a', "abc", <abc> strings
2350 ch = ctx.peek();
2351 size_t ch1 = ctx.peek(1);
2352
2353 if ( ( ( (ch == 'L')
2354 || (ch == 'S'))
2355 && ( (ch1 == '"')
2356 || (ch1 == '\'')))
2357 || (ch == '"')
2358 || (ch == '\'')
2359 || ( (ch == '<')
2360 && cpd.in_preproc == CT_PP_INCLUDE))
2361 {
2362 parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true);
2363 set_chunk_parent(&pc, CT_PP_INCLUDE);
2364 return(true);
2365 }
2366
2367 if ( (ch == '<')
2368 && cpd.in_preproc == CT_PP_DEFINE)
2369 {
2370 if (chunk_is_token(chunk_get_tail(), CT_MACRO))
2371 {
2372 // We have "#define XXX <", assume '<' starts an include string
2373 parse_string(ctx, pc, 0, false);
2374 return(true);
2375 }
2376 }
2377
2378 /* Inside clang's __has_include() could be "path/to/file.h" or system-style <path/to/file.h> */
2379 if ( (ch == '(')
2380 && (chunk_get_tail() != nullptr)
2381 && ( chunk_is_token(chunk_get_tail(), CT_CNG_HASINC)
2382 || chunk_is_token(chunk_get_tail(), CT_CNG_HASINCN)))
2383 {
2384 parse_string(ctx, pc, 0, false);
2385 return(true);
2386 }
2387 }
2388
2389 // Check for Objective C literals and VALA identifiers ('@1', '@if')
2390 if ( language_is_set(LANG_OC | LANG_VALA)
2391 && (ctx.peek() == '@'))
2392 {
2393 size_t nc = ctx.peek(1);
2394
2395 if (nc == 'R') // Issue #2720
2396 {
2397 if (ctx.peek(2) == '"')
2398 {
2399 if (parse_cr_string(ctx, pc, 2)) // Issue #3027
2400 {
2401 return(true);
2402 }
2403 // parse string without escaping
2404 parse_string(ctx, pc, 2, false);
2405 return(true);
2406 }
2407 }
2408
2409 if ( (nc == '"')
2410 || (nc == '\''))
2411 {
2412 // literal string
2413 parse_string(ctx, pc, 1, true);
2414 return(true);
2415 }
2416
2417 if ( (nc >= '0')
2418 && (nc <= '9'))
2419 {
2420 // literal number
2421 pc.str.append(ctx.get()); // store the '@'
2422 parse_number(ctx, pc);
2423 return(true);
2424 }
2425 }
2426
2427 // Check for pawn/ObjectiveC/Java and normal identifiers
2428 if ( CharTable::IsKw1(ctx.peek())
2429 || ( (ctx.peek() == '\\')
2430 && (unc_tolower(ctx.peek(1)) == 'u'))
2431 || ( (ctx.peek() == '@')
2432 && CharTable::IsKw1(ctx.peek(1))))
2433 {
2434 parse_word(ctx, pc, false);
2435 return(true);
2436 }
2437
2438 // Check for C++11/14/17/20 attribute specifier sequences
2439 if ( language_is_set(LANG_CPP)
2440 && ctx.peek() == '[')
2441 {
2442 if ( !language_is_set(LANG_OC)
2443 || !chunk_is_token(prev_pc, CT_OC_AT))
2444 {
2445 if (auto length = parse_attribute_specifier_sequence(ctx))
2446 {
2447 extract_attribute_specifier_sequence(ctx, pc, length);
2448 return(true);
2449 }
2450 }
2451 }
2452 // see if we have a punctuator
2453 char punc_txt[7];
2454
2455 punc_txt[0] = ctx.peek();
2456 punc_txt[1] = ctx.peek(1);
2457 punc_txt[2] = ctx.peek(2);
2458 punc_txt[3] = ctx.peek(3);
2459 punc_txt[4] = ctx.peek(4);
2460 punc_txt[5] = ctx.peek(5);
2461 punc_txt[6] = '\0';
2462 const chunk_tag_t *punc;
2463
2464 if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != nullptr)
2465 {
2466 int cnt = strlen(punc->tag);
2467
2468 while (cnt--)
2469 {
2470 pc.str.append(ctx.get());
2471 }
2472 set_chunk_type(&pc, punc->type);
2473 pc.flags |= PCF_PUNCTUATOR;
2474 return(true);
2475 }
2476 /* When parsing C/C++ files and running into some unknown token,
2477 * check if matches Objective-C as a last resort, before
2478 * considering it as garbage.
2479 */
2480 int probe_lang_flags = 0;
2481
2482 if (language_is_set(LANG_C | LANG_CPP))
2483 {
2484 probe_lang_flags = cpd.lang_flags | LANG_OC;
2485 }
2486
2487 if (probe_lang_flags != 0)
2488 {
2489 if ((punc = find_punctuator(punc_txt, probe_lang_flags)) != NULL)
2490 {
2491 cpd.lang_flags = probe_lang_flags;
2492 int cnt = strlen(punc->tag);
2493
2494 while (cnt--)
2495 {
2496 pc.str.append(ctx.get());
2497 }
2498 set_chunk_type(&pc, punc->type);
2499 pc.flags |= PCF_PUNCTUATOR;
2500 return(true);
2501 }
2502 }
2503 // throw away this character
2504 set_chunk_type(&pc, CT_UNKNOWN);
2505 pc.str.append(ctx.get());
2506
2507 LOG_FMT(LWARN, "%s:%zu Garbage in col %d: %x\n",
2508 cpd.filename.c_str(), pc.orig_line, (int)ctx.c.col, pc.str[0]);
2509 cpd.error_count++;
2510 return(true);
2511 } // parse_next
2512
2513
find_disable_processing_comment_marker(const unc_text & text,std::size_t start_idx)2514 int find_disable_processing_comment_marker(const unc_text &text,
2515 std::size_t start_idx)
2516 {
2517 log_rule_B("disable_processing_cmt");
2518 const auto &offtext = options::disable_processing_cmt();
2519 int idx = -1;
2520
2521 if ( !offtext.empty()
2522 && start_idx < text.size())
2523 {
2524 if ( offtext != UNCRUSTIFY_OFF_TEXT
2525 && options::processing_cmt_as_regex())
2526 {
2527 std::wsmatch match;
2528 std::wstring pc_wstring(text.get().cbegin() + start_idx,
2529 text.get().cend());
2530 std::wregex criteria(std::wstring(offtext.cbegin(),
2531 offtext.cend()));
2532
2533 std::regex_search(pc_wstring.cbegin(),
2534 pc_wstring.cend(),
2535 match,
2536 criteria);
2537
2538 if (!match.empty())
2539 {
2540 idx = int(match.position() + start_idx);
2541 }
2542 }
2543 else
2544 {
2545 idx = text.find(offtext.c_str(),
2546 start_idx);
2547
2548 if (idx >= 0)
2549 {
2550 idx += int(offtext.size());
2551 }
2552 }
2553
2554 /**
2555 * update the position to the start of the current line
2556 */
2557 while ( idx > 0
2558 && text[idx - 1] != '\n')
2559 {
2560 --idx;
2561 }
2562 }
2563 return(idx);
2564 } // find_disable_processing_comment_marker
2565
2566
find_enable_processing_comment_marker(const unc_text & text,std::size_t start_idx)2567 int find_enable_processing_comment_marker(const unc_text &text,
2568 std::size_t start_idx)
2569 {
2570 log_rule_B("enable_processing_cmt");
2571 const auto &ontext = options::enable_processing_cmt();
2572 int idx = -1;
2573
2574 if ( !ontext.empty()
2575 && start_idx < text.size())
2576 {
2577 if ( ontext != UNCRUSTIFY_ON_TEXT
2578 && options::processing_cmt_as_regex())
2579 {
2580 std::wsmatch match;
2581 std::wstring pc_wstring(text.get().cbegin() + start_idx,
2582 text.get().cend());
2583 std::wregex criteria(std::wstring(ontext.cbegin(),
2584 ontext.cend()));
2585
2586 std::regex_search(pc_wstring.cbegin(),
2587 pc_wstring.cend(),
2588 match,
2589 criteria);
2590
2591 if (!match.empty())
2592 {
2593 idx = int(start_idx + match.position() + match.size());
2594 }
2595 }
2596 else
2597 {
2598 idx = text.find(ontext.c_str(),
2599 start_idx);
2600
2601 if (idx >= 0)
2602 {
2603 idx += int(ontext.size());
2604 }
2605 }
2606
2607 /**
2608 * update the position to the end of the current line
2609 */
2610 if (idx >= 0)
2611 {
2612 while ( idx < int(text.size())
2613 && text[idx] != '\n')
2614 {
2615 ++idx;
2616 }
2617 }
2618 }
2619 return(idx);
2620 } // find_enable_processing_comment_marker
2621
2622
tokenize(const deque<int> & data,chunk_t * ref)2623 void tokenize(const deque<int> &data, chunk_t *ref)
2624 {
2625 tok_ctx ctx(data);
2626 chunk_t chunk;
2627 chunk_t *pc = nullptr;
2628 chunk_t *rprev = nullptr;
2629 bool last_was_tab = false;
2630 size_t prev_sp = 0;
2631 int num_stripped = 0; // Issue #1966
2632
2633 cpd.unc_stage = unc_stage_e::TOKENIZE;
2634
2635 while (ctx.more())
2636 {
2637 chunk.reset();
2638 chunk.pp_level = 0;
2639
2640 if (!parse_next(ctx, chunk, pc))
2641 {
2642 LOG_FMT(LERR, "%s:%zu Bailed before the end?\n",
2643 cpd.filename.c_str(), ctx.c.row);
2644 cpd.error_count++;
2645 break;
2646 }
2647
2648 if ( language_is_set(LANG_JAVA)
2649 && chunk.type == CT_MEMBER
2650 && !memcmp(chunk.text(), "->", 2))
2651 {
2652 chunk.type = CT_LAMBDA;
2653 }
2654
2655 // Don't create an entry for whitespace
2656 if (chunk.type == CT_WHITESPACE)
2657 {
2658 last_was_tab = chunk.after_tab;
2659 prev_sp = chunk.orig_prev_sp;
2660 continue;
2661 }
2662 chunk.orig_prev_sp = prev_sp;
2663 prev_sp = 0;
2664
2665 if (chunk.type == CT_NEWLINE)
2666 {
2667 last_was_tab = chunk.after_tab;
2668 chunk.after_tab = false;
2669 chunk.str.clear();
2670 }
2671 else if (chunk.type == CT_NL_CONT)
2672 {
2673 last_was_tab = chunk.after_tab;
2674 chunk.after_tab = false;
2675 chunk.str = "\\\n";
2676 }
2677 else
2678 {
2679 chunk.after_tab = last_was_tab;
2680 last_was_tab = false;
2681 }
2682
2683 if (chunk.type != CT_IGNORED)
2684 {
2685 // Issue #1338
2686 // Strip trailing whitespace (for CPP comments and PP blocks)
2687 num_stripped = 0; // Issue #1966
2688
2689 while ( (chunk.str.size() > 0)
2690 && ( (chunk.str[chunk.str.size() - 1] == ' ')
2691 || (chunk.str[chunk.str.size() - 1] == '\t')))
2692 {
2693 // If comment contains backslash '\' followed by whitespace chars, keep last one;
2694 // this will prevent it from turning '\' into line continuation.
2695 if ( (chunk.str.size() > 1)
2696 && (chunk.str[chunk.str.size() - 2] == '\\'))
2697 {
2698 break;
2699 }
2700 chunk.str.pop_back();
2701 num_stripped++; // Issue #1966
2702 }
2703 }
2704 // Store off the end column
2705 chunk.orig_col_end = ctx.c.col;
2706
2707 if ( ( chunk.type == CT_COMMENT_MULTI // Issue #1966
2708 || chunk.type == CT_COMMENT
2709 || chunk.type == CT_COMMENT_CPP)
2710 && (pc != nullptr)
2711 && chunk_is_token(pc, CT_PP_IGNORE))
2712 {
2713 chunk.orig_col_end -= num_stripped;
2714 }
2715 // Add the chunk to the list
2716 rprev = pc;
2717
2718 if (rprev != nullptr)
2719 {
2720 chunk_flags_set(pc, rprev->flags & PCF_COPY_FLAGS);
2721
2722 // a newline can't be in a preprocessor
2723 if (chunk_is_token(pc, CT_NEWLINE))
2724 {
2725 chunk_flags_clr(pc, PCF_IN_PREPROC);
2726 }
2727 }
2728
2729 if (ref != nullptr)
2730 {
2731 chunk.flags |= PCF_INSERTED;
2732 }
2733 else
2734 {
2735 chunk.flags &= ~PCF_INSERTED;
2736 }
2737 pc = chunk_add_before(&chunk, ref);
2738
2739 // A newline marks the end of a preprocessor
2740 if (chunk_is_token(pc, CT_NEWLINE)) // || chunk_is_token(pc, CT_COMMENT_MULTI))
2741 {
2742 cpd.in_preproc = CT_NONE;
2743 cpd.preproc_ncnl_count = 0;
2744 }
2745
2746 // Disable indentation when #asm directive found
2747 if (chunk_is_token(pc, CT_PP_ASM))
2748 {
2749 LOG_FMT(LBCTRL, "Found a directive %s on line %zu\n", "#asm", pc->orig_line);
2750 cpd.unc_off = true;
2751 }
2752
2753 // Special handling for preprocessor stuff
2754 if (cpd.in_preproc != CT_NONE)
2755 {
2756 chunk_flags_set(pc, PCF_IN_PREPROC);
2757
2758 // Count words after the preprocessor
2759 if ( !chunk_is_comment(pc)
2760 && !chunk_is_newline(pc))
2761 {
2762 cpd.preproc_ncnl_count++;
2763 }
2764
2765 // Disable indentation if a #pragma asm directive is found
2766 if (cpd.in_preproc == CT_PP_PRAGMA)
2767 {
2768 if (memcmp(pc->text(), "asm", 3) == 0)
2769 {
2770 LOG_FMT(LBCTRL, "Found a pragma %s on line %zu\n", "asm", pc->orig_line);
2771 cpd.unc_off = true;
2772 }
2773 }
2774
2775 // Figure out the type of preprocessor for #include parsing
2776 if (cpd.in_preproc == CT_PREPROC)
2777 {
2778 if ( pc->type < CT_PP_DEFINE
2779 || pc->type > CT_PP_OTHER)
2780 {
2781 set_chunk_type(pc, CT_PP_OTHER);
2782 }
2783 cpd.in_preproc = pc->type;
2784 }
2785 else if (cpd.in_preproc == CT_PP_IGNORE)
2786 {
2787 // ASSERT(options::pp_ignore_define_body());
2788 if ( !chunk_is_token(pc, CT_NL_CONT)
2789 && !chunk_is_token(pc, CT_COMMENT_CPP)
2790 && !chunk_is_token(pc, CT_COMMENT)
2791 && !chunk_is_token(pc, CT_COMMENT_MULTI)) // Issue #1966
2792 {
2793 set_chunk_type(pc, CT_PP_IGNORE);
2794 }
2795 }
2796 else if ( cpd.in_preproc == CT_PP_DEFINE
2797 && chunk_is_token(pc, CT_PAREN_CLOSE)
2798 && options::pp_ignore_define_body())
2799 {
2800 log_rule_B("pp_ignore_define_body");
2801 // When we have a PAREN_CLOSE in a PP_DEFINE we should be terminating a MACRO_FUNC
2802 // arguments list. Therefore we can enter the PP_IGNORE state and ignore next chunks.
2803 cpd.in_preproc = CT_PP_IGNORE;
2804 }
2805 }
2806 else
2807 {
2808 // Check for a preprocessor start
2809 if ( chunk_is_token(pc, CT_POUND)
2810 && ( rprev == nullptr
2811 || chunk_is_token(rprev, CT_NEWLINE)))
2812 {
2813 set_chunk_type(pc, CT_PREPROC);
2814 chunk_flags_set(pc, PCF_IN_PREPROC);
2815 cpd.in_preproc = CT_PREPROC;
2816 }
2817 }
2818
2819 if (chunk_is_token(pc, CT_NEWLINE))
2820 {
2821 LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, <Newline>, nl is %zu\n",
2822 __func__, __LINE__, pc->orig_line, pc->orig_col, pc->nl_count);
2823 }
2824 else if (chunk_is_token(pc, CT_VBRACE_OPEN))
2825 {
2826 LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, type is %s, orig_col_end is %zu\n",
2827 __func__, __LINE__, pc->orig_line, pc->orig_col, get_token_name(pc->type), pc->orig_col_end);
2828 }
2829 else
2830 {
2831 char copy[1000];
2832 LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, text() '%s', type is %s, orig_col_end is %zu\n",
2833 __func__, __LINE__, pc->orig_line, pc->orig_col, pc->elided_text(copy), get_token_name(pc->type), pc->orig_col_end);
2834 }
2835 }
2836 // Set the cpd.newline string for this file
2837 log_rule_B("newlines");
2838
2839 if ( options::newlines() == LE_LF
2840 || ( options::newlines() == LE_AUTO
2841 && (LE_COUNT(LF) >= LE_COUNT(CRLF))
2842 && (LE_COUNT(LF) >= LE_COUNT(CR))))
2843 {
2844 // LF line ends
2845 cpd.newline = "\n";
2846 LOG_FMT(LLINEENDS, "Using LF line endings\n");
2847 }
2848 else if ( options::newlines() == LE_CRLF
2849 || ( options::newlines() == LE_AUTO
2850 && (LE_COUNT(CRLF) >= LE_COUNT(LF))
2851 && (LE_COUNT(CRLF) >= LE_COUNT(CR))))
2852 {
2853 // CRLF line ends
2854 cpd.newline = "\r\n";
2855 LOG_FMT(LLINEENDS, "Using CRLF line endings\r\n");
2856 }
2857 else
2858 {
2859 // CR line ends
2860 cpd.newline = "\r";
2861 LOG_FMT(LLINEENDS, "Using CR line endings\n");
2862 }
2863 } // tokenize
2864