1 // Copyright (C) 2005  Davis E. King (davis@dlib.net)
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #ifndef DLIB_CPP_TOKENIZER_KERNEl_1_
4 #define DLIB_CPP_TOKENIZER_KERNEl_1_
5 
#include <cctype>
#include <iostream>
#include <string>
#include <utility>
#include "cpp_tokenizer_kernel_abstract.h"
#include "../algs.h"
10 
11 namespace dlib
12 {
13 
    namespace cpp_tok_kernel_1_helper
    {
        // A single buffered token: its text together with its type (one of the
        // token type enum values declared in cpp_tokenizer_kernel_1).
        struct token_text_pair
        {
            std::string token;  // the text of the token
            int type=0;         // the token's type; defaults to 0 until assigned
        };

    }
23 
    template <
        typename tok,
        typename queue,
        typename set
        >
    class cpp_tokenizer_kernel_1
    {
        /*!
            REQUIREMENTS ON tok
                tok must be an implementation of tokenizer/tokenizer_kernel_abstract.h

            REQUIREMENTS ON queue
                queue must be an implementation of queue/queue_kernel_abstract.h
                and must have T==cpp_tok_kernel_1_helper::token_text_pair

            REQUIREMENTS ON set
                set must be an implementation of set/set_kernel_abstract.h or
                hash_set/hash_set_kernel_abstract.h and must have T==std::string.

            INITIAL VALUE
                - keywords == a set of all the C++ keywords
                - tokenizer.stream_is_set() == false
                - buffer.size() == 0
                - tokenizer.get_identifier_head() == "$_" + tokenizer.lowercase_letters() +
                  tokenizer.uppercase_letters()
                - tokenizer.get_identifier_body() == "$_" + tokenizer.lowercase_letters() +
                  tokenizer.uppercase_letters() + tokenizer.numbers()
                - have_peeked == false


            CONVENTION
                - tokenizer.stream_is_set() == stream_is_set()
                - tokenizer.get_stream() == get_stream()
                - keywords == a set of all the C++ keywords

                - tokenizer.get_identifier_head() == "$_" + tokenizer.lowercase_letters() +
                  tokenizer.uppercase_letters()
                - tokenizer.get_identifier_body() == "$_" + tokenizer.lowercase_letters() +
                  tokenizer.uppercase_letters() + tokenizer.numbers()

                - buffer == a queue of tokens.  This is where we put tokens
                  we gathered early due to looking ahead.


                - if (have_peeked) then
                    - next_token == the next token to be returned from get_token()
                    - next_type == the type of the token in next_token
        !*/

        typedef cpp_tok_kernel_1_helper::token_text_pair token_text_pair;

    public:

        // token types reported by get_token()/peek_type()
        enum
        {
            END_OF_FILE,        // the input stream is exhausted
            KEYWORD,            // a C++ keyword or a recognized preprocessor directive
            COMMENT,            // a // or /**/ comment, delimiters included
            SINGLE_QUOTED_TEXT, // the text between a pair of single quotes
            DOUBLE_QUOTED_TEXT, // the text between a pair of double quotes
            IDENTIFIER,         // a C++ identifier that is not a keyword
            OTHER,              // any single character not covered above
            NUMBER,             // a numeric literal
            WHITE_SPACE         // one or more consecutive spaces/tabs/newlines
        };

        cpp_tokenizer_kernel_1 (
        );

        virtual ~cpp_tokenizer_kernel_1 (
        );

        void clear(
        );

        void set_stream (
            std::istream& in
        );

        bool stream_is_set (
        ) const;

        std::istream& get_stream (
        ) const;

        void get_token (
            int& type,
            std::string& token
        );

        int peek_type (
        ) const;

        const std::string& peek_token (
        ) const;

        void swap (
            cpp_tokenizer_kernel_1<tok,queue,set>& item
        );

    private:

        void buffer_token(
            int type,
            const std::string& token
        )
        /*!
            ensures
                - stores the token and its type into buffer
        !*/
        {
            token_text_pair temp;
            temp.token = token;
            temp.type = type;
            buffer.enqueue(temp);
        }

        // overload for a single-character token
        void buffer_token(
            int type,
            char token
        )
        /*!
            ensures
                - stores the token and its type into buffer
        !*/
        {
            token_text_pair temp;
            temp.token = token;
            temp.type = type;
            buffer.enqueue(temp);
        }

        // restricted functions
        cpp_tokenizer_kernel_1(const cpp_tokenizer_kernel_1<tok,queue,set>&);        // copy constructor
        cpp_tokenizer_kernel_1<tok,queue,set>& operator=(const cpp_tokenizer_kernel_1<tok,queue,set>&);    // assignment operator

        // data members
        set keywords;       // every C++ keyword; filled in by the constructor
        queue buffer;       // tokens gathered early while looking ahead
        tok tokenizer;      // the underlying low level tokenizer

        // peek cache; mutable so the const peek_type()/peek_token() can fill it in
        mutable std::string next_token;
        mutable int next_type;
        mutable bool have_peeked;


    };
171 
172     template <
173         typename tok,
174         typename queue,
175         typename set
176         >
swap(cpp_tokenizer_kernel_1<tok,queue,set> & a,cpp_tokenizer_kernel_1<tok,queue,set> & b)177     inline void swap (
178         cpp_tokenizer_kernel_1<tok,queue,set>& a,
179         cpp_tokenizer_kernel_1<tok,queue,set>& b
180     ) { a.swap(b); }
181 
182 // ----------------------------------------------------------------------------------------
183 // ----------------------------------------------------------------------------------------
184     // member function definitions
185 // ----------------------------------------------------------------------------------------
186 // ----------------------------------------------------------------------------------------
187 
188     template <
189         typename tok,
190         typename queue,
191         typename set
192         >
193     cpp_tokenizer_kernel_1<tok,queue,set>::
cpp_tokenizer_kernel_1()194     cpp_tokenizer_kernel_1(
195     ) :
196         have_peeked(false)
197     {
198         // add C++ keywords to keywords
199         std::string temp;
200         temp = "#include";              keywords.add(temp);
201         temp = "__asm";                 keywords.add(temp);
202         temp = "_asm";                  keywords.add(temp);
203         temp = "if";                    keywords.add(temp);
204         temp = "int";                   keywords.add(temp);
205         temp = "else";                  keywords.add(temp);
206         temp = "template";              keywords.add(temp);
207         temp = "void";                  keywords.add(temp);
208         temp = "false";                 keywords.add(temp);
209         temp = "class";                 keywords.add(temp);
210         temp = "public";                keywords.add(temp);
211         temp = "while";                 keywords.add(temp);
212         temp = "bool";                  keywords.add(temp);
213         temp = "new";                   keywords.add(temp);
214         temp = "delete";                keywords.add(temp);
215         temp = "true";                  keywords.add(temp);
216         temp = "typedef";               keywords.add(temp);
217         temp = "const";                 keywords.add(temp);
218         temp = "virtual";               keywords.add(temp);
219         temp = "inline";                keywords.add(temp);
220         temp = "for";                   keywords.add(temp);
221         temp = "break";                 keywords.add(temp);
222         temp = "struct";                keywords.add(temp);
223         temp = "float";                 keywords.add(temp);
224         temp = "case";                  keywords.add(temp);
225         temp = "enum";                  keywords.add(temp);
226         temp = "this";                  keywords.add(temp);
227         temp = "typeid";                keywords.add(temp);
228         temp = "double";                keywords.add(temp);
229         temp = "char";                  keywords.add(temp);
230         temp = "typename";              keywords.add(temp);
231         temp = "signed";                keywords.add(temp);
232         temp = "friend";                keywords.add(temp);
233         temp = "wint_t";                keywords.add(temp);
234         temp = "default";               keywords.add(temp);
235         temp = "asm";                   keywords.add(temp);
236         temp = "reinterpret_cast";      keywords.add(temp);
237         temp = "#define";               keywords.add(temp);
238         temp = "do";                    keywords.add(temp);
239         temp = "continue";              keywords.add(temp);
240         temp = "auto";                  keywords.add(temp);
241         temp = "unsigned";              keywords.add(temp);
242         temp = "size_t";                keywords.add(temp);
243         temp = "#undef";                keywords.add(temp);
244         temp = "#pragma";               keywords.add(temp);
245         temp = "namespace";             keywords.add(temp);
246         temp = "private";               keywords.add(temp);
247         temp = "#endif";                keywords.add(temp);
248         temp = "catch";                 keywords.add(temp);
249         temp = "#else";                 keywords.add(temp);
250         temp = "register";              keywords.add(temp);
251         temp = "volatile";              keywords.add(temp);
252         temp = "const_cast";            keywords.add(temp);
253         temp = "#end";                  keywords.add(temp);
254         temp = "mutable";               keywords.add(temp);
255         temp = "static_cast";           keywords.add(temp);
256         temp = "wchar_t";               keywords.add(temp);
257         temp = "#if";                   keywords.add(temp);
258         temp = "protected";             keywords.add(temp);
259         temp = "throw";                 keywords.add(temp);
260         temp = "using";                 keywords.add(temp);
261         temp = "dynamic_cast";          keywords.add(temp);
262         temp = "#ifdef";                keywords.add(temp);
263         temp = "return";                keywords.add(temp);
264         temp = "short";                 keywords.add(temp);
265         temp = "#error";                keywords.add(temp);
266         temp = "#line";                 keywords.add(temp);
267         temp = "explicit";              keywords.add(temp);
268         temp = "union";                 keywords.add(temp);
269         temp = "#ifndef";               keywords.add(temp);
270         temp = "try";                   keywords.add(temp);
271         temp = "sizeof";                keywords.add(temp);
272         temp = "goto";                  keywords.add(temp);
273         temp = "long";                  keywords.add(temp);
274         temp = "#elif";                 keywords.add(temp);
275         temp = "static";                keywords.add(temp);
276         temp = "operator";              keywords.add(temp);
277         temp = "switch";                keywords.add(temp);
278         temp = "extern";                keywords.add(temp);
279 
280 
281         // set the tokenizer's IDENTIFIER token for C++ identifiers
282         tokenizer.set_identifier_token(
283             "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(),
284             "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() +
285             tokenizer.numbers()
286             );
287     }
288 
289 // ----------------------------------------------------------------------------------------
290 
    template <
        typename tok,
        typename queue,
        typename set
        >
    cpp_tokenizer_kernel_1<tok,queue,set>::
    ~cpp_tokenizer_kernel_1 (
    )
    {
        // nothing to release explicitly; all members clean up after themselves
    }
301 
302 // ----------------------------------------------------------------------------------------
303 
304     template <
305         typename tok,
306         typename queue,
307         typename set
308         >
309     void cpp_tokenizer_kernel_1<tok,queue,set>::
clear()310     clear(
311     )
312     {
313         tokenizer.clear();
314         buffer.clear();
315         have_peeked = false;
316 
317         // set the tokenizer's IDENTIFIER token for C++ identifiers
318         tokenizer.set_identifier_token(
319             "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(),
320             "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() +
321             tokenizer.numbers()
322             );
323     }
324 
325 // ----------------------------------------------------------------------------------------
326 
327     template <
328         typename tok,
329         typename queue,
330         typename set
331         >
332     void cpp_tokenizer_kernel_1<tok,queue,set>::
set_stream(std::istream & in)333     set_stream (
334         std::istream& in
335     )
336     {
337         tokenizer.set_stream(in);
338         buffer.clear();
339         have_peeked = false;
340     }
341 
342 // ----------------------------------------------------------------------------------------
343 
344     template <
345         typename tok,
346         typename queue,
347         typename set
348         >
349     bool cpp_tokenizer_kernel_1<tok,queue,set>::
stream_is_set()350     stream_is_set (
351     ) const
352     {
353         return tokenizer.stream_is_set();
354     }
355 
356 // ----------------------------------------------------------------------------------------
357 
358     template <
359         typename tok,
360         typename queue,
361         typename set
362         >
363     std::istream& cpp_tokenizer_kernel_1<tok,queue,set>::
get_stream()364     get_stream (
365     ) const
366     {
367         return tokenizer.get_stream();
368     }
369 
370 // ----------------------------------------------------------------------------------------
371 
    template <
        typename tok,
        typename queue,
        typename set
        >
    void cpp_tokenizer_kernel_1<tok,queue,set>::
    get_token (
        int& type,
        std::string& token
    )
    {
        using namespace std;

        if (!have_peeked)
        {

            if (buffer.size() > 0)
            {
                // just return what is in the buffer
                token_text_pair temp;
                buffer.dequeue(temp);
                type = temp.type;
                token = temp.token;
                return;
            }

            tokenizer.get_token(type,token);

            // map the low level tokenizer's token onto this object's
            // C++ aware token types
            switch (type)
            {
            case tok::END_OF_FILE:
                {
                    type = END_OF_FILE;
                } break;

            case tok::END_OF_LINE:
            case tok::WHITE_SPACE:
                {
                    // merge any immediately following whitespace/newline tokens
                    // into this one so consecutive whitespace comes back as a
                    // single WHITE_SPACE token
                    type = tokenizer.peek_type();
                    if (type == tok::END_OF_LINE || type == tok::WHITE_SPACE)
                    {
                        std::string temp;
                        do
                        {
                            tokenizer.get_token(type,temp);
                            token += temp;
                            type = tokenizer.peek_type();
                        }while (type == tok::END_OF_LINE || type == tok::WHITE_SPACE);
                    }
                    type = WHITE_SPACE;

                } break;

            case tok::NUMBER:
                {
                    // this could be a hex number such as 0xa33.  we should check for this.
                    if (tokenizer.peek_type() == tok::IDENTIFIER && token == "0" &&
                        (tokenizer.peek_token()[0] == 'x' || tokenizer.peek_token()[0] == 'X'))
                    {
                        // this is a hex number so accumulate all the numbers and identifiers that follow
                        // because they have to be part of the number
                        std::string temp;
                        tokenizer.get_token(type,temp);
                        token = "0" + temp;

                        // get the rest of the hex number
                        while (tokenizer.peek_type() == tok::IDENTIFIER ||
                               tokenizer.peek_type() == tok::NUMBER
                               )
                        {
                            tokenizer.get_token(type,temp);
                            token += temp;
                        }

                    }
                    // or this could be a floating point value or something with an 'e' or 'E' in it.
                    else if ((tokenizer.peek_type() == tok::CHAR && tokenizer.peek_token()[0] == '.') ||
                             (tokenizer.peek_type() == tok::IDENTIFIER && std::tolower(tokenizer.peek_token()[0]) == 'e'))
                    {
                        std::string temp;
                        tokenizer.get_token(type,temp);
                        token += temp;
                        // now get the rest of the floating point value
                        while (tokenizer.peek_type() == tok::IDENTIFIER ||
                               tokenizer.peek_type() == tok::NUMBER
                               )
                        {
                            tokenizer.get_token(type,temp);
                            token += temp;
                        }
                    }
                    type = NUMBER;

                } break;

            case tok::IDENTIFIER:
                {
                    // identifiers that appear in the keywords set are reported
                    // as KEYWORD rather than IDENTIFIER
                    if (keywords.is_member(token))
                    {
                        type = KEYWORD;
                    }
                    else
                    {
                        type = IDENTIFIER;
                    }
                } break;

            case tok::CHAR:
                type = OTHER;
                switch (token[0])
                {
                case '#':
                    {
                        // this might be a preprocessor keyword so we should check the
                        // next token
                        if (tokenizer.peek_type() == tok::IDENTIFIER &&
                            keywords.is_member('#'+tokenizer.peek_token()))
                        {
                            // e.g. '#' followed by "include" becomes the single
                            // KEYWORD token "#include"
                            tokenizer.get_token(type,token);
                            token = '#' + token;
                            type = KEYWORD;
                        }
                        else
                        {
                            token = '#';
                            type = OTHER;
                        }
                    }
                    break;

                case '"':
                    {
                        // Accumulate everything up to the closing quote into temp,
                        // then buffer the quoted text and the closing quote so the
                        // caller sees:  OTHER("\"")  DOUBLE_QUOTED_TEXT  OTHER("\"")
                        string temp;
                        tokenizer.get_token(type,token);
                        while (type != tok::END_OF_FILE)
                        {
                            // if this is the end of the quoted string.  A '"' ends the
                            // string unless the preceding character is an (unescaped)
                            // backslash.
                            // NOTE(review): the temp[size-2] check handles a trailing
                            // escaped backslash (e.g. "...\\"); longer runs of
                            // backslashes are not fully analyzed -- confirm acceptable.
                            if (type == tok::CHAR && token[0] == '"' &&
                                (temp.size() == 0 || temp[temp.size()-1] != '\\' ||
                                (temp.size() > 1 && temp[temp.size()-2] == '\\') ))
                            {
                                buffer_token(DOUBLE_QUOTED_TEXT,temp);
                                buffer_token(OTHER,"\"");
                                break;
                            }
                            else
                            {
                                temp += token;
                            }
                            tokenizer.get_token(type,token);
                        }


                        type = OTHER;
                        token = '"';
                    } break;

                case '\'':
                    {
                        // A character literal comes back as three tokens:
                        // OTHER("'")  SINGLE_QUOTED_TEXT  OTHER("'")
                        string temp;
                        tokenizer.get_token(type,token);
                        // keep an escape sequence such as '\n' together
                        if (type == tok::CHAR && token[0] == '\\')
                        {
                            temp += '\\';
                            tokenizer.get_token(type,token);
                        }
                        temp += token;
                        buffer_token(SINGLE_QUOTED_TEXT,temp);

                        // The next character should be a ' so take it out and put it in
                        // the buffer.
                        tokenizer.get_token(type,token);
                        buffer_token(OTHER,token);

                        type = OTHER;
                        token = '\'';
                    } break;

                case '/':
                    {
                        // look ahead to see if this is the start of a comment
                        if (tokenizer.peek_type() == tok::CHAR)
                        {
                            if (tokenizer.peek_token()[0] == '/')
                            {
                                tokenizer.get_token(type,token);
                                // this is the start of a line comment
                                token = "//";
                                string temp;
                                tokenizer.get_token(type,temp);
                                while (type != tok::END_OF_FILE)
                                {
                                    // if this is the end of the comment.  A newline ends
                                    // the comment unless it is escaped by a trailing
                                    // backslash (line continuation).
                                    if (type == tok::END_OF_LINE &&
                                        token[token.size()-1] != '\\' )
                                    {
                                        token += '\n';
                                        break;
                                    }
                                    else
                                    {
                                        token += temp;
                                    }
                                    tokenizer.get_token(type,temp);
                                }
                                type = COMMENT;

                            }
                            else if (tokenizer.peek_token()[0] == '*')
                            {
                                tokenizer.get_token(type,token);
                                // this is the start of a block comment
                                token = "/*";
                                string temp;
                                tokenizer.get_token(type,temp);
                                while (type != tok::END_OF_FILE)
                                {
                                    // if this is the end of the comment (a '/' whose
                                    // previously accumulated character was a '*')
                                    if (type == tok::CHAR && temp[0] == '/' &&
                                        token[token.size()-1] == '*')
                                    {
                                        token += '/';
                                        break;
                                    }
                                    else
                                    {
                                        token += temp;
                                    }
                                    tokenizer.get_token(type,temp);
                                }
                                type = COMMENT;
                            }
                        }
                    } break;

                default:
                    break;
                } // switch (token[0])
            } // switch (type)
        }
        else
        {
            // if we get this far it means we have peeked so we should
            // return the peek data.
            type = next_type;
            token = next_token;
            have_peeked = false;
        }
    }
621 
622 // ----------------------------------------------------------------------------------------
623 
624     template <
625         typename tok,
626         typename queue,
627         typename set
628         >
629     int cpp_tokenizer_kernel_1<tok,queue,set>::
peek_type()630     peek_type (
631     ) const
632     {
633         const_cast<cpp_tokenizer_kernel_1<tok,queue,set>*>(this)->get_token(next_type,next_token);
634         have_peeked = true;
635         return next_type;
636     }
637 
638 // ----------------------------------------------------------------------------------------
639 
640     template <
641         typename tok,
642         typename queue,
643         typename set
644         >
645     const std::string& cpp_tokenizer_kernel_1<tok,queue,set>::
peek_token()646     peek_token (
647     ) const
648     {
649         const_cast<cpp_tokenizer_kernel_1<tok,queue,set>*>(this)->get_token(next_type,next_token);
650         have_peeked = true;
651         return next_token;
652     }
653 
654 // ----------------------------------------------------------------------------------------
655 
656     template <
657         typename tok,
658         typename queue,
659         typename set
660         >
661     void cpp_tokenizer_kernel_1<tok,queue,set>::
swap(cpp_tokenizer_kernel_1 & item)662     swap (
663         cpp_tokenizer_kernel_1& item
664     )
665     {
666         tokenizer.swap(item.tokenizer);
667         buffer.swap(item.buffer);
668     }
669 
670 // ----------------------------------------------------------------------------------------
671 
672 }
673 
674 #endif // DLIB_CPP_TOKENIZER_KERNEl_1_
675 
676