1 // Copyright (C) 2005 Davis E. King (davis@dlib.net) 2 // License: Boost Software License See LICENSE.txt for the full license. 3 #ifndef DLIB_CPP_TOKENIZER_KERNEl_1_ 4 #define DLIB_CPP_TOKENIZER_KERNEl_1_ 5 6 #include <string> 7 #include <iostream> 8 #include "cpp_tokenizer_kernel_abstract.h" 9 #include "../algs.h" 10 11 namespace dlib 12 { 13 14 namespace cpp_tok_kernel_1_helper 15 { 16 struct token_text_pair 17 { 18 std::string token; 19 int type=0; 20 }; 21 22 } 23 24 template < 25 typename tok, 26 typename queue, 27 typename set 28 > 29 class cpp_tokenizer_kernel_1 30 { 31 /*! 32 REQUIREMENTS ON tok 33 tok must be an implementation of tokenizer/tokenizer_kernel_abstract.h 34 35 REQUIREMENTS ON queue 36 queue must be an implementation of queue/queue_kernel_abstract.h 37 and must have T==cpp_tok_kernel_1_helper::token_text_pair 38 39 REQUIREMENTS ON set 40 set must be an implemention of set/set_kernel_abstract.h or 41 hash_set/hash_set_kernel_abstract.h and must have T==std::string. 42 43 INITIAL VALUE 44 - keywords == a set of all the C++ keywords 45 - tokenizer.stream_is_set() == false 46 - buffer.size() == 0 47 - tokenizer.get_identifier_head() == "$_" + tokenizer.lowercase_letters() + 48 tokenizer.uppercase_letters() 49 - tokenizer.get_identifier_body() == "$_" + tokenizer.lowercase_letters() + 50 tokenizer.uppercase_letters() + tokenizer.numbers() 51 - have_peeked == false 52 53 54 CONVENTION 55 - tokenizer.stream_is_set() == stream_is_set() 56 - tokenizer.get_stream() == get_stream() 57 - keywords == a set of all the C++ keywords 58 59 - tokenizer.get_identifier_head() == "$_" + tokenizer.lowercase_letters() + 60 tokenizer.uppercase_letters() 61 - tokenizer.get_identifier_body() == "$_" + tokenizer.lowercase_letters() + 62 tokenizer.uppercase_letters() + tokenizer.numbers() 63 64 - buffer == a queue of tokens. This is where we put tokens 65 we gathered early due to looking ahead. 66 67 68 - if (have_peeked) then 69 - next_token == the next token to be returned from get_token() 70 - next_type == the type of token in peek_token 71 !*/ 72 73 typedef cpp_tok_kernel_1_helper::token_text_pair token_text_pair; 74 75 public: 76 77 enum 78 { 79 END_OF_FILE, 80 KEYWORD, 81 COMMENT, 82 SINGLE_QUOTED_TEXT, 83 DOUBLE_QUOTED_TEXT, 84 IDENTIFIER, 85 OTHER, 86 NUMBER, 87 WHITE_SPACE 88 }; 89 90 cpp_tokenizer_kernel_1 ( 91 ); 92 93 virtual ~cpp_tokenizer_kernel_1 ( 94 ); 95 96 void clear( 97 ); 98 99 void set_stream ( 100 std::istream& in 101 ); 102 103 bool stream_is_set ( 104 ) const; 105 106 std::istream& get_stream ( 107 ) const; 108 109 void get_token ( 110 int& type, 111 std::string& token 112 ); 113 114 int peek_type ( 115 ) const; 116 117 const std::string& peek_token ( 118 ) const; 119 120 void swap ( 121 cpp_tokenizer_kernel_1<tok,queue,set>& item 122 ); 123 124 private: 125 buffer_token(int type,const std::string & token)126 void buffer_token( 127 int type, 128 const std::string& token 129 ) 130 /*! 131 ensures 132 - stores the token and its type into buffer 133 !*/ 134 { 135 token_text_pair temp; 136 temp.token = token; 137 temp.type = type; 138 buffer.enqueue(temp); 139 } 140 buffer_token(int type,char token)141 void buffer_token( 142 int type, 143 char token 144 ) 145 /*! 146 ensures 147 - stores the token and its type into buffer 148 !*/ 149 { 150 token_text_pair temp; 151 temp.token = token; 152 temp.type = type; 153 buffer.enqueue(temp); 154 } 155 156 // restricted functions 157 cpp_tokenizer_kernel_1(const cpp_tokenizer_kernel_1<tok,queue,set>&); // copy constructor 158 cpp_tokenizer_kernel_1<tok,queue,set>& operator=(const cpp_tokenizer_kernel_1<tok,queue,set>&); // assignment operator 159 160 // data members 161 set keywords; 162 queue buffer; 163 tok tokenizer; 164 165 mutable std::string next_token; 166 mutable int next_type; 167 mutable bool have_peeked; 168 169 170 }; 171 172 template < 173 typename tok, 174 typename queue, 175 typename set 176 > swap(cpp_tokenizer_kernel_1<tok,queue,set> & a,cpp_tokenizer_kernel_1<tok,queue,set> & b)177 inline void swap ( 178 cpp_tokenizer_kernel_1<tok,queue,set>& a, 179 cpp_tokenizer_kernel_1<tok,queue,set>& b 180 ) { a.swap(b); } 181 182 // ---------------------------------------------------------------------------------------- 183 // ---------------------------------------------------------------------------------------- 184 // member function definitions 185 // ---------------------------------------------------------------------------------------- 186 // ---------------------------------------------------------------------------------------- 187 188 template < 189 typename tok, 190 typename queue, 191 typename set 192 > 193 cpp_tokenizer_kernel_1<tok,queue,set>:: cpp_tokenizer_kernel_1()194 cpp_tokenizer_kernel_1( 195 ) : 196 have_peeked(false) 197 { 198 // add C++ keywords to keywords 199 std::string temp; 200 temp = "#include"; keywords.add(temp); 201 temp = "__asm"; keywords.add(temp); 202 temp = "_asm"; keywords.add(temp); 203 temp = "if"; keywords.add(temp); 204 temp = "int"; keywords.add(temp); 205 temp = "else"; keywords.add(temp); 206 temp = "template"; keywords.add(temp); 207 temp = "void"; keywords.add(temp); 208 temp = "false"; keywords.add(temp); 209 temp = "class"; keywords.add(temp); 210 temp = "public"; keywords.add(temp); 211 temp = "while"; keywords.add(temp); 212 temp = "bool"; keywords.add(temp); 213 temp = "new"; keywords.add(temp); 214 temp = "delete"; keywords.add(temp); 215 temp = "true"; keywords.add(temp); 216 temp = "typedef"; keywords.add(temp); 217 temp = "const"; keywords.add(temp); 218 temp = "virtual"; keywords.add(temp); 219 temp = "inline"; keywords.add(temp); 220 temp = "for"; keywords.add(temp); 221 temp = "break"; keywords.add(temp); 222 temp = "struct"; keywords.add(temp); 223 temp = "float"; keywords.add(temp); 224 temp = "case"; keywords.add(temp); 225 temp = "enum"; keywords.add(temp); 226 temp = "this"; keywords.add(temp); 227 temp = "typeid"; keywords.add(temp); 228 temp = "double"; keywords.add(temp); 229 temp = "char"; keywords.add(temp); 230 temp = "typename"; keywords.add(temp); 231 temp = "signed"; keywords.add(temp); 232 temp = "friend"; keywords.add(temp); 233 temp = "wint_t"; keywords.add(temp); 234 temp = "default"; keywords.add(temp); 235 temp = "asm"; keywords.add(temp); 236 temp = "reinterpret_cast"; keywords.add(temp); 237 temp = "#define"; keywords.add(temp); 238 temp = "do"; keywords.add(temp); 239 temp = "continue"; keywords.add(temp); 240 temp = "auto"; keywords.add(temp); 241 temp = "unsigned"; keywords.add(temp); 242 temp = "size_t"; keywords.add(temp); 243 temp = "#undef"; keywords.add(temp); 244 temp = "#pragma"; keywords.add(temp); 245 temp = "namespace"; keywords.add(temp); 246 temp = "private"; keywords.add(temp); 247 temp = "#endif"; keywords.add(temp); 248 temp = "catch"; keywords.add(temp); 249 temp = "#else"; keywords.add(temp); 250 temp = "register"; keywords.add(temp); 251 temp = "volatile"; keywords.add(temp); 252 temp = "const_cast"; keywords.add(temp); 253 temp = "#end"; keywords.add(temp); 254 temp = "mutable"; keywords.add(temp); 255 temp = "static_cast"; keywords.add(temp); 256 temp = "wchar_t"; keywords.add(temp); 257 temp = "#if"; keywords.add(temp); 258 temp = "protected"; keywords.add(temp); 259 temp = "throw"; keywords.add(temp); 260 temp = "using"; keywords.add(temp); 261 temp = "dynamic_cast"; keywords.add(temp); 262 temp = "#ifdef"; keywords.add(temp); 263 temp = "return"; keywords.add(temp); 264 temp = "short"; keywords.add(temp); 265 temp = "#error"; keywords.add(temp); 266 temp = "#line"; keywords.add(temp); 267 temp = "explicit"; keywords.add(temp); 268 temp = "union"; keywords.add(temp); 269 temp = "#ifndef"; keywords.add(temp); 270 temp = "try"; keywords.add(temp); 271 temp = "sizeof"; keywords.add(temp); 272 temp = "goto"; keywords.add(temp); 273 temp = "long"; keywords.add(temp); 274 temp = "#elif"; keywords.add(temp); 275 temp = "static"; keywords.add(temp); 276 temp = "operator"; keywords.add(temp); 277 temp = "switch"; keywords.add(temp); 278 temp = "extern"; keywords.add(temp); 279 280 281 // set the tokenizer's IDENTIFIER token for C++ identifiers 282 tokenizer.set_identifier_token( 283 "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(), 284 "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() + 285 tokenizer.numbers() 286 ); 287 } 288 289 // ---------------------------------------------------------------------------------------- 290 291 template < 292 typename tok, 293 typename queue, 294 typename set 295 > 296 cpp_tokenizer_kernel_1<tok,queue,set>:: ~cpp_tokenizer_kernel_1()297 ~cpp_tokenizer_kernel_1 ( 298 ) 299 { 300 } 301 302 // ---------------------------------------------------------------------------------------- 303 304 template < 305 typename tok, 306 typename queue, 307 typename set 308 > 309 void cpp_tokenizer_kernel_1<tok,queue,set>:: clear()310 clear( 311 ) 312 { 313 tokenizer.clear(); 314 buffer.clear(); 315 have_peeked = false; 316 317 // set the tokenizer's IDENTIFIER token for C++ identifiers 318 tokenizer.set_identifier_token( 319 "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters(), 320 "$_" + tokenizer.lowercase_letters() + tokenizer.uppercase_letters() + 321 tokenizer.numbers() 322 ); 323 } 324 325 // ---------------------------------------------------------------------------------------- 326 327 template < 328 typename tok, 329 typename queue, 330 typename set 331 > 332 void cpp_tokenizer_kernel_1<tok,queue,set>:: set_stream(std::istream & in)333 set_stream ( 334 std::istream& in 335 ) 336 { 337 tokenizer.set_stream(in); 338 buffer.clear(); 339 have_peeked = false; 340 } 341 342 // ---------------------------------------------------------------------------------------- 343 344 template < 345 typename tok, 346 typename queue, 347 typename set 348 > 349 bool cpp_tokenizer_kernel_1<tok,queue,set>:: stream_is_set()350 stream_is_set ( 351 ) const 352 { 353 return tokenizer.stream_is_set(); 354 } 355 356 // ---------------------------------------------------------------------------------------- 357 358 template < 359 typename tok, 360 typename queue, 361 typename set 362 > 363 std::istream& cpp_tokenizer_kernel_1<tok,queue,set>:: get_stream()364 get_stream ( 365 ) const 366 { 367 return tokenizer.get_stream(); 368 } 369 370 // ---------------------------------------------------------------------------------------- 371 372 template < 373 typename tok, 374 typename queue, 375 typename set 376 > 377 void cpp_tokenizer_kernel_1<tok,queue,set>:: get_token(int & type,std::string & token)378 get_token ( 379 int& type, 380 std::string& token 381 ) 382 { 383 using namespace std; 384 385 if (!have_peeked) 386 { 387 388 if (buffer.size() > 0) 389 { 390 // just return what is in the buffer 391 token_text_pair temp; 392 buffer.dequeue(temp); 393 type = temp.type; 394 token = temp.token; 395 return; 396 } 397 398 tokenizer.get_token(type,token); 399 400 switch (type) 401 { 402 case tok::END_OF_FILE: 403 { 404 type = END_OF_FILE; 405 } break; 406 407 case tok::END_OF_LINE: 408 case tok::WHITE_SPACE: 409 { 410 type = tokenizer.peek_type(); 411 if (type == tok::END_OF_LINE || type == tok::WHITE_SPACE) 412 { 413 std::string temp; 414 do 415 { 416 tokenizer.get_token(type,temp); 417 token += temp; 418 type = tokenizer.peek_type(); 419 }while (type == tok::END_OF_LINE || type == tok::WHITE_SPACE); 420 } 421 type = WHITE_SPACE; 422 423 } break; 424 425 case tok::NUMBER: 426 { 427 // this could be a hex number such as 0xa33. we should check for this. 428 if (tokenizer.peek_type() == tok::IDENTIFIER && token == "0" && 429 (tokenizer.peek_token()[0] == 'x' || tokenizer.peek_token()[0] == 'X')) 430 { 431 // this is a hex number so accumulate all the numbers and identifiers that follow 432 // because they have to be part of the number 433 std::string temp; 434 tokenizer.get_token(type,temp); 435 token = "0" + temp; 436 437 // get the rest of the hex number 438 while (tokenizer.peek_type() == tok::IDENTIFIER || 439 tokenizer.peek_type() == tok::NUMBER 440 ) 441 { 442 tokenizer.get_token(type,temp); 443 token += temp; 444 } 445 446 } 447 // or this could be a floating point value or something with an 'e' or 'E' in it. 448 else if ((tokenizer.peek_type() == tok::CHAR && tokenizer.peek_token()[0] == '.') || 449 (tokenizer.peek_type() == tok::IDENTIFIER && std::tolower(tokenizer.peek_token()[0]) == 'e')) 450 { 451 std::string temp; 452 tokenizer.get_token(type,temp); 453 token += temp; 454 // now get the rest of the floating point value 455 while (tokenizer.peek_type() == tok::IDENTIFIER || 456 tokenizer.peek_type() == tok::NUMBER 457 ) 458 { 459 tokenizer.get_token(type,temp); 460 token += temp; 461 } 462 } 463 type = NUMBER; 464 465 } break; 466 467 case tok::IDENTIFIER: 468 { 469 if (keywords.is_member(token)) 470 { 471 type = KEYWORD; 472 } 473 else 474 { 475 type = IDENTIFIER; 476 } 477 } break; 478 479 case tok::CHAR: 480 type = OTHER; 481 switch (token[0]) 482 { 483 case '#': 484 { 485 // this might be a preprocessor keyword so we should check the 486 // next token 487 if (tokenizer.peek_type() == tok::IDENTIFIER && 488 keywords.is_member('#'+tokenizer.peek_token())) 489 { 490 tokenizer.get_token(type,token); 491 token = '#' + token; 492 type = KEYWORD; 493 } 494 else 495 { 496 token = '#'; 497 type = OTHER; 498 } 499 } 500 break; 501 502 case '"': 503 { 504 string temp; 505 tokenizer.get_token(type,token); 506 while (type != tok::END_OF_FILE) 507 { 508 // if this is the end of the quoted string 509 if (type == tok::CHAR && token[0] == '"' && 510 (temp.size() == 0 || temp[temp.size()-1] != '\\' || 511 (temp.size() > 1 && temp[temp.size()-2] == '\\') )) 512 { 513 buffer_token(DOUBLE_QUOTED_TEXT,temp); 514 buffer_token(OTHER,"\""); 515 break; 516 } 517 else 518 { 519 temp += token; 520 } 521 tokenizer.get_token(type,token); 522 } 523 524 525 type = OTHER; 526 token = '"'; 527 } break; 528 529 case '\'': 530 { 531 string temp; 532 tokenizer.get_token(type,token); 533 if (type == tok::CHAR && token[0] == '\\') 534 { 535 temp += '\\'; 536 tokenizer.get_token(type,token); 537 } 538 temp += token; 539 buffer_token(SINGLE_QUOTED_TEXT,temp); 540 541 // The next character should be a ' so take it out and put it in 542 // the buffer. 543 tokenizer.get_token(type,token); 544 buffer_token(OTHER,token); 545 546 type = OTHER; 547 token = '\''; 548 } break; 549 550 case '/': 551 { 552 // look ahead to see if this is the start of a comment 553 if (tokenizer.peek_type() == tok::CHAR) 554 { 555 if (tokenizer.peek_token()[0] == '/') 556 { 557 tokenizer.get_token(type,token); 558 // this is the start of a line comment 559 token = "//"; 560 string temp; 561 tokenizer.get_token(type,temp); 562 while (type != tok::END_OF_FILE) 563 { 564 // if this is the end of the comment 565 if (type == tok::END_OF_LINE && 566 token[token.size()-1] != '\\' ) 567 { 568 token += '\n'; 569 break; 570 } 571 else 572 { 573 token += temp; 574 } 575 tokenizer.get_token(type,temp); 576 } 577 type = COMMENT; 578 579 } 580 else if (tokenizer.peek_token()[0] == '*') 581 { 582 tokenizer.get_token(type,token); 583 // this is the start of a block comment 584 token = "/*"; 585 string temp; 586 tokenizer.get_token(type,temp); 587 while (type != tok::END_OF_FILE) 588 { 589 // if this is the end of the comment 590 if (type == tok::CHAR && temp[0] == '/' && 591 token[token.size()-1] == '*') 592 { 593 token += '/'; 594 break; 595 } 596 else 597 { 598 token += temp; 599 } 600 tokenizer.get_token(type,temp); 601 } 602 type = COMMENT; 603 } 604 } 605 } break; 606 607 default: 608 break; 609 } // switch (token[0]) 610 } // switch (type) 611 } 612 else 613 { 614 // if we get this far it means we have peeked so we should 615 // return the peek data. 616 type = next_type; 617 token = next_token; 618 have_peeked = false; 619 } 620 } 621 622 // ---------------------------------------------------------------------------------------- 623 624 template < 625 typename tok, 626 typename queue, 627 typename set 628 > 629 int cpp_tokenizer_kernel_1<tok,queue,set>:: peek_type()630 peek_type ( 631 ) const 632 { 633 const_cast<cpp_tokenizer_kernel_1<tok,queue,set>*>(this)->get_token(next_type,next_token); 634 have_peeked = true; 635 return next_type; 636 } 637 638 // ---------------------------------------------------------------------------------------- 639 640 template < 641 typename tok, 642 typename queue, 643 typename set 644 > 645 const std::string& cpp_tokenizer_kernel_1<tok,queue,set>:: peek_token()646 peek_token ( 647 ) const 648 { 649 const_cast<cpp_tokenizer_kernel_1<tok,queue,set>*>(this)->get_token(next_type,next_token); 650 have_peeked = true; 651 return next_token; 652 } 653 654 // ---------------------------------------------------------------------------------------- 655 656 template < 657 typename tok, 658 typename queue, 659 typename set 660 > 661 void cpp_tokenizer_kernel_1<tok,queue,set>:: swap(cpp_tokenizer_kernel_1 & item)662 swap ( 663 cpp_tokenizer_kernel_1& item 664 ) 665 { 666 tokenizer.swap(item.tokenizer); 667 buffer.swap(item.buffer); 668 } 669 670 // ---------------------------------------------------------------------------------------- 671 672 } 673 674 #endif // DLIB_CPP_TOKENIZER_KERNEl_1_ 675 676