1 /******************************************************************************\ 2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved. * 3 * * 4 * Redistribution and use in source and binary forms, with or without * 5 * modification, are permitted provided that the following conditions are met: * 6 * * 7 * (1) Redistributions of source code must retain the above copyright notice, * 8 * this list of conditions and the following disclaimer. * 9 * * 10 * (2) Redistributions in binary form must reproduce the above copyright * 11 * notice, this list of conditions and the following disclaimer in the * 12 * documentation and/or other materials provided with the distribution. * 13 * * 14 * (3) The name of the author may not be used to endorse or promote products * 15 * derived from this software without specific prior written permission. * 16 * * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * 18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * 19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO * 20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * 22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * 23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * 24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * 25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * 26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * 27 \******************************************************************************/ 28 29 /** 30 @file input.h 31 @brief RE/flex input character sequence class 32 @author Robert van Engelen - engelen@genivia.com 33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved. 
34 @copyright (c) BSD-3 License - see LICENSE.txt 35 */ 36 37 #ifndef REFLEX_INPUT_H 38 #define REFLEX_INPUT_H 39 40 #include <reflex/utf8.h> 41 #include <cstdio> 42 #include <cstring> 43 #include <iostream> 44 #include <string> 45 #include <stdint.h> 46 47 namespace reflex { 48 49 extern const unsigned short codepages[][256]; 50 51 /// Input character sequence class for unified access to sources of input text. 52 /** 53 Description 54 ----------- 55 56 The Input class unifies access to a source of input text that constitutes a 57 sequence of characters: 58 59 - An Input object is instantiated and (re)assigned a (new) source input: either 60 a `char*` string, a `wchar_t*` wide string, a `std::string`, a 61 `std::wstring`, a `FILE*` descriptor, or a `std::istream` object. 62 63 - When assigned a wide string source as input, the wide character content is 64 automatically converted to an UTF-8 character sequence when reading with 65 get(). Wide strings are UCS-2/UCS-4 and may contain UTF-16 surrogate pairs. 66 67 - When assigned a `FILE*` source as input, the file is checked for the presence 68 of a UTF-8 or a UTF-16 BOM (Byte Order Mark). A UTF-8 BOM is ignored and will 69 not appear on the input character stream (and size is adjusted by 3 bytes). A 70 UTF-16 BOM is intepreted, resulting in the conversion of the file content 71 automatically to an UTF-8 character sequence when reading the file with 72 get(). Also, size() gives the content size in the number of UTF-8 bytes. 73 74 - An input object can be reassigned a new source of input for reading at any 75 time. 76 77 - An input object obeys move semantics. That is, after assigning an input 78 object to another, the former can no longer be used to read input. This 79 prevents adding the overhead and complexity of file and stream duplication. 

- `size_t Input::get(char *buf, size_t len);` reads source input and fills `buf`
  with up to `len` bytes, returning the number of bytes read or zero when a
  stream or file is bad or when EOF is reached.

- `size_t Input::size();` returns the number of ASCII/UTF-8 bytes available
  to read from the source input or zero (zero is also returned when the size is
  not determinable).  Use this function only before reading input with get().
  Wide character strings and UTF-16 `FILE*` content are counted as the total
  number of UTF-8 bytes that will be produced by get().  The size of a
  `std::istream` may not be determinable, in which case zero is returned.

- `bool Input::good();` returns true if the input is readable and has no
  EOF or error state.  Returns false on EOF or if an error condition is
  present.

- `bool Input::eof();` returns true if the input reached EOF.  Note that
  good() == ! eof() for string source input only, since files and streams may
  have error conditions that prevent reading.  That is, for files and streams
  eof() implies good() == false, but not vice versa.  Thus, an error is
  diagnosed when the condition good() == false && eof() == false holds.  Note
  that get(buf, len) == 0 && len > 0 implies good() == false.

- `class Input::streambuf(const Input&)` creates a `std::streambuf` for the
  given `Input` object, from which a `std::istream` can be constructed.

- Compile with `WITH_UTF8_UNRESTRICTED` to enable unrestricted UTF-8 beyond
  U+10FFFF, permitting lossless UTF-8 encoding of 32 bit words without limits.

Example
-------

The following example shows how to use the Input class to read a character
sequence in blocks from a `std::ifstream` to copy to stdout:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
    std::ifstream ifs;
    ifs.open("input.h", std::ifstream::in);
    reflex::Input input(ifs);
    char buf[1024];
    size_t len;
    while ((len = input.get(buf, sizeof(buf))) > 0)
      fwrite(buf, 1, len, stdout);
    if (!input.eof())
      std::cerr << "An IO error occurred" << std::endl;
    ifs.close();
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Example
-------

The following example shows how to use the Input class to store the entire
content of a file in a temporary buffer:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
    reflex::Input input(fopen("input.h", "r"));
    if (input.file() == NULL)
      abort();
    size_t len = input.size(); // file size (minus any leading UTF BOM)
    char *buf = new char[len];
    input.get(buf, len);
    if (!input.eof())
      std::cerr << "An IO error occurred" << std::endl;
    fwrite(buf, 1, len, stdout);
    delete[] buf;
    fclose(input.file());
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In the above, files with UTF-16 and UTF-32 content are converted to UTF-8 by
`get(buf, len)`.  Also, `size()` returns the total number of UTF-8 bytes to
copy in the buffer by `get(buf, len)`.  The size is computed depending on the
UTF-8/16/32 file content encoding, i.e. given a leading UTF BOM in the file.
This means that UTF-16/32 files are read twice, first internally with `size()`
and then again with `get(buf, len)`.
154 155 Example 156 ------- 157 158 The following example shows how to use the Input class to read a character 159 sequence in blocks from a file: 160 161 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp} 162 reflex::Input input(fopen("input.h", "r")); 163 char buf[1024]; 164 size_t len; 165 while ((len = input.get(buf, sizeof(buf))) > 0) 166 fwrite(buf, 1, len, stdout); 167 fclose(input); 168 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 169 170 Example 171 ------- 172 173 The following example shows how to use the Input class to echo characters one 174 by one from stdin, e.g. reading input from a tty: 175 176 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp} 177 reflex::Input input(stdin); 178 char c; 179 while (input.get(&c, 1)) 180 fputc(c, stdout); 181 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 182 183 Or if you prefer to use an int character and check for EOF explicitly: 184 185 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp} 186 reflex::Input input(stdin); 187 int c; 188 while ((c = input.get()) != EOF) 189 fputc(c, stdout); 190 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 191 192 Example 193 ------- 194 195 The following example shows how to use the Input class to read a character 196 sequence in blocks from a wide character string, converting it to UTF-8 to copy 197 to stdout: 198 199 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp} 200 reflex::Input input(L"Copyright ©"); // © is unicode U+00A9 and UTF-8 C2 A9 201 char buf[8]; 202 size_t len; 203 while ((len = input.get(buf, sizeof(buf))) > 0) 204 fwrite(buf, 1, len, stdout); 205 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 206 207 Example 208 ------- 209 210 The following example shows how to use the Input class 
to convert a wide
character string to UTF-8:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
    reflex::Input input(L"Copyright ©"); // © is unicode U+00A9 and UTF-8 C2 A9
    size_t len = input.size(); // size of UTF-8 string
    char *buf = new char[len + 1];
    input.get(buf, len);
    buf[len] = '\0'; // make \0-terminated
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Example
-------

The following example shows how to switch source inputs while reading input
byte by byte (use a buffer as shown in other examples to improve efficiency):

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
    reflex::Input input = "Hello";
    std::string message;
    char c;
    while (input.get(&c, 1))
      message.push_back(c);
    input = L" world! To ∞ and beyond."; // switch input to a wide string
    while (input.get(&c, 1))
      message.push_back(c);
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Example
-------

The following example shows how to use reflex::Input::streambuf to create an
unbuffered std::istream:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
    reflex::Input input(fopen("legacy.txt", "r"), reflex::Input::file_encoding::ebcdic);
    if (input.file() == NULL)
      abort();
    reflex::Input::streambuf streambuf(input);
    std::istream stream(&streambuf);
    std::string data;
    int c;
    while ((c = stream.get()) != EOF)
      data.push_back(static_cast<char>(c));
    fclose(input.file());
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

With reflex::BufferedInput::streambuf to create a buffered std::istream:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
    reflex::Input input(fopen("legacy.txt", "r"), 
reflex::Input::file_encoding::ebcdic); 261 if (input.file() == NULL) 262 abort(); 263 reflex::BufferedInput::streambuf streambuf(input); 264 std::istream stream(&streambuf); 265 std::string data; 266 int c; 267 while ((c = stream.get()) != EOF) 268 data.append(c); 269 fclose(input.file()); 270 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 271 */ 272 class Input { 273 public: 274 /// Common file_encoding constants type. 275 typedef unsigned short file_encoding_type; 276 /// Common file_encoding constants. 277 struct file_encoding { 278 static const file_encoding_type plain = 0; ///< plain octets: 7-bit ASCII, 8-bit binary or UTF-8 without BOM detected 279 static const file_encoding_type utf8 = 1; ///< UTF-8 with BOM detected 280 static const file_encoding_type utf16be = 2; ///< UTF-16 big endian 281 static const file_encoding_type utf16le = 3; ///< UTF-16 little endian 282 static const file_encoding_type utf32be = 4; ///< UTF-32 big endian 283 static const file_encoding_type utf32le = 5; ///< UTF-32 little endian 284 static const file_encoding_type latin = 6; ///< ISO-8859-1, Latin-1 285 static const file_encoding_type cp437 = 7; ///< DOS CP 437 286 static const file_encoding_type cp850 = 8; ///< DOS CP 850 287 static const file_encoding_type cp858 = 9; ///< DOS CP 858 288 static const file_encoding_type ebcdic = 10; ///< EBCDIC 289 static const file_encoding_type cp1250 = 11; ///< Windows CP 1250 290 static const file_encoding_type cp1251 = 12; ///< Windows CP 1251 291 static const file_encoding_type cp1252 = 13; ///< Windows CP 1252 292 static const file_encoding_type cp1253 = 14; ///< Windows CP 1253 293 static const file_encoding_type cp1254 = 15; ///< Windows CP 1254 294 static const file_encoding_type cp1255 = 16; ///< Windows CP 1255 295 static const file_encoding_type cp1256 = 17; ///< Windows CP 1256 296 static const file_encoding_type cp1257 = 18; ///< Windows CP 1257 297 static const file_encoding_type cp1258 = 19; ///< 
Windows CP 1258 298 static const file_encoding_type iso8859_2 = 20; ///< ISO-8859-2, Latin-2 299 static const file_encoding_type iso8859_3 = 21; ///< ISO-8859-3, Latin-3 300 static const file_encoding_type iso8859_4 = 22; ///< ISO-8859-4, Latin-4 301 static const file_encoding_type iso8859_5 = 23; ///< ISO-8859-5, Cyrillic 302 static const file_encoding_type iso8859_6 = 24; ///< ISO-8859-6, Arabic 303 static const file_encoding_type iso8859_7 = 25; ///< ISO-8859-7, Greek 304 static const file_encoding_type iso8859_8 = 26; ///< ISO-8859-8, Hebrew 305 static const file_encoding_type iso8859_9 = 27; ///< ISO-8859-9, Latin-5 306 static const file_encoding_type iso8859_10 = 28; ///< ISO-8859-10, Latin-6 307 static const file_encoding_type iso8859_11 = 29; ///< ISO-8859-11, Thai 308 static const file_encoding_type iso8859_13 = 30; ///< ISO-8859-13, Latin-7 309 static const file_encoding_type iso8859_14 = 31; ///< ISO-8859-14, Latin-8 310 static const file_encoding_type iso8859_15 = 32; ///< ISO-8859-15, Latin-9 311 static const file_encoding_type iso8859_16 = 33; ///< ISO-8859-16 312 static const file_encoding_type macroman = 34; ///< Macintosh Roman with CR to LF translation 313 static const file_encoding_type koi8_r = 35; ///< KOI8-R 314 static const file_encoding_type koi8_u = 36; ///< KOI8-U 315 static const file_encoding_type koi8_ru = 37; ///< KOI8-RU 316 static const file_encoding_type custom = 38; ///< custom code page 317 }; 318 /// FILE* handler functor base class to handle FILE* errors and non-blocking FILE* reads 319 struct Handler { virtual int operator()() = 0; }; 320 /// Stream buffer for reflex::Input, derived from std::streambuf. 321 class streambuf; 322 /// Stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf. 323 class dos_streambuf; 324 /// Construct empty input character sequence. 
Input()325 Input() 326 : 327 cstring_(NULL), 328 wstring_(NULL), 329 file_(NULL), 330 istream_(NULL), 331 size_(0) 332 { 333 init(); 334 } 335 /// Copy constructor (with intended "move semantics" as internal state is shared, should not rely on using the rhs after copying). Input(const Input & input)336 Input(const Input& input) ///< an Input object to share state with (undefined behavior results from using both objects) 337 : 338 cstring_(input.cstring_), 339 wstring_(input.wstring_), 340 file_(input.file_), 341 istream_(input.istream_), 342 size_(input.size_), 343 uidx_(input.uidx_), 344 ulen_(input.ulen_), 345 utfx_(input.utfx_), 346 page_(input.page_), 347 handler_(input.handler_) 348 { 349 std::memcpy(utf8_, input.utf8_, sizeof(utf8_)); 350 } 351 /// Construct input character sequence from a char* string Input(const char * cstring,size_t size)352 Input( 353 const char *cstring, ///< char string 354 size_t size) ///< length of the string 355 : 356 cstring_(cstring), 357 wstring_(NULL), 358 file_(NULL), 359 istream_(NULL), 360 size_(size) 361 { 362 init(); 363 } 364 /// Construct input character sequence from a NUL-terminated string. Input(const char * cstring)365 Input(const char *cstring) ///< NUL-terminated char* string 366 : 367 cstring_(cstring), 368 wstring_(NULL), 369 file_(NULL), 370 istream_(NULL), 371 size_(cstring != NULL ? std::strlen(cstring) : 0) 372 { 373 init(); 374 } 375 /// Construct input character sequence from a std::string. Input(const std::string & string)376 Input(const std::string& string) ///< input string 377 : 378 cstring_(string.c_str()), 379 wstring_(NULL), 380 file_(NULL), 381 istream_(NULL), 382 size_(string.size()) 383 { 384 init(); 385 } 386 /// Construct input character sequence from a pointer to a std::string. Input(const std::string * string)387 Input(const std::string *string) ///< input string 388 : 389 cstring_(string != NULL ? 
string->c_str() : NULL), 390 wstring_(NULL), 391 file_(NULL), 392 istream_(NULL), 393 size_(string != NULL ? string->size() : 0) 394 { 395 init(); 396 } 397 /// Construct input character sequence from a NUL-terminated wide character string. Input(const wchar_t * wstring)398 Input(const wchar_t *wstring) ///< NUL-terminated wchar_t* input string 399 : 400 cstring_(NULL), 401 wstring_(wstring), 402 file_(NULL), 403 istream_(NULL), 404 size_(0) 405 { 406 init(); 407 } 408 /// Construct input character sequence from a std::wstring (may contain UTF-16 surrogate pairs). Input(const std::wstring & wstring)409 Input(const std::wstring& wstring) ///< input wide string 410 : 411 cstring_(NULL), 412 wstring_(wstring.c_str()), 413 file_(NULL), 414 istream_(NULL), 415 size_(0) 416 { 417 init(); 418 } 419 /// Construct input character sequence from a pointer to a std::wstring (may contain UTF-16 surrogate pairs). Input(const std::wstring * wstring)420 Input(const std::wstring *wstring) ///< input wide string 421 : 422 cstring_(NULL), 423 wstring_(wstring != NULL ? wstring->c_str() : NULL), 424 file_(NULL), 425 istream_(NULL), 426 size_(0) 427 { 428 init(); 429 } 430 /// Construct input character sequence from an open FILE* file descriptor, supports UTF-8 conversion from UTF-16 and UTF-32. 
Input(FILE * file)431 Input(FILE *file) ///< input file 432 : 433 cstring_(NULL), 434 wstring_(NULL), 435 file_(file), 436 istream_(NULL), 437 size_(0) 438 { 439 init(); 440 } 441 /// Construct input character sequence from an open FILE* file descriptor, using the specified file encoding 442 Input( 443 FILE *file, ///< input file 444 file_encoding_type enc, ///< file_encoding (when UTF BOM is not present) 445 const unsigned short *page = NULL) ///< code page for file_encoding::custom 446 : cstring_(NULL)447 cstring_(NULL), 448 wstring_(NULL), 449 file_(file), 450 istream_(NULL), 451 size_(0) 452 { 453 init(); 454 if (file_encoding() == file_encoding::plain) 455 file_encoding(enc, page); 456 } 457 /// Construct input character sequence from a std::istream. Input(std::istream & istream)458 Input(std::istream& istream) ///< input stream 459 : 460 cstring_(NULL), 461 wstring_(NULL), 462 file_(NULL), 463 istream_(&istream), 464 size_(0) 465 { 466 init(); 467 } 468 /// Construct input character sequence from a pointer to a std::istream. Input(std::istream * istream)469 Input(std::istream *istream) ///< input stream 470 : 471 cstring_(NULL), 472 wstring_(NULL), 473 file_(NULL), 474 istream_(istream), 475 size_(0) 476 { 477 init(); 478 } 479 /// Copy assignment operator. 480 Input& operator=(const Input& input) 481 { 482 cstring_ = input.cstring_; 483 wstring_ = input.wstring_; 484 file_ = input.file_; 485 istream_ = input.istream_; 486 size_ = input.size_; 487 uidx_ = input.uidx_; 488 ulen_ = input.ulen_; 489 utfx_ = input.utfx_; 490 page_ = input.page_; 491 handler_ = input.handler_; 492 std::memcpy(utf8_, input.utf8_, sizeof(utf8_)); 493 return *this; 494 } 495 /// Cast this Input object to a string, returns NULL when this Input is not a string. 
496 operator const char *() const 497 /// @returns remaining unbuffered part of a NUL-terminated string or NULL 498 { 499 return cstring_; 500 } 501 /// Cast this Input object to a wide character string, returns NULL when this Input is not a wide string. 502 operator const wchar_t *() const 503 /// @returns remaining unbuffered part of the NUL-terminated wide character string or NULL 504 { 505 return wstring_; 506 } 507 /// Cast this Input object to a file descriptor FILE*, returns NULL when this Input is not a FILE*. 508 operator FILE *() const 509 /// @returns pointer to current file descriptor or NULL 510 { 511 return file_; 512 } 513 /// Cast this Input object to a std::istream*, returns NULL when this Input is not a std::istream. 514 operator std::istream *() const 515 /// @returns pointer to current std::istream or NULL 516 { 517 return istream_; 518 } 519 // Cast this Input object to bool, same as checking good(). 520 operator bool() const 521 /// @returns true if a non-empty sequence of characters is available to get 522 { 523 return good(); 524 } 525 /// Get the remaining string of this Input object, returns NULL when this Input is not a string. cstring()526 const char *cstring() const 527 /// @returns remaining unbuffered part of the NUL-terminated string or NULL 528 { 529 return cstring_; 530 } 531 /// Get the remaining wide character string of this Input object, returns NULL when this Input is not a wide string. wstring()532 const wchar_t *wstring() const 533 /// @returns remaining unbuffered part of the NUL-terminated wide character string or NULL 534 { 535 return wstring_; 536 } 537 /// Get the FILE* of this Input object, returns NULL when this Input is not a FILE*. file()538 FILE *file() const 539 /// @returns pointer to current file descriptor or NULL 540 { 541 return file_; 542 } 543 /// Get the std::istream of this Input object, returns NULL when this Input is not a std::istream. 
istream()544 std::istream *istream() const 545 /// @returns pointer to current std::istream or NULL 546 { 547 return istream_; 548 } 549 /// Get the size of the input character sequence in number of ASCII/UTF-8 bytes (zero if size is not determinable from a `FILE*` or `std::istream` source). size()550 size_t size() 551 /// @returns the nonzero number of ASCII/UTF-8 bytes available to read, or zero when source is empty or if size is not determinable e.g. when reading from standard input 552 { 553 if (cstring_) 554 return size_; 555 if (wstring_) 556 { 557 if (size_ == 0) 558 wstring_size(); 559 } 560 else if (file_) 561 { 562 if (size_ == 0) 563 file_size(); 564 } 565 else if (istream_) 566 { 567 if (size_ == 0) 568 istream_size(); 569 } 570 return size_; 571 } 572 /// Check if this Input object was assigned a character sequence. assigned()573 bool assigned() const 574 /// @returns true if this Input object was assigned (not default constructed or cleared) 575 { 576 return cstring_ || wstring_ || file_ || istream_; 577 } 578 /// Clear this Input by unassigning it. clear()579 void clear() 580 { 581 cstring_ = NULL; 582 wstring_ = NULL; 583 file_ = NULL; 584 istream_ = NULL; 585 size_ = 0; 586 } 587 /// Check if input is available. good()588 bool good() const 589 /// @returns true if a non-empty sequence of characters is available to get 590 { 591 if (cstring_) 592 return size_ > 0; 593 if (wstring_) 594 return *wstring_ != L'\0'; 595 if (file_) 596 return !::feof(file_) && !::ferror(file_); 597 if (istream_) 598 return istream_->good(); 599 return false; 600 } 601 /// Check if input reached EOF. 
eof()602 bool eof() const 603 /// @returns true if input is at EOF and no characters are available 604 { 605 if (cstring_) 606 return size_ == 0; 607 if (wstring_) 608 return *wstring_ == L'\0'; 609 if (file_) 610 return ::feof(file_) != 0; 611 if (istream_) 612 return istream_->eof(); 613 return true; 614 } 615 /// Get a single character (unsigned char 0..255) or EOF (-1) when end-of-input is reached. get()616 int get() 617 { 618 char c; 619 if (get(&c, 1)) 620 return static_cast<unsigned char>(c); 621 return EOF; 622 } 623 /// Copy character sequence data into buffer. get(char * s,size_t n)624 size_t get( 625 char *s, ///< points to the string buffer to fill with input 626 size_t n) ///< size of buffer pointed to by s 627 /// @returns the nonzero number of (less or equal to n) 8-bit characters added to buffer s from the current input, or zero when EOF 628 { 629 if (cstring_) 630 { 631 size_t k = size_; 632 if (k > n) 633 k = n; 634 std::memcpy(s, cstring_, k); 635 cstring_ += k; 636 size_ -= k; 637 return k; 638 } 639 if (wstring_) 640 { 641 size_t k = n; 642 if (ulen_ > 0) 643 { 644 size_t l = ulen_; 645 if (l > k) 646 l = k; 647 std::memcpy(s, utf8_ + uidx_, l); 648 k -= l; 649 if (k == 0) 650 { 651 uidx_ += static_cast<unsigned short>(l); 652 ulen_ -= static_cast<unsigned short>(l); 653 if (size_ >= n) 654 size_ -= n; 655 return n; 656 } 657 s += l; 658 ulen_ = 0; 659 } 660 wchar_t c; 661 while ((c = *wstring_) != L'\0' && k > 0) 662 { 663 if (c < 0x80) 664 { 665 *s++ = static_cast<char>(c); 666 --k; 667 } 668 else 669 { 670 size_t l; 671 if (c >= 0xD800 && c < 0xE000) 672 { 673 // UTF-16 surrogate pair 674 if (c < 0xDC00 && (wstring_[1] & 0xFC00) == 0xDC00) 675 l = utf8(0x010000 - 0xDC00 + ((c - 0xD800) << 10) + *++wstring_, utf8_); 676 else 677 l = utf8(REFLEX_NONCHAR, utf8_); 678 } 679 else 680 { 681 l = utf8(c, utf8_); 682 } 683 if (k < l) 684 { 685 uidx_ = static_cast<unsigned short>(k); 686 ulen_ = static_cast<unsigned short>(l - k); 687 std::memcpy(s, 
utf8_, k); 688 s += k; 689 k = 0; 690 } 691 else 692 { 693 std::memcpy(s, utf8_, l); 694 s += l; 695 k -= l; 696 } 697 } 698 ++wstring_; 699 } 700 if (size_ >= n - k) 701 size_ -= n - k; 702 return n - k; 703 } 704 if (file_) 705 { 706 while (true) 707 { 708 size_t k = file_get(s, n); 709 if (k > 0 || feof(file_) || handler_ == NULL || (*handler_)() == 0) 710 return k; 711 } 712 } 713 if (istream_) 714 { 715 size_t k = static_cast<size_t>(n == 1 ? istream_->get(s[0]).gcount() : istream_->read(s, static_cast<std::streamsize>(n)) ? n : istream_->gcount()); 716 if (size_ >= k) 717 size_ -= k; 718 return k; 719 } 720 return 0; 721 } 722 /// Set encoding for `FILE*` input. 723 void file_encoding( 724 file_encoding_type enc, ///< file_encoding 725 const unsigned short *page = NULL) ///< custom code page for file_encoding::custom 726 ; 727 /// Get encoding of the current `FILE*` input. file_encoding()728 file_encoding_type file_encoding() const 729 /// @returns current file_encoding constant 730 { 731 return utfx_; 732 } 733 /// Initialize the state after (re)setting the input source, auto-detects UTF BOM in FILE* input if the file size is known. init()734 void init() 735 { 736 std::memset(utf8_, 0, sizeof(utf8_)); 737 uidx_ = 0; 738 ulen_ = 0; 739 utfx_ = 0; 740 page_ = NULL; 741 handler_ = NULL; 742 if (file_ != NULL) 743 file_init(); 744 } 745 /// Called by init() for a FILE*. 746 void file_init(); 747 /// Called by size() for a wstring. 748 void wstring_size(); 749 /// Called by size() for a FILE*. 750 void file_size(); 751 /// Called by size() for a std::istream. 752 void istream_size(); 753 /// Implements get() on a FILE*. 
754 size_t file_get( 755 char *s, ///< points to the string buffer to fill with input 756 size_t n) ///< size of buffer pointed to by s 757 ; 758 /// Set FILE* handler set_handler(Handler * handler)759 void set_handler(Handler *handler) 760 { 761 handler_ = handler; 762 } 763 protected: 764 const char *cstring_; ///< char string input (when non-null) of length reflex::Input::size_ 765 const wchar_t *wstring_; ///< NUL-terminated wide string input (when non-null) 766 FILE *file_; ///< FILE* input (when non-null) 767 std::istream *istream_; ///< stream input (when non-null) 768 size_t size_; ///< size of the remaining input in bytes (size_ == 0 may indicate size is not set) 769 char utf8_[8]; ///< UTF-8 normalization buffer, >=8 bytes 770 unsigned short uidx_; ///< index in utf8_[] 771 unsigned short ulen_; ///< length of data (remaining after uidx_) in utf8_[] or 0 if no data 772 file_encoding_type utfx_; ///< file_encoding 773 const unsigned short *page_; ///< custom code page 774 Handler *handler_; ///< to handle FILE* errors and non-blocking FILE* reads 775 }; 776 777 /// Stream buffer for reflex::Input, derived from std::streambuf. 778 class Input::streambuf : public std::streambuf { 779 public: streambuf(const reflex::Input & input)780 streambuf(const reflex::Input& input) 781 : 782 input_(input), 783 ch_(input_.get()) 784 { } 785 protected: underflow()786 virtual int_type underflow() 787 { 788 return ch_ == EOF ? 
traits_type::eof() : traits_type::to_int_type(ch_); 789 } uflow()790 virtual int_type uflow() 791 { 792 if (ch_ == EOF) 793 return traits_type::eof(); 794 int c = ch_; 795 ch_ = input_.get(); 796 return traits_type::to_int_type(c); 797 } xsgetn(char * s,std::streamsize n)798 virtual std::streamsize xsgetn(char *s, std::streamsize n) 799 { 800 if (n <= 0 || ch_ == EOF) 801 return 0; 802 *s++ = ch_; 803 std::streamsize k = static_cast<std::streamsize>(input_.get(s, static_cast<size_t>(n - 1))); 804 if (k < n - 1) 805 { 806 ch_ = EOF; 807 return k + 1; 808 } 809 ch_ = input_.get(); 810 return n; 811 } showmanyc()812 virtual std::streamsize showmanyc() 813 { 814 return ch_ == EOF ? -1 : input_.size() + 1; 815 } 816 Input input_; 817 int ch_; 818 }; 819 820 /// Stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf. 821 class Input::dos_streambuf : public std::streambuf { 822 public: dos_streambuf(const reflex::Input & input)823 dos_streambuf(const reflex::Input& input) 824 : 825 input_(input), 826 ch1_(input_.get()), 827 ch2_(EOF) 828 { } 829 protected: underflow()830 virtual int_type underflow() 831 { 832 if (ch1_ == EOF) 833 return traits_type::eof(); 834 if (ch1_ == '\r') 835 { 836 if (ch2_ == EOF) 837 ch2_ = input_.get(); 838 if (ch2_ == '\n') 839 { 840 ch1_ = ch2_; 841 ch2_ = EOF; 842 } 843 } 844 return traits_type::to_int_type(ch1_); 845 } uflow()846 virtual int_type uflow() 847 { 848 int c = get(); 849 return c == EOF ? traits_type::eof() : traits_type::to_int_type(c); 850 } xsgetn(char * s,std::streamsize n)851 virtual std::streamsize xsgetn(char *s, std::streamsize n) 852 { 853 if (n <= 0 || ch1_ == EOF) 854 return 0; 855 std::streamsize k = n; 856 int c; 857 while (k > 0 && (c = get()) != EOF) 858 { 859 *s++ = c; 860 --k; 861 } 862 return n - k; 863 } showmanyc()864 virtual std::streamsize showmanyc() 865 { 866 return ch1_ == EOF ? 
-1 : 0; 867 } get()868 int get() 869 { 870 if (ch1_ == EOF) 871 return EOF; 872 int c = ch1_; 873 if (c == '\r') 874 { 875 if (ch2_ == EOF) 876 ch2_ = input_.get(); 877 if (ch2_ == '\n') 878 { 879 c = ch2_; 880 ch1_ = input_.get(); 881 } 882 else 883 { 884 ch1_ = ch2_; 885 } 886 ch2_ = EOF; 887 } 888 else 889 { 890 ch1_ = input_.get(); 891 } 892 return c; 893 } 894 Input input_; 895 int ch1_; 896 int ch2_; 897 }; 898 899 /// Buffered input. 900 class BufferedInput : public Input { 901 public: 902 /// Buffer size. 903 static const size_t SIZE = 16384; 904 /// Buffered stream buffer for reflex::Input, derived from std::streambuf. 905 class streambuf; 906 /// Buffered stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf. 907 class dos_streambuf; 908 /// Copy constructor (with intended "move semantics" as internal state is shared, should not rely on using the rhs after copying). 909 /// Construct empty buffered input. BufferedInput()910 BufferedInput() 911 : 912 Input(), 913 len_(0), 914 pos_(0) 915 { } 916 /// Copy constructor. BufferedInput(const BufferedInput & input)917 BufferedInput(const BufferedInput& input) 918 : 919 Input(input), 920 len_(input.len_), 921 pos_(input.pos_) 922 { 923 std::memcpy(buf_, input.buf_, len_); 924 } 925 /// Construct buffered input from unbuffered input. BufferedInput(const Input & input)926 BufferedInput(const Input& input) 927 : 928 Input(input) 929 { 930 len_ = Input::get(buf_, SIZE); 931 pos_ = 0; 932 } 933 /// Assignment operator from unbuffered input. 934 BufferedInput& operator=(const Input& input) 935 { 936 Input::operator=(input); 937 len_ = Input::get(buf_, SIZE); 938 pos_ = 0; 939 return *this; 940 } 941 /// Copy assignment operator. 
942 BufferedInput& operator=(const BufferedInput& input) 943 { 944 Input::operator=(input); 945 len_ = input.len_; 946 pos_ = input.pos_; 947 std::memcpy(buf_, input.buf_, len_); 948 return *this; 949 } 950 /// Construct buffered input character sequence from an open FILE* file descriptor, using the specified file encoding 951 BufferedInput( 952 FILE *file, ///< input file 953 file_encoding_type enc, ///< file_encoding (when UTF BOM is not present) 954 const unsigned short *page = NULL) ///< code page for file_encoding::custom 955 : Input(file,enc,page)956 Input(file, enc, page) 957 { 958 len_ = Input::get(buf_, SIZE); 959 pos_ = 0; 960 } 961 // Cast this Input object to bool, same as checking good(). 962 operator bool() 963 /// @returns true if a non-empty sequence of characters is available to get 964 { 965 return good(); 966 } 967 /// Get the size of the input character sequence in number of ASCII/UTF-8 bytes (zero if size is not determinable from a `FILE*` or `std::istream` source). size()968 size_t size() 969 /// @returns the nonzero number of ASCII/UTF-8 bytes available to read, or zero when source is empty or if size is not determinable e.g. when reading from standard input 970 { 971 return len_ - pos_ + Input::size(); 972 } 973 /// Check if input is available. good()974 bool good() 975 /// @returns true if a non-empty sequence of characters is available to get 976 { 977 return pos_ < len_ || Input::good(); 978 } 979 /// Check if input reached EOF. eof()980 bool eof() 981 /// @returns true if input is at EOF and no characters are available 982 { 983 return pos_ >= len_ && Input::eof(); 984 } 985 /// Peek a single character (unsigned char 0..255) or EOF (-1) when end-of-input is reached. 
peek()986 int peek() 987 { 988 while (true) 989 { 990 if (len_ == 0) 991 return EOF; 992 if (pos_ < len_) 993 return static_cast<unsigned char>(buf_[pos_]); 994 len_ = Input::get(buf_, SIZE); 995 pos_ = 0; 996 } 997 } 998 /// Get a single character (unsigned char 0..255) or EOF (-1) when end-of-input is reached. get()999 int get() 1000 { 1001 while (true) 1002 { 1003 if (len_ == 0) 1004 return EOF; 1005 if (pos_ < len_) 1006 return static_cast<unsigned char>(buf_[pos_++]); 1007 len_ = Input::get(buf_, SIZE); 1008 pos_ = 0; 1009 } 1010 } 1011 /// Copy character sequence data into buffer. get(char * s,size_t n)1012 size_t get( 1013 char *s, ///< points to the string buffer to fill with input 1014 size_t n) ///< size of buffer pointed to by s 1015 { 1016 size_t k = n; 1017 while (k > 0) 1018 { 1019 if (pos_ < len_) 1020 { 1021 *s++ = buf_[pos_++]; 1022 --k; 1023 } 1024 else if (len_ == 0) 1025 { 1026 break; 1027 } 1028 else 1029 { 1030 len_ = Input::get(buf_, SIZE); 1031 pos_ = 0; 1032 } 1033 } 1034 return n - k; 1035 } 1036 protected: 1037 char buf_[SIZE]; 1038 size_t len_; 1039 size_t pos_; 1040 }; 1041 1042 /// Buffered stream buffer for reflex::Input, derived from std::streambuf. 1043 class BufferedInput::streambuf : public std::streambuf { 1044 public: streambuf(const reflex::BufferedInput & input)1045 streambuf(const reflex::BufferedInput& input) 1046 : 1047 input_(input) 1048 { } streambuf(const reflex::Input & input)1049 streambuf(const reflex::Input& input) 1050 : 1051 input_(input) 1052 { } 1053 protected: underflow()1054 virtual int_type underflow() 1055 { 1056 int c = input_.peek(); 1057 return c == EOF ? traits_type::eof() : traits_type::to_int_type(c); 1058 } uflow()1059 virtual int_type uflow() 1060 { 1061 int c = input_.get(); 1062 return c == EOF ? 
traits_type::eof() : traits_type::to_int_type(c); 1063 } xsgetn(char * s,std::streamsize n)1064 virtual std::streamsize xsgetn(char *s, std::streamsize n) 1065 { 1066 return static_cast<std::streamsize>(input_.get(s, static_cast<size_t>(n))); 1067 } showmanyc()1068 virtual std::streamsize showmanyc() 1069 { 1070 return input_.eof() ? -1 : input_.size(); 1071 } 1072 BufferedInput input_; 1073 }; 1074 1075 /// Buffered stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf. 1076 class BufferedInput::dos_streambuf : public std::streambuf { 1077 public: dos_streambuf(const reflex::BufferedInput & input)1078 dos_streambuf(const reflex::BufferedInput& input) 1079 : 1080 input_(input), 1081 ch1_(input_.get()), 1082 ch2_(EOF) 1083 { } dos_streambuf(const reflex::Input & input)1084 dos_streambuf(const reflex::Input& input) 1085 : 1086 input_(input), 1087 ch1_(input_.get()), 1088 ch2_(EOF) 1089 { } 1090 protected: underflow()1091 virtual int_type underflow() 1092 { 1093 if (ch1_ == EOF) 1094 return traits_type::eof(); 1095 if (ch1_ == '\r') 1096 { 1097 if (ch2_ == EOF) 1098 ch2_ = input_.get(); 1099 if (ch2_ == '\n') 1100 { 1101 ch1_ = ch2_; 1102 ch2_ = EOF; 1103 } 1104 } 1105 return traits_type::to_int_type(ch1_); 1106 } uflow()1107 virtual int_type uflow() 1108 { 1109 int c = get(); 1110 return c == EOF ? traits_type::eof() : traits_type::to_int_type(c); 1111 } xsgetn(char * s,std::streamsize n)1112 virtual std::streamsize xsgetn(char *s, std::streamsize n) 1113 { 1114 if (n <= 0 || ch1_ == EOF) 1115 return 0; 1116 std::streamsize k = n; 1117 int c; 1118 while (k > 0 && (c = get()) != EOF) 1119 { 1120 *s++ = c; 1121 --k; 1122 } 1123 return n - k; 1124 } showmanyc()1125 virtual std::streamsize showmanyc() 1126 { 1127 return ch1_ == EOF ? 
-1 : 0; 1128 } get()1129 int get() 1130 { 1131 if (ch1_ == EOF) 1132 return EOF; 1133 int c = ch1_; 1134 if (c == '\r') 1135 { 1136 if (ch2_ == EOF) 1137 ch2_ = input_.get(); 1138 if (ch2_ == '\n') 1139 { 1140 c = ch2_; 1141 ch1_ = input_.get(); 1142 } 1143 else 1144 { 1145 ch1_ = ch2_; 1146 } 1147 ch2_ = EOF; 1148 } 1149 else 1150 { 1151 ch1_ = input_.get(); 1152 } 1153 return c; 1154 } 1155 BufferedInput input_; 1156 int ch1_; 1157 int ch2_; 1158 }; 1159 1160 } // namespace reflex 1161 1162 #endif 1163