1 /******************************************************************************\
2 * Copyright (c) 2016, Robert van Engelen, Genivia Inc. All rights reserved.    *
3 *                                                                              *
4 * Redistribution and use in source and binary forms, with or without           *
5 * modification, are permitted provided that the following conditions are met:  *
6 *                                                                              *
7 *   (1) Redistributions of source code must retain the above copyright notice, *
8 *       this list of conditions and the following disclaimer.                  *
9 *                                                                              *
10 *   (2) Redistributions in binary form must reproduce the above copyright      *
11 *       notice, this list of conditions and the following disclaimer in the    *
12 *       documentation and/or other materials provided with the distribution.   *
13 *                                                                              *
14 *   (3) The name of the author may not be used to endorse or promote products  *
15 *       derived from this software without specific prior written permission.  *
16 *                                                                              *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED *
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF         *
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO   *
20 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,       *
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;  *
23 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,     *
24 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR      *
25 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF       *
26 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                   *
27 \******************************************************************************/
28 
29 /**
30 @file      input.h
31 @brief     RE/flex input character sequence class
32 @author    Robert van Engelen - engelen@genivia.com
33 @copyright (c) 2016-2020, Robert van Engelen, Genivia Inc. All rights reserved.
34 @copyright (c) BSD-3 License - see LICENSE.txt
35 */
36 
37 #ifndef REFLEX_INPUT_H
38 #define REFLEX_INPUT_H
39 
40 #include <reflex/utf8.h>
41 #include <cstdio>
42 #include <cstring>
43 #include <iostream>
44 #include <string>
45 #include <stdint.h>
46 
47 namespace reflex {
48 
/// Table of code pages with 256 `unsigned short` entries each, defined in another translation unit (presumably maps each input byte of an 8-bit file encoding to a Unicode code point -- TODO confirm against the definition).
49 extern const unsigned short codepages[][256];
50 
51 /// Input character sequence class for unified access to sources of input text.
52 /**
53 Description
54 -----------
55 
56 The Input class unifies access to a source of input text that constitutes a
57 sequence of characters:
58 
59 - An Input object is instantiated and (re)assigned a (new) source input: either
60   a `char*` string, a `wchar_t*` wide string, a `std::string`, a
61   `std::wstring`, a `FILE*` descriptor, or a `std::istream` object.
62 
63 - When assigned a wide string source as input, the wide character content is
64   automatically converted to a UTF-8 character sequence when reading with
65   get().  Wide strings are UCS-2/UCS-4 and may contain UTF-16 surrogate pairs.
66 
67 - When assigned a `FILE*` source as input, the file is checked for the presence
68   of a UTF-8 or a UTF-16 BOM (Byte Order Mark). A UTF-8 BOM is ignored and will
69   not appear on the input character stream (and size is adjusted by 3 bytes). A
70   UTF-16 BOM is interpreted, resulting in the conversion of the file content
71   automatically to a UTF-8 character sequence when reading the file with
72   get(). Also, size() gives the content size in the number of UTF-8 bytes.
73 
74 - An input object can be reassigned a new source of input for reading at any
75   time.
76 
77 - An input object obeys move semantics. That is, after assigning an input
78   object to another, the former can no longer be used to read input. This
79   prevents adding the overhead and complexity of file and stream duplication.
80 
81 - `size_t Input::get(char *buf, size_t len);` reads source input and fills `buf`
82   with up to `len` bytes, returning the number of bytes read or zero when a
83   stream or file is bad or when EOF is reached.
84 
85 - `size_t Input::size();` returns the number of ASCII/UTF-8 bytes available
86   to read from the source input or zero (zero is also returned when the size is
87   not determinable). Use this function only before reading input with get().
88   Wide character strings and UTF-16 `FILE*` content is counted as the total
89   number of UTF-8 bytes that will be produced by get(). The size of a
90   `std::istream` cannot be determined.
91 
92 - `bool Input::good();` returns true if the input is readable and has no
93   EOF or error state.  Returns false on EOF or if an error condition is
94   present.
95 
96 - `bool Input::eof();` returns true if the input reached EOF. Note that
97   good() == ! eof() for string source input only, since files and streams may
98   have error conditions that prevent reading. That is, for files and streams
99   eof() implies good() == false, but not vice versa. Thus, an error is
100   diagnosed when the condition good() == false && eof() == false holds. Note
101   that get(buf, len) == 0 && len > 0 implies good() == false.
102 
103 - `class Input::streambuf(const Input&)` creates a `std::streambuf` for the
104   given `Input` object, which can be passed to a `std::istream` constructor.
105 
106 - Compile with `WITH_UTF8_UNRESTRICTED` to enable unrestricted UTF-8 beyond
107   U+10FFFF, permitting lossless UTF-8 encoding of 32 bit words without limits.
108 
109 Example
110 -------
111 
112 The following example shows how to use the Input class to read a character
113 sequence in blocks from a `std::ifstream` to copy to stdout:
114 
115 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
116     std::ifstream ifs;
117     ifs.open("input.h", std::ifstream::in);
118     reflex::Input input(ifs);
119     char buf[1024];
120     size_t len;
121     while ((len = input.get(buf, sizeof(buf))) > 0)
122       fwrite(buf, 1, len, stdout);
123     if (!input.eof())
124       std::cerr << "An IO error occurred" << std::endl;
125     ifs.close();
126 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
127 
128 Example
129 -------
130 
131 The following example shows how to use the Input class to store the entire
132 content of a file in a temporary buffer:
133 
134 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
135     reflex::Input input(fopen("input.h", "r"));
136     if (input.file() == NULL)
137       abort();
138     size_t len = input.size(); // file size (minus any leading UTF BOM)
139     char *buf = new char[len];
140     input.get(buf, len);
141     if (!input.eof())
142       std::cerr << "An IO error occurred" << std::endl;
143     fwrite(buf, 1, len, stdout);
144     delete[] buf;
145     fclose(input.file());
146 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
147 
148 In the above, files with UTF-16 and UTF-32 content are converted to UTF-8 by
149 `get(buf, len)`.  Also, `size()` returns the total number of UTF-8 bytes to
150 copy in the buffer by `get(buf, len)`.  The size is computed depending on the
151 UTF-8/16/32 file content encoding, i.e. given a leading UTF BOM in the file.
152 This means that UTF-16/32 files are read twice, first internally with `size()`
153 and then again with `get(buf, len)`.
154 
155 Example
156 -------
157 
158 The following example shows how to use the Input class to read a character
159 sequence in blocks from a file:
160 
161 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
162     reflex::Input input(fopen("input.h", "r"));
163     char buf[1024];
164     size_t len;
165     while ((len = input.get(buf, sizeof(buf))) > 0)
166       fwrite(buf, 1, len, stdout);
167     fclose(input);
168 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
169 
170 Example
171 -------
172 
173 The following example shows how to use the Input class to echo characters one
174 by one from stdin, e.g. reading input from a tty:
175 
176 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
177     reflex::Input input(stdin);
178     char c;
179     while (input.get(&c, 1))
180       fputc(c, stdout);
181 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
182 
183 Or if you prefer to use an int character and check for EOF explicitly:
184 
185 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
186     reflex::Input input(stdin);
187     int c;
188     while ((c = input.get()) != EOF)
189       fputc(c, stdout);
190 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
191 
192 Example
193 -------
194 
195 The following example shows how to use the Input class to read a character
196 sequence in blocks from a wide character string, converting it to UTF-8 to copy
197 to stdout:
198 
199 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
200     reflex::Input input(L"Copyright ©"); // © is unicode U+00A9 and UTF-8 C2 A9
201     char buf[8];
202     size_t len;
203     while ((len = input.get(buf, sizeof(buf))) > 0)
204       fwrite(buf, 1, len, stdout);
205 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
206 
207 Example
208 -------
209 
210 The following example shows how to use the Input class to convert a wide
211 character string to UTF-8:
212 
213 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
214     reflex::Input input(L"Copyright ©"); // © is unicode U+00A9 and UTF-8 C2 A9
215     size_t len = input.size(); // size of UTF-8 string
216     char *buf = new char[len + 1];
217     input.get(buf, len);
218     buf[len] = '\0'; // make \0-terminated
219 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
220 
221 Example
222 -------
223 
224 The following example shows how to switch source inputs while reading input
225 byte by byte (use a buffer as shown in other examples to improve efficiency):
226 
227 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
228     reflex::Input input = "Hello";
229     std::string message;
230     char c;
231     while (input.get(&c, 1))
232       message.push_back(c);
233     input = L" world! To ∞ and beyond."; // switch input to a wide string
234     while (input.get(&c, 1))
235       message.push_back(c);
236 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
237 
238 Example
239 -------
240 
241 The following example shows how to use reflex::Input::streambuf to create an
242 unbuffered std::istream:
243 
244 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
245     reflex::Input input(fopen("legacy.txt", "r"), reflex::Input::file_encoding::ebcdic);
246     if (input.file() == NULL)
247       abort();
248     reflex::Input::streambuf streambuf(input);
249     std::istream stream(&streambuf);
250     std::string data;
251     int c;
252     while ((c = stream.get()) != EOF)
253       data.push_back(static_cast<char>(c));
254     fclose(input.file());
255 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
256 
257 With reflex::BufferedInput::streambuf to create a buffered std::istream:
258 
259 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{.cpp}
260     reflex::Input input(fopen("legacy.txt", "r"), reflex::Input::file_encoding::ebcdic);
261     if (input.file() == NULL)
262       abort();
263     reflex::BufferedInput::streambuf streambuf(input);
264     std::istream stream(&streambuf);
265     std::string data;
266     int c;
267     while ((c = stream.get()) != EOF)
268       data.push_back(static_cast<char>(c));
269     fclose(input.file());
270 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
271 */
272 class Input {
273  public:
274   /// Common file_encoding constants type.
275   typedef unsigned short file_encoding_type;
276   /// Common file_encoding constants.
277   struct file_encoding {
278     static const file_encoding_type plain      =  0; ///< plain octets: 7-bit ASCII, 8-bit binary or UTF-8 without BOM detected
279     static const file_encoding_type utf8       =  1; ///< UTF-8 with BOM detected
280     static const file_encoding_type utf16be    =  2; ///< UTF-16 big endian
281     static const file_encoding_type utf16le    =  3; ///< UTF-16 little endian
282     static const file_encoding_type utf32be    =  4; ///< UTF-32 big endian
283     static const file_encoding_type utf32le    =  5; ///< UTF-32 little endian
284     static const file_encoding_type latin      =  6; ///< ISO-8859-1, Latin-1
285     static const file_encoding_type cp437      =  7; ///< DOS CP 437
286     static const file_encoding_type cp850      =  8; ///< DOS CP 850
287     static const file_encoding_type cp858      =  9; ///< DOS CP 858
288     static const file_encoding_type ebcdic     = 10; ///< EBCDIC
289     static const file_encoding_type cp1250     = 11; ///< Windows CP 1250
290     static const file_encoding_type cp1251     = 12; ///< Windows CP 1251
291     static const file_encoding_type cp1252     = 13; ///< Windows CP 1252
292     static const file_encoding_type cp1253     = 14; ///< Windows CP 1253
293     static const file_encoding_type cp1254     = 15; ///< Windows CP 1254
294     static const file_encoding_type cp1255     = 16; ///< Windows CP 1255
295     static const file_encoding_type cp1256     = 17; ///< Windows CP 1256
296     static const file_encoding_type cp1257     = 18; ///< Windows CP 1257
297     static const file_encoding_type cp1258     = 19; ///< Windows CP 1258
298     static const file_encoding_type iso8859_2  = 20; ///< ISO-8859-2, Latin-2
299     static const file_encoding_type iso8859_3  = 21; ///< ISO-8859-3, Latin-3
300     static const file_encoding_type iso8859_4  = 22; ///< ISO-8859-4, Latin-4
301     static const file_encoding_type iso8859_5  = 23; ///< ISO-8859-5, Cyrillic
302     static const file_encoding_type iso8859_6  = 24; ///< ISO-8859-6, Arabic
303     static const file_encoding_type iso8859_7  = 25; ///< ISO-8859-7, Greek
304     static const file_encoding_type iso8859_8  = 26; ///< ISO-8859-8, Hebrew
305     static const file_encoding_type iso8859_9  = 27; ///< ISO-8859-9, Latin-5
306     static const file_encoding_type iso8859_10 = 28; ///< ISO-8859-10, Latin-6
307     static const file_encoding_type iso8859_11 = 29; ///< ISO-8859-11, Thai
308     static const file_encoding_type iso8859_13 = 30; ///< ISO-8859-13, Latin-7
309     static const file_encoding_type iso8859_14 = 31; ///< ISO-8859-14, Latin-8
310     static const file_encoding_type iso8859_15 = 32; ///< ISO-8859-15, Latin-9
311     static const file_encoding_type iso8859_16 = 33; ///< ISO-8859-16
312     static const file_encoding_type macroman   = 34; ///< Macintosh Roman with CR to LF translation
313     static const file_encoding_type koi8_r     = 35; ///< KOI8-R
314     static const file_encoding_type koi8_u     = 36; ///< KOI8-U
315     static const file_encoding_type koi8_ru    = 37; ///< KOI8-RU
316     static const file_encoding_type custom     = 38; ///< custom code page
317   };
318   /// FILE* handler functor base class to handle FILE* errors and non-blocking FILE* reads
319   struct Handler { virtual int operator()() = 0; };
320   /// Stream buffer for reflex::Input, derived from std::streambuf.
321   class streambuf;
322   /// Stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf.
323   class dos_streambuf;
324   /// Construct empty input character sequence.
Input()325   Input()
326     :
327       cstring_(NULL),
328       wstring_(NULL),
329       file_(NULL),
330       istream_(NULL),
331       size_(0)
332   {
333     init();
334   }
335   /// Copy constructor (with intended "move semantics" as internal state is shared, should not rely on using the rhs after copying).
Input(const Input & input)336   Input(const Input& input) ///< an Input object to share state with (undefined behavior results from using both objects)
337     :
338       cstring_(input.cstring_),
339       wstring_(input.wstring_),
340       file_(input.file_),
341       istream_(input.istream_),
342       size_(input.size_),
343       uidx_(input.uidx_),
344       ulen_(input.ulen_),
345       utfx_(input.utfx_),
346       page_(input.page_),
347       handler_(input.handler_)
348   {
349     std::memcpy(utf8_, input.utf8_, sizeof(utf8_));
350   }
351   /// Construct input character sequence from a char* string
Input(const char * cstring,size_t size)352   Input(
353       const char *cstring, ///< char string
354       size_t      size)    ///< length of the string
355     :
356       cstring_(cstring),
357       wstring_(NULL),
358       file_(NULL),
359       istream_(NULL),
360       size_(size)
361   {
362     init();
363   }
364   /// Construct input character sequence from a NUL-terminated string.
Input(const char * cstring)365   Input(const char *cstring) ///< NUL-terminated char* string
366     :
367       cstring_(cstring),
368       wstring_(NULL),
369       file_(NULL),
370       istream_(NULL),
371       size_(cstring != NULL ? std::strlen(cstring) : 0)
372   {
373     init();
374   }
375   /// Construct input character sequence from a std::string.
Input(const std::string & string)376   Input(const std::string& string) ///< input string
377     :
378       cstring_(string.c_str()),
379       wstring_(NULL),
380       file_(NULL),
381       istream_(NULL),
382       size_(string.size())
383   {
384     init();
385   }
386   /// Construct input character sequence from a pointer to a std::string.
Input(const std::string * string)387   Input(const std::string *string) ///< input string
388     :
389       cstring_(string != NULL ? string->c_str() : NULL),
390       wstring_(NULL),
391       file_(NULL),
392       istream_(NULL),
393       size_(string != NULL ? string->size() : 0)
394   {
395     init();
396   }
397   /// Construct input character sequence from a NUL-terminated wide character string.
Input(const wchar_t * wstring)398   Input(const wchar_t *wstring) ///< NUL-terminated wchar_t* input string
399     :
400       cstring_(NULL),
401       wstring_(wstring),
402       file_(NULL),
403       istream_(NULL),
404       size_(0)
405   {
406     init();
407   }
408   /// Construct input character sequence from a std::wstring (may contain UTF-16 surrogate pairs).
Input(const std::wstring & wstring)409   Input(const std::wstring& wstring) ///< input wide string
410     :
411       cstring_(NULL),
412       wstring_(wstring.c_str()),
413       file_(NULL),
414       istream_(NULL),
415       size_(0)
416   {
417     init();
418   }
419   /// Construct input character sequence from a pointer to a std::wstring (may contain UTF-16 surrogate pairs).
Input(const std::wstring * wstring)420   Input(const std::wstring *wstring) ///< input wide string
421     :
422       cstring_(NULL),
423       wstring_(wstring != NULL ? wstring->c_str() : NULL),
424       file_(NULL),
425       istream_(NULL),
426       size_(0)
427   {
428     init();
429   }
430   /// Construct input character sequence from an open FILE* file descriptor, supports UTF-8 conversion from UTF-16 and UTF-32.
Input(FILE * file)431   Input(FILE *file) ///< input file
432     :
433       cstring_(NULL),
434       wstring_(NULL),
435       file_(file),
436       istream_(NULL),
437       size_(0)
438   {
439     init();
440   }
441   /// Construct input character sequence from an open FILE* file descriptor, using the specified file encoding
442   Input(
443       FILE                 *file,        ///< input file
444       file_encoding_type    enc,         ///< file_encoding (when UTF BOM is not present)
445       const unsigned short *page = NULL) ///< code page for file_encoding::custom
446     :
cstring_(NULL)447       cstring_(NULL),
448       wstring_(NULL),
449       file_(file),
450       istream_(NULL),
451       size_(0)
452   {
453     init();
454     if (file_encoding() == file_encoding::plain)
455       file_encoding(enc, page);
456   }
457   /// Construct input character sequence from a std::istream.
Input(std::istream & istream)458   Input(std::istream& istream) ///< input stream
459     :
460       cstring_(NULL),
461       wstring_(NULL),
462       file_(NULL),
463       istream_(&istream),
464       size_(0)
465   {
466     init();
467   }
468   /// Construct input character sequence from a pointer to a std::istream.
Input(std::istream * istream)469   Input(std::istream *istream) ///< input stream
470     :
471       cstring_(NULL),
472       wstring_(NULL),
473       file_(NULL),
474       istream_(istream),
475       size_(0)
476   {
477     init();
478   }
479   /// Copy assignment operator.
480   Input& operator=(const Input& input)
481   {
482     cstring_ = input.cstring_;
483     wstring_ = input.wstring_;
484     file_ = input.file_;
485     istream_ = input.istream_;
486     size_ = input.size_;
487     uidx_ = input.uidx_;
488     ulen_ = input.ulen_;
489     utfx_ = input.utfx_;
490     page_ = input.page_;
491     handler_ = input.handler_;
492     std::memcpy(utf8_, input.utf8_, sizeof(utf8_));
493     return *this;
494   }
495   /// Cast this Input object to a string, returns NULL when this Input is not a string.
496   operator const char *() const
497     /// @returns remaining unbuffered part of a NUL-terminated string or NULL
498   {
499     return cstring_;
500   }
501   /// Cast this Input object to a wide character string, returns NULL when this Input is not a wide string.
502   operator const wchar_t *() const
503     /// @returns remaining unbuffered part of the NUL-terminated wide character string or NULL
504   {
505     return wstring_;
506   }
507   /// Cast this Input object to a file descriptor FILE*, returns NULL when this Input is not a FILE*.
508   operator FILE *() const
509     /// @returns pointer to current file descriptor or NULL
510   {
511     return file_;
512   }
513   /// Cast this Input object to a std::istream*, returns NULL when this Input is not a std::istream.
514   operator std::istream *() const
515     /// @returns pointer to current std::istream or NULL
516   {
517     return istream_;
518   }
519   // Cast this Input object to bool, same as checking good().
520   operator bool() const
521     /// @returns true if a non-empty sequence of characters is available to get
522   {
523     return good();
524   }
525   /// Get the remaining string of this Input object, returns NULL when this Input is not a string.
cstring()526   const char *cstring() const
527     /// @returns remaining unbuffered part of the NUL-terminated string or NULL
528   {
529     return cstring_;
530   }
531   /// Get the remaining wide character string of this Input object, returns NULL when this Input is not a wide string.
wstring()532   const wchar_t *wstring() const
533     /// @returns remaining unbuffered part of the NUL-terminated wide character string or NULL
534   {
535     return wstring_;
536   }
537   /// Get the FILE* of this Input object, returns NULL when this Input is not a FILE*.
file()538   FILE *file() const
539     /// @returns pointer to current file descriptor or NULL
540   {
541     return file_;
542   }
543   /// Get the std::istream of this Input object, returns NULL when this Input is not a std::istream.
istream()544   std::istream *istream() const
545     /// @returns pointer to current std::istream or NULL
546   {
547     return istream_;
548   }
549   /// Get the size of the input character sequence in number of ASCII/UTF-8 bytes (zero if size is not determinable from a `FILE*` or `std::istream` source).
size()550   size_t size()
551     /// @returns the nonzero number of ASCII/UTF-8 bytes available to read, or zero when source is empty or if size is not determinable e.g. when reading from standard input
552   {
553     if (cstring_)
554       return size_;
555     if (wstring_)
556     {
557       if (size_ == 0)
558         wstring_size();
559     }
560     else if (file_)
561     {
562       if (size_ == 0)
563         file_size();
564     }
565     else if (istream_)
566     {
567       if (size_ == 0)
568         istream_size();
569     }
570     return size_;
571   }
572   /// Check if this Input object was assigned a character sequence.
assigned()573   bool assigned() const
574     /// @returns true if this Input object was assigned (not default constructed or cleared)
575   {
576     return cstring_ || wstring_ || file_ || istream_;
577   }
578   /// Clear this Input by unassigning it.
clear()579   void clear()
580   {
581     cstring_ = NULL;
582     wstring_ = NULL;
583     file_ = NULL;
584     istream_ = NULL;
585     size_ = 0;
586   }
587   /// Check if input is available.
good()588   bool good() const
589     /// @returns true if a non-empty sequence of characters is available to get
590   {
591     if (cstring_)
592       return size_ > 0;
593     if (wstring_)
594       return *wstring_ != L'\0';
595     if (file_)
596       return !::feof(file_) && !::ferror(file_);
597     if (istream_)
598       return istream_->good();
599     return false;
600   }
601   /// Check if input reached EOF.
eof()602   bool eof() const
603     /// @returns true if input is at EOF and no characters are available
604   {
605     if (cstring_)
606       return size_ == 0;
607     if (wstring_)
608       return *wstring_ == L'\0';
609     if (file_)
610       return ::feof(file_) != 0;
611     if (istream_)
612       return istream_->eof();
613     return true;
614   }
615   /// Get a single character (unsigned char 0..255) or EOF (-1) when end-of-input is reached.
get()616   int get()
617   {
618     char c;
619     if (get(&c, 1))
620       return static_cast<unsigned char>(c);
621     return EOF;
622   }
623   /// Copy character sequence data into buffer.
get(char * s,size_t n)624   size_t get(
625       char  *s, ///< points to the string buffer to fill with input
626       size_t n) ///< size of buffer pointed to by s
627     /// @returns the nonzero number of (less or equal to n) 8-bit characters added to buffer s from the current input, or zero when EOF
628   {
629     if (cstring_)
630     {
631       size_t k = size_;
632       if (k > n)
633         k = n;
634       std::memcpy(s, cstring_, k);
635       cstring_ += k;
636       size_ -= k;
637       return k;
638     }
639     if (wstring_)
640     {
641       size_t k = n;
642       if (ulen_ > 0)
643       {
644         size_t l = ulen_;
645         if (l > k)
646           l = k;
647         std::memcpy(s, utf8_ + uidx_, l);
648         k -= l;
649         if (k == 0)
650         {
651           uidx_ += static_cast<unsigned short>(l);
652           ulen_ -= static_cast<unsigned short>(l);
653           if (size_ >= n)
654             size_ -= n;
655           return n;
656         }
657         s += l;
658         ulen_ = 0;
659       }
660       wchar_t c;
661       while ((c = *wstring_) != L'\0' && k > 0)
662       {
663         if (c < 0x80)
664         {
665           *s++ = static_cast<char>(c);
666           --k;
667         }
668         else
669         {
670           size_t l;
671           if (c >= 0xD800 && c < 0xE000)
672           {
673             // UTF-16 surrogate pair
674             if (c < 0xDC00 && (wstring_[1] & 0xFC00) == 0xDC00)
675               l = utf8(0x010000 - 0xDC00 + ((c - 0xD800) << 10) + *++wstring_, utf8_);
676             else
677               l = utf8(REFLEX_NONCHAR, utf8_);
678           }
679           else
680           {
681             l = utf8(c, utf8_);
682           }
683           if (k < l)
684           {
685             uidx_ = static_cast<unsigned short>(k);
686             ulen_ = static_cast<unsigned short>(l - k);
687             std::memcpy(s, utf8_, k);
688             s += k;
689             k = 0;
690           }
691           else
692           {
693             std::memcpy(s, utf8_, l);
694             s += l;
695             k -= l;
696           }
697         }
698         ++wstring_;
699       }
700       if (size_ >= n - k)
701         size_ -= n - k;
702       return n - k;
703     }
704     if (file_)
705     {
706       while (true)
707       {
708         size_t k = file_get(s, n);
709         if (k > 0 || feof(file_) || handler_ == NULL || (*handler_)() == 0)
710           return k;
711       }
712     }
713     if (istream_)
714     {
715       size_t k = static_cast<size_t>(n == 1 ? istream_->get(s[0]).gcount() : istream_->read(s, static_cast<std::streamsize>(n)) ? n : istream_->gcount());
716       if (size_ >= k)
717         size_ -= k;
718       return k;
719     }
720     return 0;
721   }
722   /// Set encoding for `FILE*` input.
723   void file_encoding(
724       file_encoding_type    enc,         ///< file_encoding
725       const unsigned short *page = NULL) ///< custom code page for file_encoding::custom
726     ;
727   /// Get encoding of the current `FILE*` input.
file_encoding()728   file_encoding_type file_encoding() const
729     /// @returns current file_encoding constant
730   {
731     return utfx_;
732   }
733   /// Initialize the state after (re)setting the input source, auto-detects UTF BOM in FILE* input if the file size is known.
init()734   void init()
735   {
736     std::memset(utf8_, 0, sizeof(utf8_));
737     uidx_ = 0;
738     ulen_ = 0;
739     utfx_ = 0;
740     page_ = NULL;
741     handler_ = NULL;
742     if (file_ != NULL)
743       file_init();
744   }
745   /// Called by init() for a FILE*.
746   void file_init();
747   /// Called by size() for a wstring.
748   void wstring_size();
749   /// Called by size() for a FILE*.
750   void file_size();
751   /// Called by size() for a std::istream.
752   void istream_size();
753   /// Implements get() on a FILE*.
754   size_t file_get(
755       char  *s, ///< points to the string buffer to fill with input
756       size_t n) ///< size of buffer pointed to by s
757       ;
758   /// Set FILE* handler
set_handler(Handler * handler)759   void set_handler(Handler *handler)
760   {
761     handler_ = handler;
762   }
763  protected:
764   const char           *cstring_; ///< char string input (when non-null) of length reflex::Input::size_
765   const wchar_t        *wstring_; ///< NUL-terminated wide string input (when non-null)
766   FILE                 *file_;    ///< FILE* input (when non-null)
767   std::istream         *istream_; ///< stream input (when non-null)
768   size_t                size_;    ///< size of the remaining input in bytes (size_ == 0 may indicate size is not set)
769   char                  utf8_[8]; ///< UTF-8 normalization buffer, >=8 bytes
770   unsigned short        uidx_;    ///< index in utf8_[]
771   unsigned short        ulen_;    ///< length of data (remaining after uidx_) in utf8_[] or 0 if no data
772   file_encoding_type    utfx_;    ///< file_encoding
773   const unsigned short *page_;    ///< custom code page
774   Handler              *handler_; ///< to handle FILE* errors and non-blocking FILE* reads
775 };
776 
777 /// Stream buffer for reflex::Input, derived from std::streambuf.
778 class Input::streambuf : public std::streambuf {
779  public:
streambuf(const reflex::Input & input)780   streambuf(const reflex::Input& input)
781     :
782       input_(input),
783       ch_(input_.get())
784   { }
785  protected:
underflow()786   virtual int_type underflow()
787   {
788     return ch_ == EOF ? traits_type::eof() : traits_type::to_int_type(ch_);
789   }
uflow()790   virtual int_type uflow()
791   {
792     if (ch_ == EOF)
793       return traits_type::eof();
794     int c = ch_;
795     ch_ = input_.get();
796     return traits_type::to_int_type(c);
797   }
xsgetn(char * s,std::streamsize n)798   virtual std::streamsize xsgetn(char *s, std::streamsize n)
799   {
800     if (n <= 0 || ch_ == EOF)
801       return 0;
802     *s++ = ch_;
803     std::streamsize k = static_cast<std::streamsize>(input_.get(s, static_cast<size_t>(n - 1)));
804     if (k < n - 1)
805     {
806       ch_ = EOF;
807       return k + 1;
808     }
809     ch_ = input_.get();
810     return n;
811   }
showmanyc()812   virtual std::streamsize showmanyc()
813   {
814     return ch_ == EOF ? -1 : input_.size() + 1;
815   }
816   Input input_;
817   int ch_;
818 };
819 
820 /// Stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf.
821 class Input::dos_streambuf : public std::streambuf {
822  public:
dos_streambuf(const reflex::Input & input)823   dos_streambuf(const reflex::Input& input)
824     :
825       input_(input),
826       ch1_(input_.get()),
827       ch2_(EOF)
828   { }
829  protected:
underflow()830   virtual int_type underflow()
831   {
832     if (ch1_ == EOF)
833       return traits_type::eof();
834     if (ch1_ == '\r')
835     {
836       if (ch2_ == EOF)
837         ch2_ = input_.get();
838       if (ch2_ == '\n')
839       {
840         ch1_ = ch2_;
841         ch2_ = EOF;
842       }
843     }
844     return traits_type::to_int_type(ch1_);
845   }
uflow()846   virtual int_type uflow()
847   {
848     int c = get();
849     return c == EOF ? traits_type::eof() : traits_type::to_int_type(c);
850   }
xsgetn(char * s,std::streamsize n)851   virtual std::streamsize xsgetn(char *s, std::streamsize n)
852   {
853     if (n <= 0 || ch1_ == EOF)
854       return 0;
855     std::streamsize k = n;
856     int c;
857     while (k > 0 && (c = get()) != EOF)
858     {
859       *s++ = c;
860       --k;
861     }
862     return n - k;
863   }
showmanyc()864   virtual std::streamsize showmanyc()
865   {
866     return ch1_ == EOF ? -1 : 0;
867   }
get()868   int get()
869   {
870     if (ch1_ == EOF)
871       return EOF;
872     int c = ch1_;
873     if (c == '\r')
874     {
875       if (ch2_ == EOF)
876         ch2_ = input_.get();
877       if (ch2_ == '\n')
878       {
879         c = ch2_;
880         ch1_ = input_.get();
881       }
882       else
883       {
884         ch1_ = ch2_;
885       }
886       ch2_ = EOF;
887     }
888     else
889     {
890       ch1_ = input_.get();
891     }
892     return c;
893   }
894   Input input_;
895   int ch1_;
896   int ch2_;
897 };
898 
899 /// Buffered input.
900 class BufferedInput : public Input {
901  public:
902   /// Buffer size.
903   static const size_t SIZE = 16384;
904   /// Buffered stream buffer for reflex::Input, derived from std::streambuf.
905   class streambuf;
906   /// Buffered stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf.
907   class dos_streambuf;
908   /// Copy constructor (with intended "move semantics" as internal state is shared, should not rely on using the rhs after copying).
909   /// Construct empty buffered input.
BufferedInput()910   BufferedInput()
911     :
912       Input(),
913       len_(0),
914       pos_(0)
915   { }
916   /// Copy constructor.
BufferedInput(const BufferedInput & input)917   BufferedInput(const BufferedInput& input)
918     :
919       Input(input),
920       len_(input.len_),
921       pos_(input.pos_)
922   {
923     std::memcpy(buf_, input.buf_, len_);
924   }
925   /// Construct buffered input from unbuffered input.
BufferedInput(const Input & input)926   BufferedInput(const Input& input)
927     :
928       Input(input)
929   {
930     len_ = Input::get(buf_, SIZE);
931     pos_ = 0;
932   }
933   /// Assignment operator from unbuffered input.
934   BufferedInput& operator=(const Input& input)
935   {
936     Input::operator=(input);
937     len_ = Input::get(buf_, SIZE);
938     pos_ = 0;
939     return *this;
940   }
941   /// Copy assignment operator.
942   BufferedInput& operator=(const BufferedInput& input)
943   {
944     Input::operator=(input);
945     len_ = input.len_;
946     pos_ = input.pos_;
947     std::memcpy(buf_, input.buf_, len_);
948     return *this;
949   }
950   /// Construct buffered input character sequence from an open FILE* file descriptor, using the specified file encoding
951   BufferedInput(
952       FILE                 *file,        ///< input file
953       file_encoding_type    enc,         ///< file_encoding (when UTF BOM is not present)
954       const unsigned short *page = NULL) ///< code page for file_encoding::custom
955     :
Input(file,enc,page)956       Input(file, enc, page)
957   {
958     len_ = Input::get(buf_, SIZE);
959     pos_ = 0;
960   }
961   // Cast this Input object to bool, same as checking good().
962   operator bool()
963     /// @returns true if a non-empty sequence of characters is available to get
964   {
965     return good();
966   }
967   /// Get the size of the input character sequence in number of ASCII/UTF-8 bytes (zero if size is not determinable from a `FILE*` or `std::istream` source).
size()968   size_t size()
969     /// @returns the nonzero number of ASCII/UTF-8 bytes available to read, or zero when source is empty or if size is not determinable e.g. when reading from standard input
970   {
971     return len_ - pos_ + Input::size();
972   }
973   /// Check if input is available.
good()974   bool good()
975     /// @returns true if a non-empty sequence of characters is available to get
976   {
977     return pos_ < len_ || Input::good();
978   }
979   /// Check if input reached EOF.
eof()980   bool eof()
981     /// @returns true if input is at EOF and no characters are available
982   {
983     return pos_ >= len_ && Input::eof();
984   }
985   /// Peek a single character (unsigned char 0..255) or EOF (-1) when end-of-input is reached.
peek()986   int peek()
987   {
988     while (true)
989     {
990       if (len_ == 0)
991         return EOF;
992       if (pos_ < len_)
993         return static_cast<unsigned char>(buf_[pos_]);
994       len_ = Input::get(buf_, SIZE);
995       pos_ = 0;
996     }
997   }
998   /// Get a single character (unsigned char 0..255) or EOF (-1) when end-of-input is reached.
get()999   int get()
1000   {
1001     while (true)
1002     {
1003       if (len_ == 0)
1004         return EOF;
1005       if (pos_ < len_)
1006         return static_cast<unsigned char>(buf_[pos_++]);
1007       len_ = Input::get(buf_, SIZE);
1008       pos_ = 0;
1009     }
1010   }
1011   /// Copy character sequence data into buffer.
get(char * s,size_t n)1012   size_t get(
1013       char  *s, ///< points to the string buffer to fill with input
1014       size_t n) ///< size of buffer pointed to by s
1015   {
1016     size_t k = n;
1017     while (k > 0)
1018     {
1019       if (pos_ < len_)
1020       {
1021         *s++ = buf_[pos_++];
1022         --k;
1023       }
1024       else if (len_ == 0)
1025       {
1026         break;
1027       }
1028       else
1029       {
1030         len_ = Input::get(buf_, SIZE);
1031         pos_ = 0;
1032       }
1033     }
1034     return n - k;
1035   }
1036  protected:
1037   char   buf_[SIZE];
1038   size_t len_;
1039   size_t pos_;
1040 };
1041 
1042 /// Buffered stream buffer for reflex::Input, derived from std::streambuf.
1043 class BufferedInput::streambuf : public std::streambuf {
1044  public:
streambuf(const reflex::BufferedInput & input)1045   streambuf(const reflex::BufferedInput& input)
1046     :
1047       input_(input)
1048   { }
streambuf(const reflex::Input & input)1049   streambuf(const reflex::Input& input)
1050     :
1051       input_(input)
1052   { }
1053  protected:
underflow()1054   virtual int_type underflow()
1055   {
1056     int c = input_.peek();
1057     return c == EOF ? traits_type::eof() : traits_type::to_int_type(c);
1058   }
uflow()1059   virtual int_type uflow()
1060   {
1061     int c = input_.get();
1062     return c == EOF ? traits_type::eof() : traits_type::to_int_type(c);
1063   }
xsgetn(char * s,std::streamsize n)1064   virtual std::streamsize xsgetn(char *s, std::streamsize n)
1065   {
1066     return static_cast<std::streamsize>(input_.get(s, static_cast<size_t>(n)));
1067   }
showmanyc()1068   virtual std::streamsize showmanyc()
1069   {
1070     return input_.eof() ? -1 : input_.size();
1071   }
1072   BufferedInput input_;
1073 };
1074 
1075 /// Buffered stream buffer for reflex::Input to read DOS files, replaces CRLF by LF, derived from std::streambuf.
1076 class BufferedInput::dos_streambuf : public std::streambuf {
1077  public:
dos_streambuf(const reflex::BufferedInput & input)1078   dos_streambuf(const reflex::BufferedInput& input)
1079     :
1080       input_(input),
1081       ch1_(input_.get()),
1082       ch2_(EOF)
1083   { }
dos_streambuf(const reflex::Input & input)1084   dos_streambuf(const reflex::Input& input)
1085     :
1086       input_(input),
1087       ch1_(input_.get()),
1088       ch2_(EOF)
1089   { }
1090  protected:
underflow()1091   virtual int_type underflow()
1092   {
1093     if (ch1_ == EOF)
1094       return traits_type::eof();
1095     if (ch1_ == '\r')
1096     {
1097       if (ch2_ == EOF)
1098         ch2_ = input_.get();
1099       if (ch2_ == '\n')
1100       {
1101         ch1_ = ch2_;
1102         ch2_ = EOF;
1103       }
1104     }
1105     return traits_type::to_int_type(ch1_);
1106   }
uflow()1107   virtual int_type uflow()
1108   {
1109     int c = get();
1110     return c == EOF ? traits_type::eof() : traits_type::to_int_type(c);
1111   }
xsgetn(char * s,std::streamsize n)1112   virtual std::streamsize xsgetn(char *s, std::streamsize n)
1113   {
1114     if (n <= 0 || ch1_ == EOF)
1115       return 0;
1116     std::streamsize k = n;
1117     int c;
1118     while (k > 0 && (c = get()) != EOF)
1119     {
1120       *s++ = c;
1121       --k;
1122     }
1123     return n - k;
1124   }
showmanyc()1125   virtual std::streamsize showmanyc()
1126   {
1127     return ch1_ == EOF ? -1 : 0;
1128   }
get()1129   int get()
1130   {
1131     if (ch1_ == EOF)
1132       return EOF;
1133     int c = ch1_;
1134     if (c == '\r')
1135     {
1136       if (ch2_ == EOF)
1137         ch2_ = input_.get();
1138       if (ch2_ == '\n')
1139       {
1140         c = ch2_;
1141         ch1_ = input_.get();
1142       }
1143       else
1144       {
1145         ch1_ = ch2_;
1146       }
1147       ch2_ = EOF;
1148     }
1149     else
1150     {
1151       ch1_ = input_.get();
1152     }
1153     return c;
1154   }
1155   BufferedInput input_;
1156   int ch1_;
1157   int ch2_;
1158 };
1159 
1160 } // namespace reflex
1161 
1162 #endif
1163