#ifndef UTIL___LINE_READER__HPP
#define UTIL___LINE_READER__HPP

/*  $Id: line_reader.hpp 637993 2021-09-21 18:48:26Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Aaron Ucko, Anatoliy Kuznetsov
 *
 */

/// @file line_reader.hpp
/// Lightweight interface for getting lines of data with minimal
/// memory copying.
///
/// Any implementation must always keep its current line in memory so
/// that callers may harvest data from it in place.

#include <corelib/ncbifile.hpp>

#include <memory>

/** @addtogroup Miscellaneous
 *
 * @{
 */


BEGIN_NCBI_SCOPE

52 /// Abstract base class for lightweight line-by-line reading.
53 class NCBI_XUTIL_EXPORT ILineReader : public CObject
54 {
55 public:
56     /// Return a new ILineReader object corresponding to the given
57     /// filename, taking "-" (but not "./-") to mean standard input.
58     ///
59     /// As always with ILineReader, an explicit call to operator++ or
60     /// ReadLine() will be necessary to fetch the first line.
61     static CRef<ILineReader> New(const string& filename);
62 
63     /// Return a new ILineReader object corresponding to the given
64     /// input stream, optionally taking ownership thereof.
65     ///
66     /// As always with ILineReader, an explicit call to operator++ or
67     /// ReadLine() will be necessary to fetch the first line.
68     static CRef<ILineReader> New(CNcbiIstream& is,
69                                  EOwnership ownership = eNoOwnership);
70 
71     /// Indicates (negatively) whether there is any more input.
72     virtual bool AtEOF(void) const = 0;
73 
74     /// Returns the first character of the next string without consuming it.
75     /// If the next string is empty, returns zero.
76     ///  @note
77     /// Before the first call of operator++() this function returns the
78     /// first char of the first string.
79     ///  @attention
80     /// It is not guaranteed that any implementation of this function
81     /// will stop reading at EOF, so you should check for EOF yourself
82     virtual char PeekChar(void) const = 0;
83 
84     /// Make a line available. MUST be called even for the first line;
85     /// MAY trigger EOF conditions even when also retrieving data.
86     /// When beyond the last string, becomes a no-op.
87     virtual ILineReader& operator++(void) = 0;
ReadLine(void)88     void ReadLine(void) { ++*this; }
89 
90     /// Unget current line, which must be valid.
91     /// After calling this method:
92     ///   1. AtEOF() returns false,
93     ///   2. PeekChar() returns first char of the current line,
94     ///   3. Calling operator*() or UngetLine() is illegal,
95     ///   4. Calling operator++() will make the line current again
96     virtual void UngetLine(void) = 0;
97 
98     ///  Return the current line, minus its terminator. Before the first run of
99     ///  operator++() returns an empty CTempString.
100     ///  At EOF returns an empty CTempString.
101     /// @attention
102     ///  Right after UngetLine() calling this
103     ///  method is illegal. Call operator++() first
104     virtual CTempString operator*(void) const = 0;
GetCurrentLine(void) const105     CTempString GetCurrentLine(void) const { return **this; }
106 
107     /// Return the current (absolute) position.
108     virtual CT_POS_TYPE GetPosition(void) const = 0;
109 
110     ///  Returns the current line number (counting from 1, not 0).
111     /// @attention
112     ///  Right after constructor or after UngetLine() calling this
113     ///  method is illegal. Call operator++() first
114     virtual Uint8 GetLineNumber(void) const = 0;
115 };
116 
117 
118 /// Simple implementation of ILineReader for i(o)streams.
119 class NCBI_XUTIL_EXPORT CStreamLineReader : public ILineReader
120 {
121 public:
122     enum EEOLStyle {
123         eEOL_unknown = 0, ///< to be detected
124         eEOL_cr      = 1, ///< bare CR (classic Mac)
125         eEOL_lf      = 2, ///< bare LF (Unix et al.)
126         eEOL_crlf    = 3, ///< DOS/Windows
127 #ifdef NCBI_OS_UNIX
128         eEOL_native  = eEOL_lf,
129 #elif defined(NCBI_OS_MSWIN)
130         eEOL_native  = eEOL_crlf,
131 #else
132         eEOL_native  = eEOL_unknown,
133 #endif
134         eEOL_mixed   = 4 ///< contains both bare CRs and bare LFs
135     };
136 
137     /// Open a line reader over a given stream, with the given
138     /// EOL-style and ownership settings (if specified).
139     ///
140     /// As always with ILineReader, an explicit call to operator++ or
141     /// ReadLine() will be necessary to fetch the first line.
142     explicit CStreamLineReader(CNcbiIstream& is,
143                                EEOLStyle eol_style = eEOL_unknown,
144                                EOwnership ownership = eNoOwnership);
145 
146     /// Open a line reader over a given stream, with the given
147     /// ownership setting.
148     ///
149     /// As always with ILineReader, an explicit call to operator++ or
150     /// ReadLine() will be necessary to fetch the first line.
151     CStreamLineReader(CNcbiIstream& is, EOwnership ownership);
152 
153     ~CStreamLineReader();
154 
155     bool               AtEOF(void) const;
156     char               PeekChar(void) const;
157     CStreamLineReader& operator++(void);
158     void               UngetLine(void);
159     CTempString        operator*(void) const;
160     CT_POS_TYPE        GetPosition(void) const;
161     Uint8              GetLineNumber(void) const;
162 
163 private:
164     EEOLStyle x_AdvanceEOLUnknown(void);
165     EEOLStyle x_AdvanceEOLSimple(char eol, char alt_eol);
166     EEOLStyle x_AdvanceEOLCRLF(void);
167 
168     AutoPtr<CNcbiIstream> m_Stream;
169     string                m_Line;
170     Uint8                 m_LineNumber;
171     SIZE_TYPE             m_LastReadSize;
172     bool                  m_UngetLine;
173     bool                  m_AutoEOL;
174     EEOLStyle             m_EOLStyle;
175 };
176 
177 
178 /// Simple implementation of ILineReader for regions of memory
179 /// (such as memory-mapped files).
180 class NCBI_XUTIL_EXPORT CMemoryLineReader : public ILineReader
181 {
182 public:
183     /// Open a line reader over the half-open memory range [start, end).
184     ///
185     /// As always with ILineReader, an explicit call to operator++ or
186     /// ReadLine() will be necessary to fetch the first line.
CMemoryLineReader(const char * start,const char * end)187     CMemoryLineReader(const char* start, const char* end)
188         : m_Start(start), m_End(end), m_Pos(start), m_LineNumber(0) { }
189 
190     /// Open a line reader over the half-open memory range
191     /// [start, start+length).
192     ///
193     /// As always with ILineReader, an explicit call to operator++ or
194     /// ReadLine() will be necessary to fetch the first line.
CMemoryLineReader(const char * start,SIZE_TYPE length)195     CMemoryLineReader(const char* start, SIZE_TYPE length)
196         : m_Start(start), m_End(start + length), m_Pos(start),
197           m_LineNumber(0) { }
198 
199     /// Open a line reader over a given memory-mapped file, with the
200     /// given ownership setting (if specified).
201     ///
202     /// As always with ILineReader, an explicit call to operator++ or
203     /// ReadLine() will be necessary to fetch the first line.
204     CMemoryLineReader(CMemoryFile* mem_file,
205                       EOwnership ownership = eNoOwnership);
206 
207     bool               AtEOF(void) const;
208     char               PeekChar(void) const;
209     CMemoryLineReader& operator++(void);
210     void               UngetLine(void);
211     CTempString        operator*(void) const;
212     CT_POS_TYPE        GetPosition(void) const;
213     Uint8              GetLineNumber(void) const;
214 
215 private:
216     const char*           m_Start;
217     const char*           m_End;
218     const char*           m_Pos;
219     CTempString           m_Line;
220     AutoPtr<CMemoryFile>  m_MemFile;
221     Uint8                 m_LineNumber;
222 };
223 
224 /// Implementation of ILineReader for IReader
225 ///
226 class NCBI_XUTIL_EXPORT CBufferedLineReader : public ILineReader
227 {
228 public:
229     /// read from the IReader
230     ///
231     /// As always with ILineReader, an explicit call to operator++ or
232     /// ReadLine() will be necessary to fetch the first line.
233     CBufferedLineReader(IReader* reader,
234                         EOwnership ownership = eNoOwnership);
235 
236     /// read from the istream
237     ///
238     /// As always with ILineReader, an explicit call to operator++ or
239     /// ReadLine() will be necessary to fetch the first line.
240     CBufferedLineReader(CNcbiIstream& is,
241                         EOwnership ownership = eNoOwnership);
242 
243     /// read from the file, "-" (but not "./-") means standard input
244     ///
245     /// As always with ILineReader, an explicit call to operator++ or
246     /// ReadLine() will be necessary to fetch the first line.
247     CBufferedLineReader(const string& filename);
248 
249     virtual ~CBufferedLineReader();
250 
251     bool                AtEOF(void) const;
252     char                PeekChar(void) const;
253     CBufferedLineReader& operator++(void);
254     void                UngetLine(void);
255     CTempString         operator*(void) const;
256     CT_POS_TYPE         GetPosition(void) const;
257     Uint8               GetLineNumber(void) const;
258 
259 private:
260     CBufferedLineReader(const CBufferedLineReader&);
261     CBufferedLineReader& operator=(const CBufferedLineReader&);
262 private:
263     void x_LoadLong();
264     bool x_ReadBuffer();
265 private:
266     AutoPtr<IReader> m_Reader;
267     bool          m_Eof;
268     bool          m_UngetLine;
269     SIZE_TYPE     m_LastReadSize;
270     size_t        m_BufferSize;
271     AutoArray<char> m_Buffer;
272     const char*   m_Pos;
273     const char*   m_End;
274     CTempString   m_Line;
275     string        m_String;
276     CT_POS_TYPE   m_InputPos;
277     Uint8         m_LineNumber;
278 };
279 
280 
281 
END_NCBI_SCOPE


/* @} */

#endif  /* UTIL___LINE_READER__HPP */
