1 /*=========================================================================
2 
3   Program:   Visualization Toolkit
4   Module:    vtkDelimitedTextReader.h
5 
6   Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
7   All rights reserved.
8   See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
9 
10      This software is distributed WITHOUT ANY WARRANTY; without even
11      the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
12      PURPOSE.  See the above copyright notice for more information.
13 
14 =========================================================================*/
15 /*-------------------------------------------------------------------------
16   Copyright 2008 Sandia Corporation.
17   Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
18   the U.S. Government retains certain rights in this software.
19 -------------------------------------------------------------------------*/
20 
21 
/**
 * @class   vtkDelimitedTextReader
 * @brief   reads in delimited ASCII or Unicode text files
 * and outputs a vtkTable data structure.
 *
 *
 * vtkDelimitedTextReader is an interface for pulling in data from a
 * flat, delimited ASCII or Unicode text file (the delimiter can be any
 * character).
 *
 * The behavior of the reader with respect to ASCII or Unicode input
 * is controlled by the SetUnicodeCharacterSet() method.  By default
 * (without calling SetUnicodeCharacterSet()), the reader will expect
 * to read ASCII text and will output vtkStdString columns.  When
 * operating the reader in this default ASCII mode, use the Set and Get
 * methods whose names do not contain UTF8 to set delimiters.  If the
 * SetUnicodeCharacterSet() method is called, the reader will output
 * vtkUnicodeString columns in the output table.  In that case it is
 * necessary to use the Set and Get methods whose names contain UTF8
 * to specify delimiters.
 *
 * There is also a special character set, US-ASCII-WITH-FALLBACK, that
 * will treat the input text as ASCII no matter what.  If and when it
 * encounters a character with its 8th bit set, it will replace that
 * character with the code point ReplacementCharacter.  You may use
 * this if you have text that belongs to a code page like LATIN9 or
 * ISO-8859-1 or friends: mostly ASCII but not entirely.  Eventually
 * this class will acquire the ability to gracefully read text from
 * any code page, making this option obsolete.
 *
 * This class emits ProgressEvent for every 100 lines it reads.
 *
 * @par Thanks:
 * Thanks to Andy Wilson, Brian Wylie, Tim Shead, and Thomas Otahal
 * from Sandia National Laboratories for implementing this class.
 *
 *
 * @warning
 * This reader assumes that the first line in the file (whether that's
 * headers or the first record) contains at least as many fields as
 * any other line in the file.
 */
63 
64 #ifndef vtkDelimitedTextReader_h
65 #define vtkDelimitedTextReader_h
66 
67 #include "vtkIOInfovisModule.h" // For export macro
68 #include "vtkTableAlgorithm.h"
69 #include "vtkUnicodeString.h" // Needed for vtkUnicodeString
70 #include "vtkStdString.h" // Needed for vtkStdString
71 
72 class VTKIOINFOVIS_EXPORT vtkDelimitedTextReader : public vtkTableAlgorithm
73 {
74 public:
75   static vtkDelimitedTextReader* New();
76   vtkTypeMacro(vtkDelimitedTextReader, vtkTableAlgorithm);
77   void PrintSelf(ostream& os, vtkIndent indent) override;
78 
79   //@{
80   /**
81    * Specifies the delimited text file to be loaded.
82    */
83   vtkGetStringMacro(FileName);
84   vtkSetStringMacro(FileName);
85   //@}
86 
87   //@{
88   /**
89    * Specify the InputString for use when reading from a character array.
90    * Optionally include the length for binary strings. Note that a copy
91    * of the string is made and stored. If this causes exceedingly large
92    * memory consumption, consider using InputArray instead.
93    */
94   void SetInputString(const char *in);
95   vtkGetStringMacro(InputString);
96   void SetInputString(const char *in, int len);
97   vtkGetMacro(InputStringLength, int);
SetInputString(const vtkStdString & input)98   void SetInputString(const vtkStdString& input)
99     { this->SetInputString(input.c_str(), static_cast<int>(input.length())); }
100   //@}
101 
102   //@{
103   /**
104    * Enable reading from an InputString or InputArray instead of the default,
105    * a file.
106    */
107   vtkSetMacro(ReadFromInputString,vtkTypeBool);
108   vtkGetMacro(ReadFromInputString,vtkTypeBool);
109   vtkBooleanMacro(ReadFromInputString,vtkTypeBool);
110   //@}
111 
112   //@{
113   /**
114    * Specifies the character set used in the input file.  Valid character set
115    * names will be drawn from the list maintained by the Internet Assigned Name
116    * Authority at
117 
118    * http://www.iana.org/assignments/character-sets
119 
120    * Where multiple aliases are provided for a character set, the preferred MIME name
121    * will be used.  vtkUnicodeDelimitedTextReader currently supports "US-ASCII", "UTF-8",
122    * "UTF-16", "UTF-16BE", and "UTF-16LE" character sets.
123    */
124   vtkGetStringMacro(UnicodeCharacterSet);
125   vtkSetStringMacro(UnicodeCharacterSet);
126   //@}
127 
128   //@{
129   /**
130    * Specify the character(s) that will be used to separate records.
131    * The order of characters in the string does not matter.  Defaults
132    * to "\r\n".
133    */
134   void SetUTF8RecordDelimiters(const char* delimiters);
135   const char* GetUTF8RecordDelimiters();
136   void SetUnicodeRecordDelimiters(const vtkUnicodeString& delimiters);
137   vtkUnicodeString GetUnicodeRecordDelimiters();
138   //@}
139 
140   //@{
141   /**
142    * Specify the character(s) that will be used to separate fields.  For
143    * example, set this to "," for a comma-separated value file.  Set
144    * it to ".:;" for a file where columns can be separated by a
145    * period, colon or semicolon.  The order of the characters in the
146    * string does not matter.  Defaults to a comma.
147    */
148   vtkSetStringMacro(FieldDelimiterCharacters);
149   vtkGetStringMacro(FieldDelimiterCharacters);
150   //@}
151 
152   void SetUTF8FieldDelimiters(const char* delimiters);
153   const char* GetUTF8FieldDelimiters();
154   void SetUnicodeFieldDelimiters(const vtkUnicodeString& delimiters);
155   vtkUnicodeString GetUnicodeFieldDelimiters();
156 
157   //@{
158   /**
159    * Get/set the character that will begin and end strings.  Microsoft
160    * Excel, for example, will export the following format:
161 
162    * "First Field","Second Field","Field, With, Commas","Fourth Field"
163 
164    * The third field has a comma in it.  By using a string delimiter,
165    * this will be correctly read.  The delimiter defaults to '"'.
166    */
167   vtkGetMacro(StringDelimiter, char);
168   vtkSetMacro(StringDelimiter, char);
169   //@}
170 
171   void SetUTF8StringDelimiters(const char* delimiters);
172   const char* GetUTF8StringDelimiters();
173   void SetUnicodeStringDelimiters(const vtkUnicodeString& delimiters);
174   vtkUnicodeString GetUnicodeStringDelimiters();
175 
176   //@{
177   /**
178    * Set/get whether to use the string delimiter.  Defaults to on.
179    */
180   vtkSetMacro(UseStringDelimiter, bool);
181   vtkGetMacro(UseStringDelimiter, bool);
182   vtkBooleanMacro(UseStringDelimiter, bool);
183   //@}
184 
185   //@{
186   /**
187    * Set/get whether to treat the first line of the file as headers.
188    * The default is false (no headers).
189    */
190   vtkGetMacro(HaveHeaders, bool);
191   vtkSetMacro(HaveHeaders, bool);
192   //@}
193 
194   //@{
195   /**
196    * Set/get whether to merge successive delimiters.  Use this if (for
197    * example) your fields are separated by spaces but you don't know
198    * exactly how many.
199    */
200   vtkSetMacro(MergeConsecutiveDelimiters, bool);
201   vtkGetMacro(MergeConsecutiveDelimiters, bool);
202   vtkBooleanMacro(MergeConsecutiveDelimiters, bool);
203   //@}
204 
205   //@{
206   /**
207    * Specifies the maximum number of records to read from the file.  Limiting the
208    * number of records to read is useful for previewing the contents of a file.
209    */
210   vtkGetMacro(MaxRecords, vtkIdType);
211   vtkSetMacro(MaxRecords, vtkIdType);
212   //@}
213 
214   //@{
215   /**
216    * When set to true, the reader will detect numeric columns and create
217    * vtkDoubleArray or vtkIntArray for those instead of vtkStringArray. Default
218    * is off.
219    */
220   vtkSetMacro(DetectNumericColumns, bool);
221   vtkGetMacro(DetectNumericColumns, bool);
222   vtkBooleanMacro(DetectNumericColumns, bool);
223   //@}
224 
225   //@{
226   /**
227    * When set to true and DetectNumericColumns is also true, forces all
228    * numeric columns to vtkDoubleArray even if they contain only
229    * integer values. Default is off.
230    */
231   vtkSetMacro(ForceDouble, bool);
232   vtkGetMacro(ForceDouble, bool);
233   vtkBooleanMacro(ForceDouble, bool);
234   //@}
235 
236   //@{
237   /**
238    * When DetectNumericColumns is set to true, whether to trim whitespace from
239    * strings prior to conversion to a numeric.
240    * Default is false to preserve backward compatibility.
241 
242    * vtkVariant handles whitespace inconsistently, so trim it before we try to
243    * convert it.  For example:
244 
245    * vtkVariant("  2.0").ToDouble() == 2.0 <-- leading whitespace is not a problem
246    * vtkVariant("  2.0  ").ToDouble() == NaN <-- trailing whitespace is a problem
247    * vtkVariant("  infinity  ").ToDouble() == NaN <-- any whitespace is a problem
248 
249    * In these cases, trimming the whitespace gives us the result we expect:
250    * 2.0 and INF respectively.
251    */
252   vtkSetMacro(TrimWhitespacePriorToNumericConversion, bool);
253   vtkGetMacro(TrimWhitespacePriorToNumericConversion, bool);
254   vtkBooleanMacro(TrimWhitespacePriorToNumericConversion, bool);
255   //@}
256 
257   //@{
258   /**
259    * When DetectNumericColumns is set to true, the reader use this value to populate
260    * the vtkIntArray where empty strings are found. Default is 0.
261    */
262   vtkSetMacro(DefaultIntegerValue, int);
263   vtkGetMacro(DefaultIntegerValue, int);
264   //@}
265 
266   //@{
267   /**
268    * When DetectNumericColumns is set to true, the reader use this value to populate
269    * the vtkDoubleArray where empty strings are found. Default is 0.0
270    */
271   vtkSetMacro(DefaultDoubleValue, double);
272   vtkGetMacro(DefaultDoubleValue, double);
273   //@}
274 
275   //@{
276   /**
277    * The name of the array for generating or assigning pedigree ids
278    * (default "id").
279    */
280   vtkSetStringMacro(PedigreeIdArrayName);
281   vtkGetStringMacro(PedigreeIdArrayName);
282   //@}
283 
284   //@{
285   /**
286    * If on (default), generates pedigree ids automatically.
287    * If off, assign one of the arrays to be the pedigree id.
288    */
289   vtkSetMacro(GeneratePedigreeIds, bool);
290   vtkGetMacro(GeneratePedigreeIds, bool);
291   vtkBooleanMacro(GeneratePedigreeIds, bool);
292   //@}
293 
294   //@{
295   /**
296    * If on, assigns pedigree ids to output. Defaults to off.
297    */
298   vtkSetMacro(OutputPedigreeIds, bool);
299   vtkGetMacro(OutputPedigreeIds, bool);
300   vtkBooleanMacro(OutputPedigreeIds, bool);
301   //@}
302 
303   //@{
304   /**
305    * If on, also add in the tab (i.e. '\t') character as a field delimiter.
306    * We add this specially since applications may have a more
307    * difficult time doing this. Defaults to off.
308    */
309   vtkSetMacro(AddTabFieldDelimiter, bool);
310   vtkGetMacro(AddTabFieldDelimiter, bool);
311   vtkBooleanMacro(AddTabFieldDelimiter, bool);
312   //@}
313 
314   /**
315    * Returns a human-readable description of the most recent error, if any.
316    * Otherwise, returns an empty string.  Note that the result is only valid
317    * after calling Update().
318    */
319   vtkStdString GetLastError();
320 
321   //@{
322   /**
323    * Fallback character for use in the US-ASCII-WITH-FALLBACK
324    * character set.  Any characters that have their 8th bit set will
325    * be replaced with this code point.  Defaults to 'x'.
326    */
327   vtkSetMacro(ReplacementCharacter, vtkTypeUInt32);
328   vtkGetMacro(ReplacementCharacter, vtkTypeUInt32);
329   //@}
330 
331 protected:
332   vtkDelimitedTextReader();
333   ~vtkDelimitedTextReader() override;
334 
335   int RequestData(
336     vtkInformation*,
337     vtkInformationVector**,
338     vtkInformationVector*) override;
339 
340   char* FileName;
341   vtkTypeBool ReadFromInputString;
342   char *InputString;
343   int InputStringLength;
344   char* UnicodeCharacterSet;
345   vtkIdType MaxRecords;
346   vtkUnicodeString UnicodeRecordDelimiters;
347   vtkUnicodeString UnicodeFieldDelimiters;
348   vtkUnicodeString UnicodeStringDelimiters;
349   vtkUnicodeString UnicodeWhitespace;
350   vtkUnicodeString UnicodeEscapeCharacter;
351   bool DetectNumericColumns;
352   bool ForceDouble;
353   bool TrimWhitespacePriorToNumericConversion;
354   int DefaultIntegerValue;
355   double DefaultDoubleValue;
356   char* FieldDelimiterCharacters;
357   char StringDelimiter;
358   bool UseStringDelimiter;
359   bool HaveHeaders;
360   bool UnicodeOutputArrays;
361   bool MergeConsecutiveDelimiters;
362   char* PedigreeIdArrayName;
363   bool GeneratePedigreeIds;
364   bool OutputPedigreeIds;
365   bool AddTabFieldDelimiter;
366   vtkStdString LastError;
367   vtkTypeUInt32 ReplacementCharacter;
368 
369 private:
370   vtkDelimitedTextReader(const vtkDelimitedTextReader&) = delete;
371   void operator=(const vtkDelimitedTextReader&) = delete;
372 
373 };
374 
375 #endif
376