1 use core::fmt;
2 
3 use crate::Terminator;
4 
5 // BE ADVISED
6 //
7 // This may just be one of the more complicated CSV parsers you'll come across.
8 // The implementation never allocates and consists of both a functional NFA
9 // parser and a DFA parser. The DFA parser is the work horse and we could elide
10 // much of the work involved in making the NFA parser work, but the NFA parser
11 // is much easier to debug. The NFA parser is tested alongside the DFA parser,
12 // so they should never be out of sync.
13 //
14 // The basic structure of the implementation is to encode the NFA parser as
15 // an explicit state machine in code. The DFA is then generated by populating
16 // a transition table on the stack by exhaustively enumerating all possible
17 // states on all possible inputs (this is possible because the number of states
18 // and the number of inputs is very small).
19 //
20 // Note that some pieces of the NFA parser (such as the NFA state machine) are
21 // required. In particular, the translation from the NFA to the DFA depends on
22 // the configuration of the CSV parser as given by the caller, and indeed, this
23 // is one of the key performance benefits of the DFA: it doesn't have any
24 // overhead (other than a bigger transition table) associated with the number
25 // of configuration options.
26 //
27 // ADVICE FOR HACKERS
28 //
29 // This code is too clever for its own good. As such, changes to some parts of
30 // the code may have a non-obvious impact on other parts. This is mostly
31 // motivated by trying to keep the DFA transition table as small as possible,
32 // since it is stored on the stack. Here are some tips that may save you some
33 // time:
34 //
35 // * If you add a new NFA state, then you also need to consider how it impacts
36 //   the DFA. If all of the incoming transitions into an NFA state are
37 //   epsilon transitions, then it probably isn't materialized in the DFA.
38 //   If the NFA state indicates that a field or a record has been parsed, then
39 //   it should be considered final. Let the comments in `NfaState` be your
40 //   guide.
41 // * If you add a new configuration knob to the parser, then you may need to
42 //   modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant
43 //   indicates the total number of discriminating bytes in the DFA. And if you
44 //   modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to
45 //   add a new class. For example, in order to add parsing support for
46 //   comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment
47 //   byte (if one exists) to the list of classes in `build_dfa`.
48 // * The special DFA start state doubles as the final state once all input
49 //   from the caller has been exhausted. We must be careful to guard this
50 //   case analysis on whether the input is actually exhausted, since the start
51 //   state is an otherwise valid state.
52 
53 /// A pull based CSV reader.
54 ///
55 /// This reader parses CSV data using a finite state machine. Callers can
56 /// extract parsed data incrementally using one of the `read` methods.
57 ///
58 /// Note that this CSV reader is somewhat encoding agnostic. The source data
59 /// needs to be at least ASCII compatible. There is no support for specifying
60 /// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
61 /// any byte can be used, although callers probably want to stick to the ASCII
62 /// subset (`<= 0x7F`).
63 ///
64 /// # Usage
65 ///
/// A reader has two different ways to read CSV data, each with its own
/// trade-offs.
68 ///
69 /// * `read_field` - Copies a single CSV field into an output buffer while
70 ///   unescaping quotes. This is simple to use and doesn't require storing an
71 ///   entire record contiguously in memory, but it is slower.
72 /// * `read_record` - Copies an entire CSV record into an output buffer while
73 ///   unescaping quotes. The ending positions of each field are copied into
74 ///   an additional buffer. This is harder to use and requires larger output
75 ///   buffers, but it is faster than `read_field` since it amortizes more
76 ///   costs.
77 ///
78 /// # RFC 4180
79 ///
80 /// [RFC 4180](https://tools.ietf.org/html/rfc4180)
81 /// is the closest thing to a specification for CSV data. Unfortunately,
82 /// CSV data that is seen in the wild can vary significantly. Often, the CSV
83 /// data is outright invalid. Instead of fixing the producers of bad CSV data,
84 /// we have seen fit to make consumers much more flexible in what they accept.
85 /// This reader continues that tradition, and therefore, isn't technically
86 /// compliant with RFC 4180. In particular, this reader will never return an
87 /// error and will always find *a* parse.
88 ///
89 /// Here are some detailed differences from RFC 4180:
90 ///
91 /// * CRLF, LF and CR are each treated as a single record terminator by
92 ///   default.
93 /// * Records are permitted to be of varying length.
94 /// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// The escape byte, if one is configured. When set, this byte may be
    /// used to escape a quote inside a quoted field (instead of doubling
    /// the quote).
    escape: Option<u8>,
    /// Whether to recognize doubled quotes as an escaped quote.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number, incremented once per `\n` byte consumed.
    /// Starts at `1`.
    line: u64,
    /// Whether this parser has ever read anything. Used to decide whether a
    /// leading UTF-8 BOM should be stripped from the input.
    has_read: bool,
    /// The number of bytes of the current record already written to the
    /// caller's output buffer by previous calls. This lets field end
    /// positions be reported as if the entire record were contiguous in
    /// memory, even when a record spans multiple `read_record` calls.
    output_pos: usize,
}
130 
131 impl Default for Reader {
default() -> Reader132     fn default() -> Reader {
133         Reader {
134             dfa: Dfa::new(),
135             dfa_state: DfaState::start(),
136             nfa_state: NfaState::StartRecord,
137             delimiter: b',',
138             term: Terminator::default(),
139             quote: b'"',
140             escape: None,
141             double_quote: true,
142             comment: None,
143             quoting: true,
144             use_nfa: false,
145             line: 1,
146             has_read: false,
147             output_pos: 0,
148         }
149     }
150 }
151 
/// Builds a CSV reader with various configuration knobs.
///
/// This builder can be used to tweak the field delimiter, record terminator
/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
/// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    /// The reader under construction. `build` clones this and then compiles
    /// the DFA transition table from the configured options.
    rdr: Reader,
}
161 
impl ReaderBuilder {
    /// Create a new builder.
    pub fn new() -> ReaderBuilder {
        ReaderBuilder::default()
    }

    /// Build a CSV parser from this configuration.
    ///
    /// This also compiles the DFA transition table from the configured
    /// options, so the returned reader is ready to parse.
    pub fn build(&self) -> Reader {
        let mut rdr = self.rdr.clone();
        rdr.build_dfa();
        rdr
    }

    /// The field delimiter to use when parsing CSV.
    ///
    /// The default is `b','`.
    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
        self.rdr.delimiter = delimiter;
        self
    }

    /// The record terminator to use when parsing CSV.
    ///
    /// A record terminator can be any single byte. The default is a special
    /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
    /// or `\r\n` as a single record terminator.
    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
        self.rdr.term = term;
        self
    }

    /// The quote character to use when parsing CSV.
    ///
    /// The default is `b'"'`.
    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
        self.rdr.quote = quote;
        self
    }

    /// The escape character to use when parsing CSV.
    ///
    /// In some variants of CSV, quotes are escaped using a special escape
    /// character like `\` (instead of escaping quotes by doubling them).
    ///
    /// By default, recognizing these idiosyncratic escapes is disabled.
    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.escape = escape;
        self
    }

    /// Enable double quote escapes.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// doubled quotes are not interpreted as escapes.
    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.double_quote = yes;
        self
    }

    /// Enable or disable quoting.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// quotes are not treated specially.
    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.quoting = yes;
        self
    }

    /// The comment character to use when parsing CSV.
    ///
    /// If the start of a record begins with the byte given here, then that
    /// line is ignored by the CSV parser.
    ///
    /// This is disabled by default.
    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.comment = comment;
        self
    }

    /// A convenience method for specifying a configuration to read ASCII
    /// delimited text.
    ///
    /// This sets the delimiter and record terminator to the ASCII unit
    /// separator (`\x1F`) and record separator (`\x1E`), respectively.
    pub fn ascii(&mut self) -> &mut ReaderBuilder {
        self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
    }

    /// Enable or disable the NFA for parsing CSV.
    ///
    /// This is intended as a debugging aid: the NFA parser is easier to
    /// follow than the DFA, but it is always slower.
    #[doc(hidden)]
    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.use_nfa = yes;
        self
    }
}
260 
/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    ///
    /// The caller should call `read_field` again with more input (or with an
    /// empty buffer to signal that the CSV data is done).
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    ///
    /// The caller should call `read_field` again with a fresh (or larger)
    /// output buffer; parsing resumes where it left off.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
284 
285 impl ReadFieldResult {
from_nfa( state: NfaState, inpdone: bool, outdone: bool, ) -> ReadFieldResult286     fn from_nfa(
287         state: NfaState,
288         inpdone: bool,
289         outdone: bool,
290     ) -> ReadFieldResult {
291         match state {
292             NfaState::End => ReadFieldResult::End,
293             NfaState::EndRecord | NfaState::CRLF => {
294                 ReadFieldResult::Field { record_end: true }
295             }
296             NfaState::EndFieldDelim => {
297                 ReadFieldResult::Field { record_end: false }
298             }
299             _ => {
300                 assert!(!state.is_field_final());
301                 if !inpdone && outdone {
302                     ReadFieldResult::OutputFull
303                 } else {
304                     ReadFieldResult::InputEmpty
305                 }
306             }
307         }
308     }
309 }
310 
/// The result of parsing at most one field from CSV data while ignoring the
/// output.
///
/// This mirrors `ReadFieldResult`, except that since no output buffer is
/// written to, there is no `OutputFull` variant.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
332 
/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
353 
354 impl ReadRecordResult {
is_record(&self) -> bool355     fn is_record(&self) -> bool {
356         *self == ReadRecordResult::Record
357     }
358 
from_nfa( state: NfaState, inpdone: bool, outdone: bool, endsdone: bool, ) -> ReadRecordResult359     fn from_nfa(
360         state: NfaState,
361         inpdone: bool,
362         outdone: bool,
363         endsdone: bool,
364     ) -> ReadRecordResult {
365         match state {
366             NfaState::End => ReadRecordResult::End,
367             NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
368             _ => {
369                 assert!(!state.is_record_final());
370                 if !inpdone && outdone {
371                     ReadRecordResult::OutputFull
372                 } else if !inpdone && endsdone {
373                     ReadRecordResult::OutputEndsFull
374                 } else {
375                     ReadRecordResult::InputEmpty
376                 }
377             }
378         }
379     }
380 }
381 
/// The result of parsing at most one record from CSV data while ignoring
/// output.
///
/// This mirrors `ReadRecordResult`, except that since no output buffers are
/// written to, there are no `OutputFull` or `OutputEndsFull` variants.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
397 
/// What should be done with input bytes during an NFA transition.
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    // Do not consume an input byte.
    Epsilon,
    // Copy input byte to a caller-provided output buffer.
    CopyToOutput,
    // Consume but do not copy input byte (for example, seeing a field
    // delimiter will consume an input byte but should not copy it to the
    // output buffer).
    Discard,
}
410 
/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as DFA
/// states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states.
    //
    // NOTE(review): the explicit discriminants appear significant — the DFA
    // detects "field done" / "record done" conditions with ordered
    // comparisons against its final states, so the final states below must
    // keep the largest values among the DFA states.
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}
440 
441 /// A list of NFA states that have an explicit representation in the DFA.
442 const NFA_STATES: &'static [NfaState] = &[
443     NfaState::StartRecord,
444     NfaState::StartField,
445     NfaState::EndFieldDelim,
446     NfaState::InField,
447     NfaState::InQuotedField,
448     NfaState::InEscapedQuote,
449     NfaState::InDoubleEscapedQuote,
450     NfaState::InComment,
451     NfaState::EndRecord,
452     NfaState::CRLF,
453 ];
454 
455 impl NfaState {
456     /// Returns true if this state indicates that a field has been parsed.
is_field_final(&self) -> bool457     fn is_field_final(&self) -> bool {
458         match *self {
459             NfaState::End
460             | NfaState::EndRecord
461             | NfaState::CRLF
462             | NfaState::EndFieldDelim => true,
463             _ => false,
464         }
465     }
466 
467     /// Returns true if this state indicates that a record has been parsed.
is_record_final(&self) -> bool468     fn is_record_final(&self) -> bool {
469         match *self {
470             NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
471             _ => false,
472         }
473     }
474 }
475 
476 impl Reader {
477     /// Create a new CSV reader with a default parser configuration.
new() -> Reader478     pub fn new() -> Reader {
479         ReaderBuilder::new().build()
480     }
481 
482     /// Reset the parser such that it behaves as if it had never been used.
483     ///
484     /// This may be useful when reading CSV data in a random access pattern.
reset(&mut self)485     pub fn reset(&mut self) {
486         self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
487         self.nfa_state = NfaState::StartRecord;
488         self.line = 1;
489         self.has_read = false;
490     }
491 
    /// Return the current line number as measured by the number of occurrences
    /// of `\n`.
    ///
    /// Line numbers start at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }
507 
508     /// Parse a single CSV field in `input` and copy field data to `output`.
509     ///
510     /// This routine requires a caller provided buffer of CSV data as the
511     /// `input` and a caller provided buffer, `output`, in which to store field
512     /// data extracted from `input`. The field data copied to `output` will
513     /// have its quotes unescaped.
514     ///
515     /// Calling this routine parses at most a single field and returns
516     /// three values indicating the state of the parser. The first value, a
517     /// `ReadFieldResult`, tells the caller what to do next. For example, if
518     /// the entire input was read or if the output buffer was filled before
519     /// a full field had been read, then `ReadFieldResult::InputEmpty` or
520     /// `ReadFieldResult::OutputFull` is returned, respectively. See the
521     /// documentation for `ReadFieldResult` for more details.
522     ///
523     /// The other two values returned correspond to the number of bytes
524     /// read from `input` and written to `output`, respectively.
525     ///
526     /// # Termination
527     ///
528     /// This reader interprets an empty `input` buffer as an indication that
529     /// there is no CSV data left to read. Namely, when the caller has
530     /// exhausted all CSV data, the caller should continue to call `read` with
531     /// an empty input buffer until `ReadFieldResult::End` is returned.
532     ///
533     /// # Errors
534     ///
535     /// This CSV reader can never return an error. Instead, it prefers *a*
536     /// parse over *no* parse.
read_field( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize)537     pub fn read_field(
538         &mut self,
539         input: &[u8],
540         output: &mut [u8],
541     ) -> (ReadFieldResult, usize, usize) {
542         let (input, bom_nin) = self.strip_utf8_bom(input);
543         let (res, nin, nout) = if self.use_nfa {
544             self.read_field_nfa(input, output)
545         } else {
546             self.read_field_dfa(input, output)
547         };
548         self.has_read = true;
549         (res, nin + bom_nin, nout)
550     }
551 
552     /// Parse a single CSV record in `input` and copy each field contiguously
553     /// to `output`, with the end position of each field written to `ends`.
554     ///
555     /// **NOTE**: This method is more cumbersome to use than `read_field`, but
556     /// it can be faster since it amortizes more work.
557     ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and two caller provided buffers to store the unescaped field
    /// data (`output`) and the end position of each field in the record
    /// (`ends`).
562     ///
563     /// Calling this routine parses at most a single record and returns four
564     /// values indicating the state of the parser. The first value, a
565     /// `ReadRecordResult`, tells the caller what to do next. For example, if
566     /// the entire input was read or if the output buffer was filled before a
567     /// full field had been read, then `ReadRecordResult::InputEmpty` or
568     /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if
569     /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is
570     /// returned. See the documentation for `ReadRecordResult` for more
571     /// details.
572     ///
573     /// The other three values correspond to the number of bytes read from
574     /// `input`, the number of bytes written to `output` and the number of
575     /// end positions written to `ends`, respectively.
576     ///
577     /// The end positions written to `ends` are constructed as if there was
578     /// a single contiguous buffer in memory containing the entire row, even
579     /// if `ReadRecordResult::OutputFull` was returned in the middle of reading
580     /// a row.
581     ///
582     /// # Termination
583     ///
584     /// This reader interprets an empty `input` buffer as an indication that
585     /// there is no CSV data left to read. Namely, when the caller has
586     /// exhausted all CSV data, the caller should continue to call `read` with
587     /// an empty input buffer until `ReadRecordResult::End` is returned.
588     ///
589     /// # Errors
590     ///
591     /// This CSV reader can never return an error. Instead, it prefers *a*
592     /// parse over *no* parse.
read_record( &mut self, input: &[u8], output: &mut [u8], ends: &mut [usize], ) -> (ReadRecordResult, usize, usize, usize)593     pub fn read_record(
594         &mut self,
595         input: &[u8],
596         output: &mut [u8],
597         ends: &mut [usize],
598     ) -> (ReadRecordResult, usize, usize, usize) {
599         let (input, bom_nin) = self.strip_utf8_bom(input);
600         let (res, nin, nout, nend) = if self.use_nfa {
601             self.read_record_nfa(input, output, ends)
602         } else {
603             self.read_record_dfa(input, output, ends)
604         };
605         self.has_read = true;
606         (res, nin + bom_nin, nout, nend)
607     }
608 
609     /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that
610     /// this method will fail to strip off the BOM if only part of the BOM is
611     /// buffered. Hopefully that won't happen very often.
strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize)612     fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
613         let (input, nin) = if {
614             !self.has_read
615                 && input.len() >= 3
616                 && &input[0..3] == b"\xef\xbb\xbf"
617         } {
618             (&input[3..], 3)
619         } else {
620             (input, 0)
621         };
622         (input, nin)
623     }
624 
    /// DFA engine for `read_record`: parse at most one record from `input`,
    /// copying unescaped field bytes to `output` and field end positions to
    /// `ends`. Returns the result plus the number of bytes consumed from
    /// `input`, bytes written to `output`, and positions written to `ends`.
    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        // An empty input buffer is the caller's signal that the CSV data is
        // exhausted, so perform the final state transition.
        if input.is_empty() {
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and while
            // they'll have everything they need in `output`, they'll be
            // missing the final end position of the final field in `ends`.
            // We insert that here, but we must take care to handle the case
            // where `ends` doesn't have enough space. If it doesn't have
            // enough space, then we also can't transition to the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        // Don't commit the state transition either, so the
                        // next call retries with (hopefully) room in `ends`.
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // No progress is possible without at least one byte of room in each
        // caller-provided buffer.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        // nin/nout/nend count bytes consumed from `input`, bytes written to
        // `output` and positions written to `ends`, respectively.
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            // Branch-free line counting: a `\n` byte bumps the line number.
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            // States at or above `final_field` mean a field just ended.
            if state >= self.dfa.final_field {
                // End positions are reported as if the whole record lived in
                // one contiguous buffer, hence the `output_pos` offset.
                ends[nend] = self.output_pos + nout;
                nend += 1;
                if state > self.dfa.final_field {
                    // Strictly above `final_field` means the record itself
                    // ended, so stop here.
                    break;
                }
            }
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                // Fast path: bulk-copy a run of bytes that cannot change the
                // current DFA state (see `scan_and_copy`), skipping the
                // per-byte transition lookup.
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        if res.is_record() {
            // The record is complete; the next one starts at output offset 0.
            self.output_pos = 0;
        } else {
            // Partial record: remember how much output has already been
            // handed back so future end positions remain contiguous.
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }
704 
    /// DFA engine for `read_field`: parse at most one field from `input`,
    /// copying unescaped field bytes to `output`. Returns the result plus
    /// the number of bytes consumed from `input` and written to `output`.
    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        // An empty input buffer is the caller's signal that the CSV data is
        // exhausted, so perform the final state transition.
        if input.is_empty() {
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        // No progress is possible without room in the output buffer.
        if output.is_empty() {
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        // nin/nout count bytes consumed from `input` and written to `output`.
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            // Branch-free line counting: a `\n` byte bumps the line number.
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            // States at or above `final_field` mean the field just ended, so
            // stop and report it.
            if state >= self.dfa.final_field {
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }
749 
    /// Perform the final state transition, i.e., when the caller indicates
    /// that the input has been exhausted.
    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
        // If we've already emitted a record or think we're ready to start
        // parsing a new record, then we should sink into the final state
        // and never move from there. (pro-tip: the start state doubles as
        // the final state!)
        if state >= self.dfa.final_record || state.is_start() {
            self.dfa.new_state_final_end()
        } else {
            self.dfa.new_state_final_record()
        }
    }
763 
    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication instruction
        // when computing the next transition, we store the starting index of
        // each state's row, which would not be representable in a single byte.
        // So we'd need a `u16`, which doubles our transition table size to
        // ~5KB. This is a lot to put on the stack, even though it probably
        // fits in the L1 cache of most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are otherwise
        // indistinguishable to the DFA, so it would be OK to treat them as
        // if they were equivalent. That is, they are in the same equivalence
        // class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration knobs
        // are added to the parser with more discriminating bytes, then this
        // logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when computing
        // the next transition, microbenchmarks say that it doesn't make much
        // of a difference. Perhaps because everything fits into the L1 cache.
        self.dfa.classes.add(self.delimiter);
        if self.quoting {
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            // NOTE(review): assumed unreachable for any `Terminator` value
            // a builder can actually produce (e.g. a hidden/non-exhaustive
            // variant) — confirm against `Terminator`'s definition.
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for all
        // possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                // (Following epsilons here is what keeps epsilon-only NFA
                // states from ever materializing in the DFA.)
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        // The DFA starts where the NFA does: at the beginning of a record.
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.dfa.finish();
    }
847 
848     // The NFA implementation follows. The transition_final_nfa and
849     // transition_nfa methods are required for the DFA to operate. The
850     // rest are included for completeness (and debugging). Note that this
851     // NFA implementation is included in most of the CSV parser tests below.
852 
    /// NFA analogue of the DFA record reader: parse as much of one record
    /// from `input` as possible, copying field bytes into `output` and
    /// writing the end offset (into `output`) of each completed field to
    /// `ends`.
    ///
    /// Returns the parse result along with the number of input bytes
    /// consumed, output bytes written and field ends recorded. The parser
    /// is resumable: progress is carried in `self.nfa_state` and
    /// `self.output_pos` across calls.
    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            // Input is exhausted: take the final transition, which may
            // still yield one last record (e.g. data with no trailing
            // terminator).
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        // Don't commit the state transition; the caller
                        // must retry with room for the field end.
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    // The final field ends wherever previous calls left
                    // the output cursor.
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        // `nout` resumes from `output_pos`: a previous call may have
        // already copied part of this record into `output`.
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                // Epsilon transitions consume no input.
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                // A field just ended; record where it stopped in `output`.
                ends[nend] = nout;
                nend += 1;
                if state != NfaState::EndFieldDelim {
                    // Any final state other than "ended at a delimiter"
                    // also terminates the record.
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        // A completed record resets the output cursor; otherwise remember
        // how far into `output` we got for the next call.
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }
919 
920     #[inline(always)]
read_field_nfa( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize)921     fn read_field_nfa(
922         &mut self,
923         input: &[u8],
924         output: &mut [u8],
925     ) -> (ReadFieldResult, usize, usize) {
926         if input.is_empty() {
927             self.nfa_state = self.transition_final_nfa(self.nfa_state);
928             let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
929             return (res, 0, 0);
930         }
931         if output.is_empty() {
932             // If the output buffer is empty, then we can never make progress,
933             // so just quit now.
934             return (ReadFieldResult::OutputFull, 0, 0);
935         }
936         let (mut nin, mut nout) = (0, 0);
937         let mut state = self.nfa_state;
938         while nin < input.len() && nout < output.len() {
939             let (s, io) = self.transition_nfa(state, input[nin]);
940             match io {
941                 NfaInputAction::CopyToOutput => {
942                     output[nout] = input[nin];
943                     nout += 1;
944                     nin += 1;
945                 }
946                 NfaInputAction::Discard => {
947                     nin += 1;
948                 }
949                 NfaInputAction::Epsilon => (),
950             }
951             state = s;
952             if state.is_field_final() {
953                 break;
954             }
955         }
956         let res = ReadFieldResult::from_nfa(
957             state,
958             nin >= input.len(),
959             nout >= output.len(),
960         );
961         self.nfa_state = state;
962         (res, nin, nout)
963     }
964 
965     /// Compute the final NFA transition after all caller-provided input has
966     /// been exhausted.
967     #[inline(always)]
transition_final_nfa(&self, state: NfaState) -> NfaState968     fn transition_final_nfa(&self, state: NfaState) -> NfaState {
969         use self::NfaState::*;
970         match state {
971             End | StartRecord | EndRecord | InComment | CRLF => End,
972             StartField | EndFieldDelim | EndFieldTerm | InField
973             | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
974             | InRecordTerm => EndRecord,
975         }
976     }
977 
    /// Compute the next NFA state given the current NFA state and the current
    /// input byte.
    ///
    /// This returns the next NFA state along with an NfaInputAction that
    /// indicates what should be done with the input byte (nothing for an epsilon
    /// transition, copied to a caller provided output buffer, or discarded).
    #[inline(always)]
    fn transition_nfa(
        &self,
        state: NfaState,
        c: u8,
    ) -> (NfaState, NfaInputAction) {
        use self::NfaState::*;
        match state {
            // Terminal sink: once the parse has ended, it stays ended.
            End => (End, NfaInputAction::Epsilon),
            StartRecord => {
                if self.term.equals(c) {
                    // A terminator at the start of a record is an empty
                    // line: skip it without emitting anything.
                    (StartRecord, NfaInputAction::Discard)
                } else if self.comment == Some(c) {
                    (InComment, NfaInputAction::Discard)
                } else {
                    (StartField, NfaInputAction::Epsilon)
                }
            }
            EndRecord => (StartRecord, NfaInputAction::Epsilon),
            StartField => {
                // Branch order matters: the quote byte is checked first,
                // so it opens a quoted field even if it collides with
                // another configured special byte.
                if self.quoting && self.quote == c {
                    (InQuotedField, NfaInputAction::Discard)
                } else if self.delimiter == c {
                    // An immediate delimiter yields an empty field.
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
            InField => {
                if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    // The terminator byte itself is consumed later, in
                    // InRecordTerm.
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InQuotedField => {
                if self.quoting && self.quote == c {
                    // Could be the closing quote or the first half of a
                    // doubled ("escaped") quote; decided on the next byte.
                    (InDoubleEscapedQuote, NfaInputAction::Discard)
                } else if self.quoting && self.escape == Some(c) {
                    (InEscapedQuote, NfaInputAction::Discard)
                } else {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                }
            }
            // Whatever byte follows an escape byte is copied verbatim.
            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
            InDoubleEscapedQuote => {
                if self.quoting && self.double_quote && self.quote == c {
                    // A doubled quote: copy one literal quote byte and
                    // stay inside the quoted field.
                    (InQuotedField, NfaInputAction::CopyToOutput)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    // Data after a closing quote is treated as unquoted
                    // field data.
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InComment => {
                // Comments always run to a line feed, regardless of the
                // configured record terminator.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (InComment, NfaInputAction::Discard)
                }
            }
            InRecordTerm => {
                if self.term.is_crlf() && b'\r' == c {
                    (CRLF, NfaInputAction::Discard)
                } else {
                    (EndRecord, NfaInputAction::Discard)
                }
            }
            CRLF => {
                // After the CR of a CRLF terminator: an LF is consumed as
                // part of the terminator, while any other byte belongs to
                // the next record and is left unconsumed.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (StartRecord, NfaInputAction::Epsilon)
                }
            }
        }
    }
1069 }
1070 
/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class of
/// bytes, where every byte in the same equivalence classes is
/// indistinguishable from any other byte with respect to the DFA. For example,
/// if neither `a` nor `b` are specified as a delimiter/quote/terminator/escape,
/// then the DFA will never discriminate between `a` or `b`, so they can
/// effectively be treated as identical. This reduces storage space
/// substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can only
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;
1092 
/// The number of entries in the byte-to-equivalence-class map: one for each
/// possible input byte value. (See the comment on `TRANS_SIZE` for more
/// details on equivalence classes.)
const CLASS_SIZE: usize = 256;
1096 
/// A representation of a DFA.
///
/// For the most part, this is a transition table, but various optimizations
/// have been applied to reduce its memory footprint.
struct Dfa {
    /// The core transition table. Each row corresponds to the transitions for
    /// each input equivalence class. (Input bytes are mapped to their
    /// corresponding equivalence class with the `classes` map.)
    ///
    /// DFA states are represented as an index corresponding to the start of
    /// its row in this table.
    trans: [DfaState; TRANS_SIZE],
    /// A table with the same layout as `trans`, except its values indicate
    /// whether a particular `(state, equivalence class)` pair should emit an
    /// output byte.
    has_output: [bool; TRANS_SIZE],
    /// A map from input byte to equivalence class.
    ///
    /// This is responsible for reducing the effective alphabet size from
    /// 256 to `TRANS_CLASSES`.
    classes: DfaClasses,
    /// The DFA state corresponding to being inside an unquoted field.
    in_field: DfaState,
    /// The DFA state corresponding to being inside a quoted field.
    in_quoted: DfaState,
    /// The minimum DFA state that indicates a field has been parsed. All DFA
    /// states greater than this are also final-field states.
    final_field: DfaState,
    /// The minimum DFA state that indicates a record has been parsed. All DFA
    /// states greater than this are also final-record states.
    final_record: DfaState,
}
1129 
impl Dfa {
    /// Create a zeroed DFA. The class map and transition table are filled
    /// in later (see `build_dfa`), and the cached sentinel states are set
    /// by `finish`.
    fn new() -> Dfa {
        Dfa {
            trans: [DfaState(0); TRANS_SIZE],
            has_output: [false; TRANS_SIZE],
            classes: DfaClasses::new(),
            in_field: DfaState(0),
            in_quoted: DfaState(0),
            final_field: DfaState(0),
            final_record: DfaState(0),
        }
    }

    /// Map an NFA state to its DFA representation: the index of the first
    /// cell of its row in the transition table (NFA state number times the
    /// number of equivalence classes).
    ///
    /// The `checked_mul` guards against the row index overflowing `u8`,
    /// which would silently alias two distinct states.
    fn new_state(&self, nfa_state: NfaState) -> DfaState {
        let nclasses = self.classes.num_classes() as u8;
        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
        DfaState(idx)
    }

    /// The state to sink into once all input is exhausted and everything
    /// has been emitted. (The start state doubles as this final state.)
    fn new_state_final_end(&self) -> DfaState {
        self.new_state(NfaState::StartRecord)
    }

    /// The state indicating that the final record has just been emitted.
    fn new_state_final_record(&self) -> DfaState {
        self.new_state(NfaState::EndRecord)
    }

    /// Look up the transition out of `state` on input byte `c`. Returns
    /// the next state and whether `c` should be copied to the output.
    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
        let cls = self.classes.classes[c as usize];
        let idx = state.0 as usize + cls as usize;
        (self.trans[idx], self.has_output[idx])
    }

    /// Record the transition out of `from` on input byte `c`.
    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
        let cls = self.classes.classes[c as usize];
        let idx = from.0 as usize + cls as usize;
        self.trans[idx] = to;
        self.has_output[idx] = output;
    }

    /// Cache the DFA states that the hot paths compare against. Must run
    /// after the class map is complete, since each state's index depends
    /// on the number of classes (see `new_state`).
    fn finish(&mut self) {
        self.in_field = self.new_state(NfaState::InField);
        self.in_quoted = self.new_state(NfaState::InQuotedField);
        self.final_field = self.new_state(NfaState::EndFieldDelim);
        self.final_record = self.new_state(NfaState::EndRecord);
    }

    /// Translate a DFA state into the result of a `read_field` call.
    ///
    /// `is_final_trans` indicates that `state` came from the final
    /// transition (input exhausted); in that case landing on the start
    /// state means there is nothing left to parse.
    fn new_read_field_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        if state >= self.final_record {
            ReadFieldResult::Field { record_end: true }
        } else if state == self.final_field {
            ReadFieldResult::Field { record_end: false }
        } else if is_final_trans && state.is_start() {
            ReadFieldResult::End
        } else {
            // Stopped mid-field: report output-full only when there is
            // still input left to process.
            debug_assert!(state < self.final_field);
            if !inpdone && outdone {
                ReadFieldResult::OutputFull
            } else {
                ReadFieldResult::InputEmpty
            }
        }
    }

    /// Translate a DFA state into the result of a `read_record` call.
    /// (Same contract as `new_read_field_result`, plus `endsdone` for the
    /// caller's field-ends buffer.)
    fn new_read_record_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        if state >= self.final_record {
            ReadRecordResult::Record
        } else if is_final_trans && state.is_start() {
            ReadRecordResult::End
        } else {
            // Stopped mid-record: report whichever caller buffer filled
            // up, but only when there is still input left to process.
            debug_assert!(state < self.final_record);
            if !inpdone && outdone {
                ReadRecordResult::OutputFull
            } else if !inpdone && endsdone {
                ReadRecordResult::OutputEndsFull
            } else {
                ReadRecordResult::InputEmpty
            }
        }
    }
}
1224 
/// A map from input byte to equivalence class.
struct DfaClasses {
    /// For each of the 256 possible byte values, the equivalence class it
    /// belongs to. Class 0 is the implicit "everything else" class.
    classes: [u8; CLASS_SIZE],
    /// The next class index to hand out. Since class 0 is implicit and
    /// indices start at 1, this also equals the total number of classes.
    next_class: usize,
}
1230 
1231 impl DfaClasses {
new() -> DfaClasses1232     fn new() -> DfaClasses {
1233         DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 }
1234     }
1235 
add(&mut self, b: u8)1236     fn add(&mut self, b: u8) {
1237         if self.next_class > CLASS_SIZE {
1238             panic!("added too many classes")
1239         }
1240         self.classes[b as usize] = self.next_class as u8;
1241         self.next_class = self.next_class + 1;
1242     }
1243 
num_classes(&self) -> usize1244     fn num_classes(&self) -> usize {
1245         self.next_class as usize
1246     }
1247 
1248     /// Scan and copy the input bytes to the output buffer quickly.
1249     ///
1250     /// This assumes that the current state of the DFA is either `InField` or
1251     /// `InQuotedField`. In this case, all bytes corresponding to the first
1252     /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are
1253     /// guaranteed to never result in a state transition out of the current
1254     /// state. This function takes advantage of that copies every byte from
1255     /// `input` in the first equivalence class to `output`. Once a byte is seen
1256     /// outside the first equivalence class, we quit and should fall back to
1257     /// the main DFA loop.
1258     #[inline(always)]
scan_and_copy( &self, input: &[u8], nin: &mut usize, output: &mut [u8], nout: &mut usize, )1259     fn scan_and_copy(
1260         &self,
1261         input: &[u8],
1262         nin: &mut usize,
1263         output: &mut [u8],
1264         nout: &mut usize,
1265     ) {
1266         while *nin < input.len()
1267             && *nout < output.len()
1268             && self.classes[input[*nin] as usize] == 0
1269         {
1270             output[*nout] = input[*nin];
1271             *nin += 1;
1272             *nout += 1;
1273         }
1274     }
1275 }
1276 
/// A single DFA state.
///
/// A DFA state is represented by the starting index of its corresponding row
/// in the DFA transition table. This representation allows us to elide a
/// single multiplication instruction when computing the next transition for
/// a particular input byte.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct DfaState(u8);

impl DfaState {
    /// The start state: row 0 of the transition table. (Index 0 maps back
    /// to the NFA's `StartRecord` state, which also doubles as the final
    /// state once input is exhausted.)
    fn start() -> DfaState {
        DfaState(0)
    }

    /// Whether this is the start state.
    fn is_start(&self) -> bool {
        self.0 == 0
    }
}
1295 
1296 impl fmt::Debug for Dfa {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1297     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1298         write!(f, "Dfa(N/A)")
1299     }
1300 }
1301 
1302 impl fmt::Debug for DfaClasses {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1303     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1304         write!(
1305             f,
1306             "DfaClasses {{ classes: N/A, next_class: {:?} }}",
1307             self.next_class
1308         )
1309     }
1310 }
1311 
1312 impl Clone for Dfa {
clone(&self) -> Dfa1313     fn clone(&self) -> Dfa {
1314         let mut dfa = Dfa::new();
1315         dfa.trans.copy_from_slice(&self.trans);
1316         dfa
1317     }
1318 }
1319 
1320 impl Clone for DfaClasses {
clone(&self) -> DfaClasses1321     fn clone(&self) -> DfaClasses {
1322         let mut x = DfaClasses::new();
1323         x.classes.copy_from_slice(&self.classes);
1324         x
1325     }
1326 }
1327 
1328 #[cfg(test)]
1329 mod tests {
1330     use core::str;
1331 
1332     use arrayvec::{ArrayString, ArrayVec};
1333 
1334     use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};
1335 
    // Fixed-capacity types so the tests exercise the parser without heap
    // allocation: up to 10 rows of 10 fields of 10 bytes each.
    type Csv = ArrayVec<[Row; 10]>;
    type Row = ArrayVec<[Field; 10]>;
    type Field = ArrayString<[u8; 10]>;
1339 
1340     // OMG I HATE BYTE STRING LITERALS SO MUCH.
b(s: &str) -> &[u8]1341     fn b(s: &str) -> &[u8] {
1342         s.as_bytes()
1343     }
1344 
    // Builds a `Csv` value from nested array literals, e.g.
    // `csv![["a", "b"], ["c", "d"]]`. Panics if any row/field exceeds the
    // fixed capacities of `Csv`/`Row`/`Field`.
    macro_rules! csv {
        ($([$($field:expr),*]),*) => {{
            // The inner function gives `allow(unused_mut)` something to
            // attach to when the macro is invoked with zero rows.
            #[allow(unused_mut)]
            fn x() -> Csv {
                let mut csv = Csv::new();
                $(
                    let mut row = Row::new();
                    $(
                        row.push(Field::from($field).unwrap());
                    )*
                    csv.push(row);
                )*
                csv
            }
            x()
        }}
    }
1362 
    // Defines a `#[test]` named `$name` that parses `$data` four ways —
    // NFA and DFA, field-at-a-time and record-at-a-time — and asserts that
    // every strategy yields `$expected`. The optional `$config` closure
    // can tweak the `ReaderBuilder` before each run.
    macro_rules! parses_to {
        ($name:ident, $data:expr, $expected:expr) => {
            // No custom configuration: delegate with an identity closure.
            parses_to!($name, $data, $expected, |builder| builder);
        };
        ($name:ident, $data:expr, $expected:expr, $config:expr) => {
            #[test]
            fn $name() {
                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by field");

                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by record");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by field");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by record");
            }
        };
    }
1402 
parse_by_field(rdr: &mut Reader, data: &str) -> Csv1403     fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv {
1404         let mut data = data.as_bytes();
1405         let mut field = [0u8; 10];
1406         let mut csv = Csv::new();
1407         let mut row = Row::new();
1408         let mut outpos = 0;
1409         loop {
1410             let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]);
1411             data = &data[nin..];
1412             outpos += nout;
1413 
1414             match res {
1415                 ReadFieldResult::InputEmpty => {
1416                     if !data.is_empty() {
1417                         panic!("missing input data")
1418                     }
1419                 }
1420                 ReadFieldResult::OutputFull => panic!("field too large"),
1421                 ReadFieldResult::Field { record_end } => {
1422                     let s = str::from_utf8(&field[..outpos]).unwrap();
1423                     row.push(Field::from(s).unwrap());
1424                     outpos = 0;
1425                     if record_end {
1426                         csv.push(row);
1427                         row = Row::new();
1428                     }
1429                 }
1430                 ReadFieldResult::End => {
1431                     return csv;
1432                 }
1433             }
1434         }
1435     }
1436 
parse_by_record(rdr: &mut Reader, data: &str) -> Csv1437     fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv {
1438         use crate::ReadRecordResult::*;
1439 
1440         let mut data = data.as_bytes();
1441         let mut record = [0; 1024];
1442         let mut ends = [0; 10];
1443 
1444         let mut csv = Csv::new();
1445         let (mut outpos, mut endpos) = (0, 0);
1446         loop {
1447             let (res, nin, nout, nend) = rdr.read_record(
1448                 data,
1449                 &mut record[outpos..],
1450                 &mut ends[endpos..],
1451             );
1452             data = &data[nin..];
1453             outpos += nout;
1454             endpos += nend;
1455 
1456             match res {
1457                 InputEmpty => {
1458                     if !data.is_empty() {
1459                         panic!("missing input data")
1460                     }
1461                 }
1462                 OutputFull => panic!("record too large (out buffer)"),
1463                 OutputEndsFull => panic!("record too large (end buffer)"),
1464                 Record => {
1465                     let s = str::from_utf8(&record[..outpos]).unwrap();
1466                     let mut start = 0;
1467                     let mut row = Row::new();
1468                     for &end in &ends[..endpos] {
1469                         row.push(Field::from(&s[start..end]).unwrap());
1470                         start = end;
1471                     }
1472                     csv.push(row);
1473                     outpos = 0;
1474                     endpos = 0;
1475                 }
1476                 End => return csv,
1477             }
1478         }
1479     }
1480 
    // Single-row inputs, exercised with no terminator and with each of the
    // LF, CRLF and lone-CR record terminators.
    parses_to!(one_row_one_field, "a", csv![["a"]]);
    parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
    parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);
1493 
    // Multi-row inputs, again for each supported record terminator.
    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma,
        "a,b,\nx,y,",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_lf,
        "a,b,c\nx,y,z\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_lf,
        "a,b,\nx,y,\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_crlf,
        "a,b,c\r\nx,y,z\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_crlf,
        "a,b,\r\nx,y,\r\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_cr,
        "a,b,c\rx,y,z\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_cr,
        "a,b,\rx,y,\r",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
1538 
    // Empty lines before and after real data produce no records of their
    // own, regardless of the terminator form used.
    parses_to!(
        trailing_lines_no_record,
        "\n\n\na,b,c\nx,y,z\n\n\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_cr,
        "\r\r\ra,b,c\rx,y,z\r\r\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_crlf,
        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
1554 
    // Empty input and blank-line-only input yield no records at all, and
    // blank lines interspersed between records are skipped. This holds
    // for LF, CRLF, bare CR and mixed terminators.
    parses_to!(empty, "", csv![]);
    parses_to!(empty_lines, "\n\n\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed,
        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
    parses_to!(
        empty_lines_interspersed_crlf,
        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed_mixed,
        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
    parses_to!(
        empty_lines_interspersed_cr,
        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
1580 
    // A custom single-byte record terminator via `Terminator::Any`.
    parses_to!(
        term_weird,
        "zza,bzc,dzz",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.terminator(Terminator::Any(b'z'));
        }
    );

    // `ascii()` configures the ASCII unit separator (0x1F) as the field
    // delimiter and the record separator (0x1E) as the terminator.
    parses_to!(
        ascii_delimited,
        "a\x1fb\x1ec\x1fd",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.ascii();
        }
    );

    // A UTF-8 BOM is dropped at the very start of the stream, but is
    // ordinary field data anywhere else (mid-field or at a later field's
    // start).
    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);
1602 
    // Quoting behavior, starting with the default configuration.
    parses_to!(quote_empty, "\"\"", csv![[""]]);
    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
    parses_to!(quote_space, "\" \"", csv![[" "]]);
    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
    // Quotes are only special at the start of a field; leading spaces
    // make the whole thing (quotes included) literal data.
    parses_to!(quote_outer_space, "  \"a\"  ", csv![["  \"a\"  "]]);

    // The quote byte itself is configurable.
    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
        b.quote(b'z');
    });

    // This one is pretty hokey: the quote byte and the delimiter are the
    // same, and it isn't clear what the "right" behavior is. This just
    // pins down what the parser currently does.
    parses_to!(
        quote_delimiter,
        ",a,,b",
        csv![["a,b"]],
        |b: &mut ReaderBuilder| {
            b.quote(b',');
        }
    );

    // With no escape byte configured, a backslash is plain data.
    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
    // Doubled-quote unescaping disabled.
    parses_to!(
        quote_escapes_no_double,
        r#""a""b""#,
        csv![[r#"a"b""#]],
        |b: &mut ReaderBuilder| {
            b.double_quote(false);
        }
    );
    // Backslash escaping enabled: `\"` becomes a literal quote.
    parses_to!(
        quote_escapes,
        r#""a\"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\'));
        }
    );
    // The escape byte itself is configurable.
    parses_to!(
        quote_escapes_change,
        r#""az"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'z'));
        }
    );

    // Escaped quotes keep a delimiter inside a single field.
    parses_to!(
        quote_escapes_with_comma,
        r#""\"A,B\"""#,
        csv![[r#""A,B""#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\')).double_quote(false);
        }
    );

    // With quoting disabled entirely, quote bytes are plain data and the
    // comma still splits the input into two fields.
    parses_to!(
        quoting_disabled,
        r#""abc,foo""#,
        csv![[r#""abc"#, r#"foo""#]],
        |b: &mut ReaderBuilder| {
            b.quoting(false);
        }
    );
1667 
    // The field delimiter is configurable: tab-separated and an
    // arbitrary byte.
    parses_to!(
        delimiter_tabs,
        "a\tb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'\t');
        }
    );
    parses_to!(
        delimiter_weird,
        "azb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'z');
        }
    );
1684 
    // A second record following a terminated first record is parsed
    // (LF and CRLF variants; the `_1` case uses plain LF).
    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);

    // Comment handling: a line is skipped only when the comment byte is
    // the very first byte of the line.
    parses_to!(
        comment_1,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    // A leading space means the '#' is not at line start, so the line is
    // kept as data.
    parses_to!(
        comment_2,
        "foo\n # hi\nbar\n",
        csv![["foo"], [" # hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    // When the configured comment byte is the line terminator itself,
    // '#' is not special and the "# hi" line survives as a record.
    parses_to!(
        comment_3,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["# hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'\n'));
        }
    );
    // A comment byte in the middle of a field is plain data.
    parses_to!(
        comment_4,
        "foo,b#ar,baz",
        csv![["foo", "b#ar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    // Even at the start of a field (but not of a line), it's plain data.
    parses_to!(
        comment_5,
        "foo,#bar,baz",
        csv![["foo", "#bar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
1728 
    // Asserts a single `read_field` call on `$rdr`: feeds `$input` and
    // `$output`, then checks the number of input bytes consumed
    // (`$expect_in`), the number of output bytes written (`$expect_out`)
    // and the returned result (`$expect_res`), in that order.
    macro_rules! assert_read {
        (
            $rdr:expr, $input:expr, $output:expr,
            $expect_in:expr, $expect_out:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout) = $rdr.read_field($input, $output);
            assert_eq!($expect_in, nin);
            assert_eq!($expect_out, nout);
            assert_eq!($expect_res, res);
        }};
    }
1740 
1741     // This tests that feeding a new reader with an empty buffer sends us
1742     // straight to End.
1743     #[test]
stream_empty()1744     fn stream_empty() {
1745         use crate::ReadFieldResult::*;
1746 
1747         let mut rdr = Reader::new();
1748         assert_read!(rdr, &[], &mut [], 0, 0, End);
1749     }
1750 
1751     // Test that a single space is treated as a single field.
1752     #[test]
stream_space()1753     fn stream_space() {
1754         use crate::ReadFieldResult::*;
1755 
1756         let mut rdr = Reader::new();
1757         assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
1758         assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
1759         assert_read!(rdr, &[], &mut [0], 0, 0, End);
1760     }
1761 
    // Test that a single comma produces two empty fields: the delimiter
    // ends the first (empty) field immediately, then EOF ends the second
    // empty field along with the record.
    #[test]
    fn stream_comma() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }
1772 
1773     // Test that we can read a single large field in multiple output
1774     // buffers.
1775     #[test]
stream_output_chunks()1776     fn stream_output_chunks() {
1777         use crate::ReadFieldResult::*;
1778 
1779         let mut inp = b("fooquux");
1780         let out = &mut [0; 2];
1781         let mut rdr = Reader::new();
1782 
1783         assert_read!(rdr, inp, out, 2, 2, OutputFull);
1784         assert_eq!(out, b("fo"));
1785         inp = &inp[2..];
1786 
1787         assert_read!(rdr, inp, out, 2, 2, OutputFull);
1788         assert_eq!(out, b("oq"));
1789         inp = &inp[2..];
1790 
1791         assert_read!(rdr, inp, out, 2, 2, OutputFull);
1792         assert_eq!(out, b("uu"));
1793         inp = &inp[2..];
1794 
1795         assert_read!(rdr, inp, out, 1, 1, InputEmpty);
1796         assert_eq!(&out[..1], b("x"));
1797         inp = &inp[1..];
1798         assert!(inp.is_empty());
1799 
1800         assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1801         assert_read!(rdr, inp, out, 0, 0, End);
1802     }
1803 
    // Test that we can read a single large field across multiple input
    // buffers: each chunk is consumed fully and appended to successive
    // output sub-slices; EOF then finishes the field and the stream.
    #[test]
    fn stream_input_chunks() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fooq"));

        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
        assert_eq!(&out[..6], b("fooquu"));

        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
        assert_eq!(&out[..7], b("fooquux"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }
1828 
    // Test we can read doubled quotes correctly in a stream: the quote
    // ending the first chunk and the quote starting the second chunk
    // together form a doubled (escaped) quote, so a single '"' is
    // emitted even though the pair straddles the chunk boundary.
    #[test]
    fn stream_doubled_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }
1846 
    // Test we can read escaped quotes correctly in a stream: with '\\'
    // configured as the escape byte, the backslash ending the first
    // chunk escapes the quote starting the second chunk, emitting a
    // single '"' across the chunk boundary.
    #[test]
    fn stream_escaped_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut builder = ReaderBuilder::new();
        let mut rdr = builder.escape(Some(b'\\')).build();

        assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }
1865 
1866     // Test that empty output buffers don't wreak havoc.
1867     #[test]
stream_empty_output()1868     fn stream_empty_output() {
1869         use crate::ReadFieldResult::*;
1870 
1871         let out = &mut [0; 10];
1872         let mut rdr = Reader::new();
1873 
1874         assert_read!(
1875             rdr,
1876             b("foo,bar"),
1877             out,
1878             4,
1879             3,
1880             Field { record_end: false }
1881         );
1882         assert_eq!(&out[..3], b("foo"));
1883 
1884         assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);
1885 
1886         assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
1887         assert_eq!(&out[..3], b("bar"));
1888 
1889         assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1890         assert_read!(rdr, &[], out, 0, 0, End);
1891     }
1892 
    // Test that we can reset the parser mid-stream and count on it to do
    // the right thing.
    #[test]
    fn reset_works() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // Start an (unterminated) quoted field.
        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
        assert_eq!(&out[..3], b("foo"));

        // Without resetting the parser state, the reader will remember that
        // we're in a quoted field, and therefore interpret the leading double
        // quotes below as a single quote and the trailing quote as a matching
        // terminator. With the reset, however, the parser forgets the quoted
        // field and treats the leading double quotes as a syntax quirk and
        // drops them, in addition to hanging on to the trailing unmatched
        // quote. (Matches Python's behavior.)
        rdr.reset();

        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
        assert_eq!(&out[..4], b("bar\""));
    }
1917 
    // Test the line number reporting is correct: lines are 1-indexed,
    // each consumed terminator bumps the count, and reaching End leaves
    // it unchanged.
    #[test]
    fn line_numbers() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // A fresh reader starts on line 1.
        assert_eq!(1, rdr.line());

        // Four empty lines advance the counter by four.
        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
        assert_eq!(5, rdr.line());

        // A field delimiter does not advance the line count.
        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
        assert_eq!(5, rdr.line());

        // A record terminator does.
        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
        assert_eq!(6, rdr.line());

        assert_read!(rdr, &[], &mut [0], 0, 0, End);
        assert_eq!(6, rdr.line());
    }
1940 
    // Asserts a single `read_record` call on `$rdr`: checks the returned
    // result, the number of input bytes consumed, the number of output
    // bytes written, and how many field-end offsets were recorded in
    // `$ends`. Each assertion is labeled for easier failure diagnosis.
    macro_rules! assert_read_record {
        (
            $rdr:expr, $input:expr, $output:expr, $ends:expr,
            $expect_in:expr, $expect_out:expr,
            $expect_end:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout, nend) =
                $rdr.read_record($input, $output, $ends);
            assert_eq!($expect_res, res, "result");
            assert_eq!($expect_in, nin, "input");
            assert_eq!($expect_out, nout, "output");
            assert_eq!($expect_end, nend, "ends");
        }};
    }
1955 
1956     // Test that we can incrementally read a record.
1957     #[test]
stream_record()1958     fn stream_record() {
1959         use crate::ReadRecordResult::*;
1960 
1961         let mut inp = b("foo,bar\nbaz");
1962         let out = &mut [0; 1024];
1963         let ends = &mut [0; 10];
1964         let mut rdr = Reader::new();
1965 
1966         assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
1967         assert_eq!(ends[0], 3);
1968         assert_eq!(ends[1], 6);
1969         inp = &inp[8..];
1970 
1971         assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
1972         inp = &inp[3..];
1973 
1974         assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
1975         assert_eq!(ends[0], 3);
1976 
1977         assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
1978     }
1979 
    // Test that if our output ends are full during the last read that
    // we get an appropriate state returned: OutputEndsFull with nothing
    // consumed, after which retrying with room succeeds.
    #[test]
    fn stream_record_last_end_output_full() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        // No room for the final field's end offset: nothing happens.
        assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
        // Retrying with a real ends buffer completes the record.
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }
2005 }
2006