1 use core::fmt; 2 3 use crate::Terminator; 4 5 // BE ADVISED 6 // 7 // This may just be one of the more complicated CSV parsers you'll come across. 8 // The implementation never allocates and consists of both a functional NFA 9 // parser and a DFA parser. The DFA parser is the work horse and we could elide 10 // much of the work involved in making the NFA parser work, but the NFA parser 11 // is much easier to debug. The NFA parser is tested alongside the DFA parser, 12 // so they should never be out of sync. 13 // 14 // The basic structure of the implementation is to encode the NFA parser as 15 // an explicit state machine in code. The DFA is then generated by populating 16 // a transition table on the stack by exhaustively enumerating all possible 17 // states on all possible inputs (this is possible because the number of states 18 // and the number of inputs is very small). 19 // 20 // Note that some pieces of the NFA parser (such as the NFA state machine) are 21 // required. In particular, the translation from the NFA to the DFA depends on 22 // the configuration of the CSV parser as given by the caller, and indeed, this 23 // is one of the key performance benefits of the DFA: it doesn't have any 24 // overhead (other than a bigger transition table) associated with the number 25 // of configuration options. 26 // 27 // ADVICE FOR HACKERS 28 // 29 // This code is too clever for its own good. As such, changes to some parts of 30 // the code may have a non-obvious impact on other parts. This is mostly 31 // motivated by trying to keep the DFA transition table as small as possible, 32 // since it is stored on the stack. Here are some tips that may save you some 33 // time: 34 // 35 // * If you add a new NFA state, then you also need to consider how it impacts 36 // the DFA. If all of the incoming transitions into an NFA state are 37 // epsilon transitions, then it probably isn't materialized in the DFA. 
38 // If the NFA state indicates that a field or a record has been parsed, then 39 // it should be considered final. Let the comments in `NfaState` be your 40 // guide. 41 // * If you add a new configuration knob to the parser, then you may need to 42 // modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant 43 // indicates the total number of discriminating bytes in the DFA. And if you 44 // modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to 45 // add a new class. For example, in order to add parsing support for 46 // comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment 47 // byte (if one exists) to the list of classes in `build_dfa`. 48 // * The special DFA start state doubles as the final state once all input 49 // from the caller has been exhausted. We must be careful to guard this 50 // case analysis on whether the input is actually exhausted, since the start 51 // state is an otherwise valid state. 52 53 /// A pull based CSV reader. 54 /// 55 /// This reader parses CSV data using a finite state machine. Callers can 56 /// extract parsed data incrementally using one of the `read` methods. 57 /// 58 /// Note that this CSV reader is somewhat encoding agnostic. The source data 59 /// needs to be at least ASCII compatible. There is no support for specifying 60 /// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead, 61 /// any byte can be used, although callers probably want to stick to the ASCII 62 /// subset (`<= 0x7F`). 63 /// 64 /// # Usage 65 /// 66 /// A reader has two different ways to read CSV data, each with their own 67 /// trade offs. 68 /// 69 /// * `read_field` - Copies a single CSV field into an output buffer while 70 /// unescaping quotes. This is simple to use and doesn't require storing an 71 /// entire record contiguously in memory, but it is slower. 72 /// * `read_record` - Copies an entire CSV record into an output buffer while 73 /// unescaping quotes. 
///   The ending positions of each field are copied into
///   an additional buffer. This is harder to use and requires larger output
///   buffers, but it is faster than `read_field` since it amortizes more
///   costs.
///
/// # RFC 4180
///
/// [RFC 4180](https://tools.ietf.org/html/rfc4180)
/// is the closest thing to a specification for CSV data. Unfortunately,
/// CSV data that is seen in the wild can vary significantly. Often, the CSV
/// data is outright invalid. Instead of fixing the producers of bad CSV data,
/// we have seen fit to make consumers much more flexible in what they accept.
/// This reader continues that tradition, and therefore, isn't technically
/// compliant with RFC 4180. In particular, this reader will never return an
/// error and will always find *a* parse.
///
/// Here are some detailed differences from RFC 4180:
///
/// * CRLF, LF and CR are each treated as a single record terminator by
///   default.
/// * Records are permitted to be of varying length.
/// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// The escape character, if one is enabled. When set, quotes may be
    /// escaped by preceding them with this byte (instead of doubling them).
    escape: Option<u8>,
    /// Whether to recognize doubled quotes.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number, as measured by occurrences of `\n`.
    line: u64,
    /// Whether this parser has ever read anything.
    ///
    /// Used to decide whether a leading UTF-8 BOM should be stripped.
    has_read: bool,
    /// The current position in the output buffer when reading a record.
    ///
    /// Carried across calls so that field end positions in `ends` are
    /// reported as if the whole record were one contiguous buffer.
    output_pos: usize,
}

impl Default for Reader {
    fn default() -> Reader {
        Reader {
            dfa: Dfa::new(),
            dfa_state: DfaState::start(),
            nfa_state: NfaState::StartRecord,
            delimiter: b',',
            term: Terminator::default(),
            quote: b'"',
            escape: None,
            double_quote: true,
            comment: None,
            quoting: true,
            use_nfa: false,
            line: 1,
            has_read: false,
            output_pos: 0,
        }
    }
}

/// Builds a CSV reader with various configuration knobs.
///
/// This builder can be used to tweak the field delimiter, record terminator
/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
/// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    rdr: Reader,
}
impl ReaderBuilder {
    /// Create a new builder.
    pub fn new() -> ReaderBuilder {
        ReaderBuilder::default()
    }

    /// Build a CSV parser from this configuration.
    pub fn build(&self) -> Reader {
        // The DFA transition table is derived from the configuration, so it
        // must be (re)built from scratch for every new reader.
        let mut rdr = self.rdr.clone();
        rdr.build_dfa();
        rdr
    }

    /// The field delimiter to use when parsing CSV.
    ///
    /// The default is `b','`.
    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
        self.rdr.delimiter = delimiter;
        self
    }

    /// The record terminator to use when parsing CSV.
    ///
    /// A record terminator can be any single byte. The default is a special
    /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
    /// or `\r\n` as a single record terminator.
    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
        self.rdr.term = term;
        self
    }

    /// The quote character to use when parsing CSV.
    ///
    /// The default is `b'"'`.
    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
        self.rdr.quote = quote;
        self
    }

    /// The escape character to use when parsing CSV.
    ///
    /// In some variants of CSV, quotes are escaped using a special escape
    /// character like `\` (instead of escaping quotes by doubling them).
    ///
    /// By default, recognizing these idiosyncratic escapes is disabled.
    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.escape = escape;
        self
    }

    /// Enable double quote escapes.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// doubled quotes are not interpreted as escapes.
    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.double_quote = yes;
        self
    }

    /// Enable or disable quoting.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// quotes are not treated specially.
    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.quoting = yes;
        self
    }

    /// The comment character to use when parsing CSV.
    ///
    /// If the start of a record begins with the byte given here, then that
    /// line is ignored by the CSV parser.
    ///
    /// This is disabled by default.
    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.comment = comment;
        self
    }

    /// A convenience method for specifying a configuration to read ASCII
    /// delimited text.
    ///
    /// This sets the delimiter and record terminator to the ASCII unit
    /// separator (`\x1F`) and record separator (`\x1E`), respectively.
    pub fn ascii(&mut self) -> &mut ReaderBuilder {
        self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
    }

    /// Enable or disable the NFA for parsing CSV.
    ///
    /// This is intended to be an option useful for debugging. The NFA
    /// is always slower than the DFA.
    #[doc(hidden)]
    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.use_nfa = yes;
        self
    }
}

/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
impl ReadFieldResult {
    /// Translate a final NFA state (plus buffer-exhaustion flags) into the
    /// result reported to the caller of `read_field`.
    fn from_nfa(
        state: NfaState,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        match state {
            NfaState::End => ReadFieldResult::End,
            NfaState::EndRecord | NfaState::CRLF => {
                ReadFieldResult::Field { record_end: true }
            }
            NfaState::EndFieldDelim => {
                ReadFieldResult::Field { record_end: false }
            }
            _ => {
                // Any other state means we stopped because a buffer ran out,
                // not because a field completed.
                assert!(!state.is_field_final());
                if !inpdone && outdone {
                    ReadFieldResult::OutputFull
                } else {
                    ReadFieldResult::InputEmpty
                }
            }
        }
    }
}

/// The result of parsing at most one field from CSV data while ignoring the
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

impl ReadRecordResult {
    /// Returns true if this result indicates a complete record was parsed.
    fn is_record(&self) -> bool {
        *self == ReadRecordResult::Record
    }

    /// Translate a final NFA state (plus buffer-exhaustion flags) into the
    /// result reported to the caller of `read_record`.
    fn from_nfa(
        state: NfaState,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        match state {
            NfaState::End => ReadRecordResult::End,
            NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
            _ => {
                // Any other state means we stopped because a buffer ran out,
                // not because a record completed.
                assert!(!state.is_record_final());
                if !inpdone && outdone {
                    ReadRecordResult::OutputFull
                } else if !inpdone && endsdone {
                    ReadRecordResult::OutputEndsFull
                } else {
                    ReadRecordResult::InputEmpty
                }
            }
        }
    }
}

/// The result of parsing at most one record from CSV data while ignoring
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

/// What should be done with input bytes during an NFA transition
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    // Do not consume an input byte
    Epsilon,
    // Copy input byte to a caller-provided output buffer
    CopyToOutput,
    // Consume but do not copy input byte (for example, seeing a field
    // delimiter will consume an input byte but should not copy it to the
    // output buffer).
    Discard,
}
/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as DFA
/// states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states. The explicit discriminants double as
    // row indices into the DFA transition table, so their order matters:
    // see `is_field_final`/`is_record_final` and the `final_field` /
    // `final_record` comparisons in the DFA read loops.
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}

/// A list of NFA states that have an explicit representation in the DFA.
const NFA_STATES: &'static [NfaState] = &[
    NfaState::StartRecord,
    NfaState::StartField,
    NfaState::EndFieldDelim,
    NfaState::InField,
    NfaState::InQuotedField,
    NfaState::InEscapedQuote,
    NfaState::InDoubleEscapedQuote,
    NfaState::InComment,
    NfaState::EndRecord,
    NfaState::CRLF,
];

impl NfaState {
    /// Returns true if this state indicates that a field has been parsed.
    fn is_field_final(&self) -> bool {
        match *self {
            NfaState::End
            | NfaState::EndRecord
            | NfaState::CRLF
            | NfaState::EndFieldDelim => true,
            _ => false,
        }
    }

    /// Returns true if this state indicates that a record has been parsed.
    fn is_record_final(&self) -> bool {
        match *self {
            NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
            _ => false,
        }
    }
}
impl Reader {
    /// Create a new CSV reader with a default parser configuration.
    pub fn new() -> Reader {
        ReaderBuilder::new().build()
    }

    /// Reset the parser such that it behaves as if it had never been used.
    ///
    /// This may be useful when reading CSV data in a random access pattern.
    pub fn reset(&mut self) {
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.nfa_state = NfaState::StartRecord;
        self.line = 1;
        self.has_read = false;
    }

    /// Return the current line number as measured by the number of
    /// occurrences of `\n`.
    ///
    /// Line numbers starts at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }

    /// Parse a single CSV field in `input` and copy field data to `output`.
    ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and a caller provided buffer, `output`, in which to store
    /// field data extracted from `input`. The field data copied to `output`
    /// will have its quotes unescaped.
    ///
    /// Calling this routine parses at most a single field and returns
    /// three values indicating the state of the parser. The first value, a
    /// `ReadFieldResult`, tells the caller what to do next. For example, if
    /// the entire input was read or if the output buffer was filled before
    /// a full field had been read, then `ReadFieldResult::InputEmpty` or
    /// `ReadFieldResult::OutputFull` is returned, respectively. See the
    /// documentation for `ReadFieldResult` for more details.
    ///
    /// The other two values returned correspond to the number of bytes
    /// read from `input` and written to `output`, respectively.
    ///
    /// # Termination
    ///
    /// This reader interprets an empty `input` buffer as an indication that
    /// there is no CSV data left to read. Namely, when the caller has
    /// exhausted all CSV data, the caller should continue to call `read` with
    /// an empty input buffer until `ReadFieldResult::End` is returned.
    ///
    /// # Errors
    ///
    /// This CSV reader can never return an error. Instead, it prefers *a*
    /// parse over *no* parse.
    pub fn read_field(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        let (input, bom_nin) = self.strip_utf8_bom(input);
        let (res, nin, nout) = if self.use_nfa {
            self.read_field_nfa(input, output)
        } else {
            self.read_field_dfa(input, output)
        };
        self.has_read = true;
        // Report bytes consumed relative to the caller's original buffer,
        // including any BOM bytes we skipped.
        (res, nin + bom_nin, nout)
    }

    /// Parse a single CSV record in `input` and copy each field contiguously
    /// to `output`, with the end position of each field written to `ends`.
    ///
    /// **NOTE**: This method is more cumbersome to use than `read_field`,
    /// but it can be faster since it amortizes more work.
    ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and two caller provided buffers to store the unescaped field
    /// data (`output`) and the end position of each field in the record
    /// (`ends`).
    ///
    /// Calling this routine parses at most a single record and returns four
    /// values indicating the state of the parser. The first value, a
    /// `ReadRecordResult`, tells the caller what to do next. For example, if
    /// the entire input was read or if the output buffer was filled before a
    /// full field had been read, then `ReadRecordResult::InputEmpty` or
    /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly,
    /// if the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull`
    /// is returned. See the documentation for `ReadRecordResult` for more
    /// details.
    ///
    /// The other three values correspond to the number of bytes read from
    /// `input`, the number of bytes written to `output` and the number of
    /// end positions written to `ends`, respectively.
    ///
    /// The end positions written to `ends` are constructed as if there was
    /// a single contiguous buffer in memory containing the entire row, even
    /// if `ReadRecordResult::OutputFull` was returned in the middle of
    /// reading a row.
    ///
    /// # Termination
    ///
    /// This reader interprets an empty `input` buffer as an indication that
    /// there is no CSV data left to read. Namely, when the caller has
    /// exhausted all CSV data, the caller should continue to call `read` with
    /// an empty input buffer until `ReadRecordResult::End` is returned.
    ///
    /// # Errors
    ///
    /// This CSV reader can never return an error. Instead, it prefers *a*
    /// parse over *no* parse.
    pub fn read_record(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        let (input, bom_nin) = self.strip_utf8_bom(input);
        let (res, nin, nout, nend) = if self.use_nfa {
            self.read_record_nfa(input, output, ends)
        } else {
            self.read_record_dfa(input, output, ends)
        };
        self.has_read = true;
        // Report bytes consumed relative to the caller's original buffer,
        // including any BOM bytes we skipped.
        (res, nin + bom_nin, nout, nend)
    }

    /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that
    /// this method will fail to strip off the BOM if only part of the BOM is
    /// buffered. Hopefully that won't happen very often.
    fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
        let (input, nin) = if {
            !self.has_read
                && input.len() >= 3
                && &input[0..3] == b"\xef\xbb\xbf"
        } {
            (&input[3..], 3)
        } else {
            (input, 0)
        };
        (input, nin)
    }

    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and while
            // they'll have everything they need in `output`, they'll be
            // missing the final end position of the final field in `ends`.
            // We insert that here, but we must take care to handle the case
            // where `ends` doesn't have enough space. If it doesn't have
            // enough space, then we also can't transition to the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            // Branchless line counting: the comparison is 0 or 1.
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                // End positions are reported as if the record were one
                // contiguous buffer, so add the carried `output_pos`.
                ends[nend] = self.output_pos + nout;
                nend += 1;
                if state > self.dfa.final_field {
                    // A "final record" state: stop, the record is done.
                    break;
                }
            }
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                // Fast path: bulk-copy a run of non-discriminating bytes
                // without consulting the transition table per byte.
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        if res.is_record() {
            self.output_pos = 0;
        } else {
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }
    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            // Input exhausted: take the final transition and report it.
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        if output.is_empty() {
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                // A field (or record) has been fully parsed.
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }

    /// Perform the final state transition, i.e., when the caller indicates
    /// that the input has been exhausted.
    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
        // If we've already emitted a record or think we're ready to start
        // parsing a new record, then we should sink into the final state
        // and never move from there. (pro-tip: the start state doubles as
        // the final state!)
        if state >= self.dfa.final_record || state.is_start() {
            self.dfa.new_state_final_end()
        } else {
            self.dfa.new_state_final_record()
        }
    }

    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication instruction
        // when computing the next transition, we store the starting index of
        // each state's row, which would not be representable in a single
        // byte. So we'd need a `u16`, which doubles our transition table size
        // to ~5KB. This is a lot to put on the stack, even though it probably
        // fits in the L1 cache of most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are otherwise
        // indistinguishable to the DFA, so it would be OK to treat them as
        // if they were equivalent. That is, they are in the same equivalence
        // class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration knobs
        // are added to the parser with more discriminating bytes, then this
        // logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when
        // computing the next transition, microbenchmarks say that it doesn't
        // make much of a difference. Perhaps because everything fits into the
        // L1 cache.
        self.dfa.classes.add(self.delimiter);
        if self.quoting {
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for all
        // possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.dfa.finish();
    }

    // The NFA implementation follows. The transition_final_nfa and
    // transition_nfa methods are required for the DFA to operate. The
    // rest are included for completeness (and debugging). Note that this
    // NFA implementation is included in most of the CSV parser tests below.

    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            // As in the DFA variant: on the final record we must still
            // report the last field's end position, and we can't advance
            // state if `ends` has no room for it.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        // Note: `nout` starts at `output_pos` so that end positions reflect
        // the record as one contiguous buffer across calls.
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                ends[nend] = nout;
                nend += 1;
                if state != NfaState::EndFieldDelim {
                    // A record-final state: the record is done.
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }

    #[inline(always)]
    fn read_field_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            self.nfa_state = self.transition_final_nfa(self.nfa_state);
            let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
            return (res, 0, 0);
        }
        if output.is_empty() {
            // If the output buffer is empty, then we can never make progress,
            // so just quit now.
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                NfaInputAction::Epsilon => (),
            }
            state = s;
            if state.is_field_final() {
                break;
            }
        }
        let res = ReadFieldResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.nfa_state = state;
        (res, nin, nout)
    }

    /// Compute the final NFA transition after all caller-provided input has
    /// been exhausted.
    #[inline(always)]
    fn transition_final_nfa(&self, state: NfaState) -> NfaState {
        use self::NfaState::*;
        match state {
            End | StartRecord | EndRecord | InComment | CRLF => End,
            StartField | EndFieldDelim | EndFieldTerm | InField
            | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
            | InRecordTerm => EndRecord,
        }
    }

    /// Compute the next NFA state given the current NFA state and the
    /// current input byte.
    ///
    /// This returns the next NFA state along with an NfaInputAction that
    /// indicates what should be done with the input byte (nothing for an
    /// epsilon transition, copied to a caller provided output buffer, or
    /// discarded).
    #[inline(always)]
    fn transition_nfa(
        &self,
        state: NfaState,
        c: u8,
    ) -> (NfaState, NfaInputAction) {
        use self::NfaState::*;
        match state {
            // The end state absorbs all input.
            End => (End, NfaInputAction::Epsilon),
            StartRecord => {
                if self.term.equals(c) {
                    // Empty lines preceding a record are skipped entirely.
                    (StartRecord, NfaInputAction::Discard)
                } else if self.comment == Some(c) {
                    (InComment, NfaInputAction::Discard)
                } else {
                    (StartField, NfaInputAction::Epsilon)
                }
            }
            EndRecord => (StartRecord, NfaInputAction::Epsilon),
            StartField => {
                // NOTE: the quote check precedes the delimiter/terminator
                // checks, so if those bytes collide, quoting wins.
                if self.quoting && self.quote == c {
                    (InQuotedField, NfaInputAction::Discard)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
            InField => {
                if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InQuotedField => {
                if self.quoting && self.quote == c {
                    // Could be the closing quote or the first half of a
                    // doubled ("") escape; the next byte decides.
                    (InDoubleEscapedQuote, NfaInputAction::Discard)
                } else if self.quoting && self.escape == Some(c) {
                    (InEscapedQuote, NfaInputAction::Discard)
                } else {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                }
            }
            // The byte following an escape is copied verbatim.
            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
            InDoubleEscapedQuote => {
                if self.quoting && self.double_quote && self.quote == c {
                    // It was a doubled quote: emit a single quote byte.
                    (InQuotedField, NfaInputAction::CopyToOutput)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InComment => {
                // Comments always run to the next line feed, independent of
                // the configured record terminator.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (InComment, NfaInputAction::Discard)
                }
            }
            InRecordTerm => {
                if self.term.is_crlf() && b'\r' == c {
                    (CRLF, NfaInputAction::Discard)
                } else {
                    (EndRecord, NfaInputAction::Discard)
                }
            }
            CRLF => {
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    // A lone `\r` terminator: do not consume this byte, it
                    // belongs to the next record.
                    (StartRecord, NfaInputAction::Epsilon)
                }
            }
        }
    }
}

/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class of
/// bytes, where every byte in the same equivalence class is
/// indistinguishable from any other byte with respect to the DFA. For example,
/// if neither `a` nor `b` are specified as a delimiter/quote/terminator/escape,
/// then the DFA will never discriminate between `a` or `b`, so they can
/// effectively be treated as identical. This reduces storage space
/// substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can only
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The size of the input alphabet, i.e., the size of the byte-to-equivalence
/// class map. (See the comment on `TRANS_SIZE` for more details.)
const CLASS_SIZE: usize = 256;

/// A representation of a DFA.
///
/// For the most part, this is a transition table, but various optimizations
/// have been applied to reduce its memory footprint.
struct Dfa {
    /// The core transition table. Each row corresponds to the transitions for
    /// each input equivalence class. (Input bytes are mapped to their
    /// corresponding equivalence class with the `classes` map.)
    ///
    /// DFA states are represented as an index corresponding to the start of
    /// its row in this table.
    trans: [DfaState; TRANS_SIZE],
    /// A table with the same layout as `trans`, except its values indicate
    /// whether a particular `(state, equivalence class)` pair should emit an
    /// output byte.
    has_output: [bool; TRANS_SIZE],
    /// A map from input byte to equivalence class.
    ///
    /// This is responsible for reducing the effective alphabet size from
    /// 256 to `TRANS_CLASSES`.
    classes: DfaClasses,
    /// The DFA state corresponding to being inside an unquoted field.
    in_field: DfaState,
    /// The DFA state corresponding to being inside an quoted field.
    in_quoted: DfaState,
    /// The minimum DFA state that indicates a field has been parsed. All DFA
    /// states greater than this are also final-field states.
    final_field: DfaState,
    /// The minimum DFA state that indicates a record has been parsed. All DFA
    /// states greater than this are also final-record states.
    final_record: DfaState,
}

impl Dfa {
    /// Create an empty DFA. The transition table is filled in later via
    /// `set`, and the cached special states via `finish`.
    fn new() -> Dfa {
        Dfa {
            trans: [DfaState(0); TRANS_SIZE],
            has_output: [false; TRANS_SIZE],
            classes: DfaClasses::new(),
            in_field: DfaState(0),
            in_quoted: DfaState(0),
            final_field: DfaState(0),
            final_record: DfaState(0),
        }
    }

    /// Return the DFA state corresponding to `nfa_state`.
    ///
    /// A DFA state is the offset of that state's row in the transition table
    /// (`nfa_state * num_classes`); storing the row offset directly is what
    /// lets lookups skip a multiplication per input byte.
    fn new_state(&self, nfa_state: NfaState) -> DfaState {
        let nclasses = self.classes.num_classes() as u8;
        // Checked so that an (impossible, by construction) overflow of the
        // `u8` state representation panics instead of silently wrapping.
        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
        DfaState(idx)
    }

    /// The DFA state used once all input has been exhausted. (The start
    /// state doubles as the final end state; see the module comments.)
    fn new_state_final_end(&self) -> DfaState {
        self.new_state(NfaState::StartRecord)
    }

    /// The DFA state corresponding to a fully parsed record.
    fn new_state_final_record(&self) -> DfaState {
        self.new_state(NfaState::EndRecord)
    }

    /// Look up the transition out of `state` on input byte `c`, returning
    /// the next state and whether `c` should be copied to the output.
    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
        let cls = self.classes.classes[c as usize];
        let idx = state.0 as usize + cls as usize;
        (self.trans[idx], self.has_output[idx])
    }

    /// Set the transition out of `from` on input byte `c` to `to`, recording
    /// whether the byte should be emitted as output.
    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
        let cls = self.classes.classes[c as usize];
        let idx = from.0 as usize + cls as usize;
        self.trans[idx] = to;
        self.has_output[idx] = output;
    }

    /// Cache the special states the read loops compare against. This must
    /// run after all equivalence classes have been added, since state
    /// indices depend on the number of classes (see `new_state`).
    fn finish(&mut self) {
        self.in_field = self.new_state(NfaState::InField);
        self.in_quoted = self.new_state(NfaState::InQuotedField);
        self.final_field = self.new_state(NfaState::EndFieldDelim);
        self.final_record = self.new_state(NfaState::EndRecord);
    }

    /// Translate a DFA state (plus input/output buffer status) into the
    /// result of a `read_field` call.
    fn new_read_field_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        if state >= self.final_record {
            ReadFieldResult::Field { record_end: true }
        } else if state == self.final_field {
            ReadFieldResult::Field { record_end: false }
        } else if is_final_trans && state.is_start() {
            // The start state only means "done" when the caller's input is
            // actually exhausted; otherwise it is an ordinary state.
            ReadFieldResult::End
        } else {
            debug_assert!(state < self.final_field);
            // Report `OutputFull` only when the output (and not the input)
            // is what ran out.
            if !inpdone && outdone {
                ReadFieldResult::OutputFull
            } else {
                ReadFieldResult::InputEmpty
            }
        }
    }

    /// Translate a DFA state (plus buffer status) into the result of a
    /// `read_record` call.
    fn new_read_record_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        if state >= self.final_record {
            ReadRecordResult::Record
        } else if is_final_trans && state.is_start() {
            ReadRecordResult::End
        } else {
            debug_assert!(state < self.final_record);
            if !inpdone && outdone {
                ReadRecordResult::OutputFull
            } else if !inpdone && endsdone {
                ReadRecordResult::OutputEndsFull
            } else {
                ReadRecordResult::InputEmpty
            }
        }
    }
}

/// A map from input byte to equivalence class.
1226 struct DfaClasses { 1227 classes: [u8; CLASS_SIZE], 1228 next_class: usize, 1229 } 1230 1231 impl DfaClasses { new() -> DfaClasses1232 fn new() -> DfaClasses { 1233 DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 } 1234 } 1235 add(&mut self, b: u8)1236 fn add(&mut self, b: u8) { 1237 if self.next_class > CLASS_SIZE { 1238 panic!("added too many classes") 1239 } 1240 self.classes[b as usize] = self.next_class as u8; 1241 self.next_class = self.next_class + 1; 1242 } 1243 num_classes(&self) -> usize1244 fn num_classes(&self) -> usize { 1245 self.next_class as usize 1246 } 1247 1248 /// Scan and copy the input bytes to the output buffer quickly. 1249 /// 1250 /// This assumes that the current state of the DFA is either `InField` or 1251 /// `InQuotedField`. In this case, all bytes corresponding to the first 1252 /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are 1253 /// guaranteed to never result in a state transition out of the current 1254 /// state. This function takes advantage of that copies every byte from 1255 /// `input` in the first equivalence class to `output`. Once a byte is seen 1256 /// outside the first equivalence class, we quit and should fall back to 1257 /// the main DFA loop. 1258 #[inline(always)] scan_and_copy( &self, input: &[u8], nin: &mut usize, output: &mut [u8], nout: &mut usize, )1259 fn scan_and_copy( 1260 &self, 1261 input: &[u8], 1262 nin: &mut usize, 1263 output: &mut [u8], 1264 nout: &mut usize, 1265 ) { 1266 while *nin < input.len() 1267 && *nout < output.len() 1268 && self.classes[input[*nin] as usize] == 0 1269 { 1270 output[*nout] = input[*nin]; 1271 *nin += 1; 1272 *nout += 1; 1273 } 1274 } 1275 } 1276 1277 /// A single DFA state. 1278 /// 1279 /// A DFA state is represented by the starting index of its corresponding row 1280 /// in the DFA transition table. 
This representation allows us to elide a 1281 /// single multiplication instruction when computing the next transition for 1282 /// a particular input byte. 1283 #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 1284 struct DfaState(u8); 1285 1286 impl DfaState { start() -> DfaState1287 fn start() -> DfaState { 1288 DfaState(0) 1289 } 1290 is_start(&self) -> bool1291 fn is_start(&self) -> bool { 1292 self.0 == 0 1293 } 1294 } 1295 1296 impl fmt::Debug for Dfa { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1297 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 1298 write!(f, "Dfa(N/A)") 1299 } 1300 } 1301 1302 impl fmt::Debug for DfaClasses { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1303 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 1304 write!( 1305 f, 1306 "DfaClasses {{ classes: N/A, next_class: {:?} }}", 1307 self.next_class 1308 ) 1309 } 1310 } 1311 1312 impl Clone for Dfa { clone(&self) -> Dfa1313 fn clone(&self) -> Dfa { 1314 let mut dfa = Dfa::new(); 1315 dfa.trans.copy_from_slice(&self.trans); 1316 dfa 1317 } 1318 } 1319 1320 impl Clone for DfaClasses { clone(&self) -> DfaClasses1321 fn clone(&self) -> DfaClasses { 1322 let mut x = DfaClasses::new(); 1323 x.classes.copy_from_slice(&self.classes); 1324 x 1325 } 1326 } 1327 1328 #[cfg(test)] 1329 mod tests { 1330 use core::str; 1331 1332 use arrayvec::{ArrayString, ArrayVec}; 1333 1334 use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator}; 1335 1336 type Csv = ArrayVec<[Row; 10]>; 1337 type Row = ArrayVec<[Field; 10]>; 1338 type Field = ArrayString<[u8; 10]>; 1339 1340 // OMG I HATE BYTE STRING LITERALS SO MUCH. b(s: &str) -> &[u8]1341 fn b(s: &str) -> &[u8] { 1342 s.as_bytes() 1343 } 1344 1345 macro_rules! 
csv { 1346 ($([$($field:expr),*]),*) => {{ 1347 #[allow(unused_mut)] 1348 fn x() -> Csv { 1349 let mut csv = Csv::new(); 1350 $( 1351 let mut row = Row::new(); 1352 $( 1353 row.push(Field::from($field).unwrap()); 1354 )* 1355 csv.push(row); 1356 )* 1357 csv 1358 } 1359 x() 1360 }} 1361 } 1362 1363 macro_rules! parses_to { 1364 ($name:ident, $data:expr, $expected:expr) => { 1365 parses_to!($name, $data, $expected, |builder| builder); 1366 }; 1367 ($name:ident, $data:expr, $expected:expr, $config:expr) => { 1368 #[test] 1369 fn $name() { 1370 let mut builder = ReaderBuilder::new(); 1371 builder.nfa(true); 1372 $config(&mut builder); 1373 let mut rdr = builder.build(); 1374 let got = parse_by_field(&mut rdr, $data); 1375 let expected = $expected; 1376 assert_eq!(expected, got, "nfa by field"); 1377 1378 let mut builder = ReaderBuilder::new(); 1379 builder.nfa(true); 1380 $config(&mut builder); 1381 let mut rdr = builder.build(); 1382 let got = parse_by_record(&mut rdr, $data); 1383 let expected = $expected; 1384 assert_eq!(expected, got, "nfa by record"); 1385 1386 let mut builder = ReaderBuilder::new(); 1387 $config(&mut builder); 1388 let mut rdr = builder.build(); 1389 let got = parse_by_field(&mut rdr, $data); 1390 let expected = $expected; 1391 assert_eq!(expected, got, "dfa by field"); 1392 1393 let mut builder = ReaderBuilder::new(); 1394 $config(&mut builder); 1395 let mut rdr = builder.build(); 1396 let got = parse_by_record(&mut rdr, $data); 1397 let expected = $expected; 1398 assert_eq!(expected, got, "dfa by record"); 1399 } 1400 }; 1401 } 1402 parse_by_field(rdr: &mut Reader, data: &str) -> Csv1403 fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv { 1404 let mut data = data.as_bytes(); 1405 let mut field = [0u8; 10]; 1406 let mut csv = Csv::new(); 1407 let mut row = Row::new(); 1408 let mut outpos = 0; 1409 loop { 1410 let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]); 1411 data = &data[nin..]; 1412 outpos += nout; 1413 1414 match res 
{ 1415 ReadFieldResult::InputEmpty => { 1416 if !data.is_empty() { 1417 panic!("missing input data") 1418 } 1419 } 1420 ReadFieldResult::OutputFull => panic!("field too large"), 1421 ReadFieldResult::Field { record_end } => { 1422 let s = str::from_utf8(&field[..outpos]).unwrap(); 1423 row.push(Field::from(s).unwrap()); 1424 outpos = 0; 1425 if record_end { 1426 csv.push(row); 1427 row = Row::new(); 1428 } 1429 } 1430 ReadFieldResult::End => { 1431 return csv; 1432 } 1433 } 1434 } 1435 } 1436 parse_by_record(rdr: &mut Reader, data: &str) -> Csv1437 fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv { 1438 use crate::ReadRecordResult::*; 1439 1440 let mut data = data.as_bytes(); 1441 let mut record = [0; 1024]; 1442 let mut ends = [0; 10]; 1443 1444 let mut csv = Csv::new(); 1445 let (mut outpos, mut endpos) = (0, 0); 1446 loop { 1447 let (res, nin, nout, nend) = rdr.read_record( 1448 data, 1449 &mut record[outpos..], 1450 &mut ends[endpos..], 1451 ); 1452 data = &data[nin..]; 1453 outpos += nout; 1454 endpos += nend; 1455 1456 match res { 1457 InputEmpty => { 1458 if !data.is_empty() { 1459 panic!("missing input data") 1460 } 1461 } 1462 OutputFull => panic!("record too large (out buffer)"), 1463 OutputEndsFull => panic!("record too large (end buffer)"), 1464 Record => { 1465 let s = str::from_utf8(&record[..outpos]).unwrap(); 1466 let mut start = 0; 1467 let mut row = Row::new(); 1468 for &end in &ends[..endpos] { 1469 row.push(Field::from(&s[start..end]).unwrap()); 1470 start = end; 1471 } 1472 csv.push(row); 1473 outpos = 0; 1474 endpos = 0; 1475 } 1476 End => return csv, 1477 } 1478 } 1479 } 1480 1481 parses_to!(one_row_one_field, "a", csv![["a"]]); 1482 parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]); 1483 parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]); 1484 parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]); 1485 parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]); 1486 
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);

    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma,
        "a,b,\nx,y,",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_lf,
        "a,b,c\nx,y,z\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_lf,
        "a,b,\nx,y,\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_crlf,
        "a,b,c\r\nx,y,z\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_crlf,
        "a,b,\r\nx,y,\r\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_cr,
        "a,b,c\rx,y,z\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_cr,
        "a,b,\rx,y,\r",
        csv![["a", "b", ""], ["x", "y", ""]]
    );

    // Leading/trailing blank lines never produce records.
    parses_to!(
        trailing_lines_no_record,
        "\n\n\na,b,c\nx,y,z\n\n\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_cr,
        "\r\r\ra,b,c\rx,y,z\r\r\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_crlf,
        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );

    parses_to!(empty, "", csv![]);
    parses_to!(empty_lines, "\n\n\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed,
        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
    parses_to!(
        empty_lines_interspersed_crlf,
        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed_mixed,
        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
    parses_to!(
        empty_lines_interspersed_cr,
        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );

    // Configuration knobs: custom terminator, delimiter, quote and escape.
    parses_to!(
        term_weird,
        "zza,bzc,dzz",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.terminator(Terminator::Any(b'z'));
        }
    );

    parses_to!(
        ascii_delimited,
        "a\x1fb\x1ec\x1fd",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.ascii();
        }
    );

    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);

    parses_to!(quote_empty, "\"\"", csv![[""]]);
    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
    parses_to!(quote_space, "\" \"", csv![[" "]]);
    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
    parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]);

    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
        b.quote(b'z');
    });

    // This one is pretty hokey.
    // I don't really know what the "right" behavior is.
    parses_to!(
        quote_delimiter,
        ",a,,b",
        csv![["a,b"]],
        |b: &mut ReaderBuilder| {
            b.quote(b',');
        }
    );

    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
    parses_to!(
        quote_escapes_no_double,
        r#""a""b""#,
        csv![[r#"a"b""#]],
        |b: &mut ReaderBuilder| {
            b.double_quote(false);
        }
    );
    parses_to!(
        quote_escapes,
        r#""a\"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\'));
        }
    );
    parses_to!(
        quote_escapes_change,
        r#""az"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'z'));
        }
    );

    parses_to!(
        quote_escapes_with_comma,
        r#""\"A,B\"""#,
        csv![[r#""A,B""#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\')).double_quote(false);
        }
    );

    parses_to!(
        quoting_disabled,
        r#""abc,foo""#,
        csv![[r#""abc"#, r#"foo""#]],
        |b: &mut ReaderBuilder| {
            b.quoting(false);
        }
    );

    parses_to!(
        delimiter_tabs,
        "a\tb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'\t');
        }
    );
    parses_to!(
        delimiter_weird,
        "azb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'z');
        }
    );

    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);

    // Comments are only recognized at the start of a record.
    parses_to!(
        comment_1,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_2,
        "foo\n # hi\nbar\n",
        csv![["foo"], [" # hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_3,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["# hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'\n'));
        }
    );
    parses_to!(
        comment_4,
        "foo,b#ar,baz",
        csv![["foo", "b#ar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_5,
        "foo,#bar,baz",
        csv![["foo", "#bar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );

    // Run one `read_field` call and assert its full `(result, nin, nout)`.
    macro_rules! assert_read {
        (
            $rdr:expr, $input:expr, $output:expr,
            $expect_in:expr, $expect_out:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout) = $rdr.read_field($input, $output);
            assert_eq!($expect_in, nin);
            assert_eq!($expect_out, nout);
            assert_eq!($expect_res, res);
        }};
    }

    // This tests that feeding a new reader with an empty buffer sends us
    // straight to End.
    #[test]
    fn stream_empty() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, &[], &mut [], 0, 0, End);
    }

    // Test that a single space is treated as a single field.
    #[test]
    fn stream_space() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }

    // Test that a single comma yields one empty field, then a final empty
    // field that ends the record.
    #[test]
    fn stream_comma() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }

    // Test that we can read a single large field in multiple output
    // buffers.
    #[test]
    fn stream_output_chunks() {
        use crate::ReadFieldResult::*;

        let mut inp = b("fooquux");
        let out = &mut [0; 2];
        let mut rdr = Reader::new();

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("fo"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("oq"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("uu"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 1, 1, InputEmpty);
        assert_eq!(&out[..1], b("x"));
        inp = &inp[1..];
        assert!(inp.is_empty());

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, inp, out, 0, 0, End);
    }

    // Test that we can read a single large field across multiple input
    // buffers.
    #[test]
    fn stream_input_chunks() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fooq"));

        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
        assert_eq!(&out[..6], b("fooquu"));

        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
        assert_eq!(&out[..7], b("fooquux"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test we can read doubled quotes correctly in a stream, even when the
    // doubled quote is split across two input buffers.
    #[test]
    fn stream_doubled_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test we can read escaped quotes correctly in a stream, even when the
    // escape and the escaped quote are split across two input buffers.
    #[test]
    fn stream_escaped_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut builder = ReaderBuilder::new();
        let mut rdr = builder.escape(Some(b'\\')).build();

        assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test that empty output buffers don't wreak havoc.
    #[test]
    fn stream_empty_output() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(
            rdr,
            b("foo,bar"),
            out,
            4,
            3,
            Field { record_end: false }
        );
        assert_eq!(&out[..3], b("foo"));

        // An empty output buffer must report OutputFull without consuming
        // any input.
        assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);

        assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
        assert_eq!(&out[..3], b("bar"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test that we can reset the parser mid-stream and count on it to do
    // the right thing.
    #[test]
    fn reset_works() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
        assert_eq!(&out[..3], b("foo"));

        // Without reseting the parser state, the reader will remember that
        // we're in a quoted field, and therefore interpret the leading double
        // quotes below as a single quote and the trailing quote as a matching
        // terminator. With the reset, however, the parser forgets the quoted
        // field and treats the leading double quotes as a syntax quirk and
        // drops them, in addition to hanging on to the trailing unmatched
        // quote. (Matches Python's behavior.)
        rdr.reset();

        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
        assert_eq!(&out[..4], b("bar\""));
    }

    // Test the line number reporting is correct.
    #[test]
    fn line_numbers() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_eq!(1, rdr.line());

        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
        assert_eq!(5, rdr.line());

        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
        assert_eq!(5, rdr.line());

        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
        assert_eq!(6, rdr.line());

        assert_read!(rdr, &[], &mut [0], 0, 0, End);
        assert_eq!(6, rdr.line());
    }

    // Run one `read_record` call and assert its full
    // `(result, nin, nout, nend)`.
    macro_rules! assert_read_record {
        (
            $rdr:expr, $input:expr, $output:expr, $ends:expr,
            $expect_in:expr, $expect_out:expr,
            $expect_end:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout, nend) =
                $rdr.read_record($input, $output, $ends);
            assert_eq!($expect_res, res, "result");
            assert_eq!($expect_in, nin, "input");
            assert_eq!($expect_out, nout, "output");
            assert_eq!($expect_end, nend, "ends");
        }};
    }

    // Test that we can incrementally read a record.
    #[test]
    fn stream_record() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        // Empty input flushes the final (unterminated) record.
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }

    // Test that if our output ends are full during the last read that
    // we get an appropriate state returned.
    #[test]
    fn stream_record_last_end_output_full() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        // With no room for field ends, the final flush must report
        // OutputEndsFull and make no progress.
        assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }
}