// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A CSV file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
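//
// A minimal sketch of reading data in this format with the package
// (the data variable and the error handling shown are illustrative only):
//
//	r := csv.NewReader(strings.NewReader(data))
//	records, err := r.ReadAll()
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(records) // [][]string, one []string per record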
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
	"unicode/utf8"
)

// A ParseError is returned for parsing errors.
// Line numbers are 1-indexed and columns are 0-indexed.
type ParseError struct {
	StartLine int   // Line where the record starts
	Line      int   // Line where the error occurred
	Column    int   // Column (rune index) where the error occurred
	Err       error // The actual error
}

func (e *ParseError) Error() string {
	if e.Err == ErrFieldCount {
		return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
	}
	if e.StartLine != e.Line {
		return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
	}
	return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
}

// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used.
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous or missing \" in quoted-field")
	ErrFieldCount    = errors.New("wrong number of fields")
)

var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")

func validDelim(r rune) bool {
	return r != 0 && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
}

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// The Reader converts all \r\n sequences in its input to plain \n,
// including in multiline field values, so that the returned data does
// not depend on which line-ending convention an input file uses.
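//
// For example, a sketch of customizing a Reader before the first Read
// (input is any io.Reader; the values shown are illustrative, not defaults):
//
//	r := csv.NewReader(input)
//	r.Comma = ';'          // fields are separated by semicolons
//	r.Comment = '#'        // lines beginning with '#' are skipped
//	r.FieldsPerRecord = -1 // records may have a variable number of fields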
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	// Comma must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	// Comment must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	// It must also not be equal to Comma.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int

	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool

	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	TrailingComma bool // Deprecated: No longer used.

	r *bufio.Reader

	// numLine is the current line being read in the CSV file.
	numLine int

	// rawBuffer is a line buffer only used by the readLine method.
	rawBuffer []byte

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// lastRecord is a record cache and only used when ReuseRecord == true.
	lastRecord []string
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with the error ErrFieldCount.
// Except for that case, Read always returns either a non-nil
// record or a non-nil error, but not both.
// If there is no data left to be read, Read returns nil, io.EOF.
// If ReuseRecord is true, the returned slice may be shared
// between multiple calls to Read.
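//
// A typical read loop, as a sketch (handling of each record is
// illustrative; if ReuseRecord is set, copy a record before retaining it):
//
//	for {
//		record, err := r.Read()
//		if err == io.EOF {
//			break
//		}
//		if err != nil {
//			return err
//		}
//		fmt.Println(record)
//	}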
func (r *Reader) Read() (record []string, err error) {
	if r.ReuseRecord {
		record, err = r.readRecord(r.lastRecord)
		r.lastRecord = record
	} else {
		record, err = r.readRecord(nil)
	}
	return record, err
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		record, err := r.readRecord(nil)
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readLine reads the next line (with the trailing endline).
// If EOF is hit without a trailing endline, it will be omitted.
// If some bytes were read, then the error is never io.EOF.
// The result is only valid until the next call to readLine.
func (r *Reader) readLine() ([]byte, error) {
	line, err := r.r.ReadSlice('\n')
	if err == bufio.ErrBufferFull {
		r.rawBuffer = append(r.rawBuffer[:0], line...)
		for err == bufio.ErrBufferFull {
			line, err = r.r.ReadSlice('\n')
			r.rawBuffer = append(r.rawBuffer, line...)
		}
		line = r.rawBuffer
	}
	if len(line) > 0 && err == io.EOF {
		err = nil
		// For backwards compatibility, drop trailing \r before EOF.
		if line[len(line)-1] == '\r' {
			line = line[:len(line)-1]
		}
	}
	r.numLine++
	// Normalize \r\n to \n on all input lines.
	if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
		line[n-2] = '\n'
		line = line[:n-1]
	}
	return line, err
}

// lengthNL reports the number of bytes for the trailing \n.
func lengthNL(b []byte) int {
	if len(b) > 0 && b[len(b)-1] == '\n' {
		return 1
	}
	return 0
}

// nextRune returns the next rune in b or utf8.RuneError.
func nextRune(b []byte) rune {
	r, _ := utf8.DecodeRune(b)
	return r
}

func (r *Reader) readRecord(dst []string) ([]string, error) {
	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
		return nil, errInvalidDelim
	}

	// Read line (automatically skipping past empty lines and any comments).
	var line, fullLine []byte
	var errRead error
	for errRead == nil {
		line, errRead = r.readLine()
		if r.Comment != 0 && nextRune(line) == r.Comment {
			line = nil
			continue // Skip comment lines
		}
		if errRead == nil && len(line) == lengthNL(line) {
			line = nil
			continue // Skip empty lines
		}
		fullLine = line
		break
	}
	if errRead == io.EOF {
		return nil, errRead
	}

	// Parse each field in the record.
	var err error
	const quoteLen = len(`"`)
	commaLen := utf8.RuneLen(r.Comma)
	recLine := r.numLine // Starting line for record
	r.recordBuffer = r.recordBuffer[:0]
	r.fieldIndexes = r.fieldIndexes[:0]
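	// Each iteration of the parseField loop consumes one field from line:
	// the field's unescaped bytes are appended to recordBuffer and its end
	// offset is recorded in fieldIndexes.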
parseField:
	for {
		if r.TrimLeadingSpace {
			line = bytes.TrimLeftFunc(line, unicode.IsSpace)
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			i := bytes.IndexRune(line, r.Comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field)-lengthNL(field)]
			}
			// Check to make sure a quote does not appear in field.
			if !r.LazyQuotes {
				if j := bytes.IndexByte(field, '"'); j >= 0 {
					col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
					break parseField
				}
			}
			r.recordBuffer = append(r.recordBuffer, field...)
			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
			if i >= 0 {
				line = line[i+commaLen:]
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			line = line[quoteLen:]
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					r.recordBuffer = append(r.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						r.recordBuffer = append(r.recordBuffer, '"')
						line = line[quoteLen:]
					case rn == r.Comma:
						// `",` sequence (end of field).
						line = line[commaLen:]
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						continue parseField
					case lengthNL(line) == len(line):
						// `"\n` sequence (end of line).
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						break parseField
					case r.LazyQuotes:
						// `"` sequence (bare quote).
						r.recordBuffer = append(r.recordBuffer, '"')
					default:
						// `"*` sequence (invalid non-escaped quote).
						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
						break parseField
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far).
					r.recordBuffer = append(r.recordBuffer, line...)
					if errRead != nil {
						break parseField
					}
					line, errRead = r.readLine()
					if errRead == io.EOF {
						errRead = nil
					}
					fullLine = line
				} else {
					// Abrupt end of file (EOF or error).
					if !r.LazyQuotes && errRead == nil {
						col := utf8.RuneCount(fullLine)
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
						break parseField
					}
					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
					break parseField
				}
			}
		}
	}
	if err == nil {
		err = errRead
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(r.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(r.fieldIndexes) {
		dst = make([]string, len(r.fieldIndexes))
	}
	dst = dst[:len(r.fieldIndexes)]
	var preIdx int
	for i, idx := range r.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	if r.FieldsPerRecord > 0 {
		if len(dst) != r.FieldsPerRecord && err == nil {
			err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
		}
	} else if r.FieldsPerRecord == 0 {
		r.FieldsPerRecord = len(dst)
	}
	return dst, err
}