1 /*-------------------------------------------------------------------------
2  *
3  * copyfromparse.c
4  *		Parse CSV/text/binary format for COPY FROM.
5  *
6  * This file contains routines to parse the text, CSV and binary input
7  * formats.  The main entry point is NextCopyFrom(), which parses the
8  * next input line and returns it as Datums.
9  *
10  * In text/CSV mode, the parsing happens in multiple stages:
11  *
12  * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13  *                1.          2.            3.           4.
14  *
15  * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16  *    places it into 'raw_buf'.
17  *
18  * 2. CopyConvertBuf() calls the encoding conversion function to convert
19  *    the data in 'raw_buf' from client to server encoding, placing the
20  *    converted result in 'input_buf'.
21  *
22  * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23  *    It is responsible for finding the next newline marker, taking quote and
24  *    escape characters into account according to the COPY options.  The line
25  *    is copied into 'line_buf', with quotes and escape characters still
26  *    intact.
27  *
 * 4. The CopyReadAttributesText/CSV() functions take the input line from
 *    'line_buf', and split it into fields, unescaping the data as required.
 *    The fields are stored in 'attribute_buf', and the 'raw_fields' array
 *    holds pointers to each field.
32  *
33  * If encoding conversion is not required, a shortcut is taken in step 2 to
34  * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
35  * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36  * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
37  * the data is valid in the current encoding.
38  *
 * In binary mode, the pipeline is much simpler.  Input is loaded into
 * 'raw_buf', and encoding conversion is done in the datatype-specific
 * receive functions, if required.  'input_buf' and 'line_buf' are not used,
 * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
 * data when it's passed to the receive function.
44  *
45  * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
46  * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
47  * and 'attribute_buf' are expanded on demand, to hold the longest line
48  * encountered so far.
49  *
50  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
51  * Portions Copyright (c) 1994, Regents of the University of California
52  *
53  *
54  * IDENTIFICATION
55  *	  src/backend/commands/copyfromparse.c
56  *
57  *-------------------------------------------------------------------------
58  */
59 #include "postgres.h"
60 
61 #include <ctype.h>
62 #include <unistd.h>
63 #include <sys/stat.h>
64 
65 #include "commands/copy.h"
66 #include "commands/copyfrom_internal.h"
67 #include "commands/progress.h"
68 #include "executor/executor.h"
69 #include "libpq/libpq.h"
70 #include "libpq/pqformat.h"
71 #include "mb/pg_wchar.h"
72 #include "miscadmin.h"
73 #include "pgstat.h"
74 #include "port/pg_bswap.h"
75 #include "utils/memutils.h"
76 #include "utils/rel.h"
77 
/*
 * Octal-digit helpers.
 *
 * ISOCTAL(c) is nonzero when c is one of the characters '0' through '7';
 * note that it evaluates its argument twice.  OCTVALUE(c) yields the
 * numeric value of such a digit character.
 */
#define ISOCTAL(c) ((c) >= '0' && (c) <= '7')
#define OCTVALUE(c) ((c) - '0')
80 
81 /*
82  * These macros centralize code used to process line_buf and input_buf buffers.
83  * They are macros because they often do continue/break control and to avoid
84  * function call overhead in tight COPY loops.
85  *
86  * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
87  * prevent the continue/break processing from working.  We end the "if (1)"
88  * with "else ((void) 0)" to ensure the "if" does not unintentionally match
89  * any "else" in the calling code, and to avoid any compiler warnings about
90  * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
91  */
92 
/*
 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
 *
 * If looking 'extralen' bytes past input_buf_ptr would reach or pass
 * copy_buf_len while EOF has not been hit, ask for more data and restart
 * the enclosing loop: input_buf_ptr is reset to prev_raw_ptr, need_data is
 * set, and 'continue' is executed.
 *
 * This keeps the character read at the top of the loop in the buffer
 * even if there is more than one read-ahead.
 *
 * The macro relies on variables at the expansion site: input_buf_ptr,
 * copy_buf_len, hit_eof, prev_raw_ptr and need_data.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
	{ \
		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
		need_data = true; \
		continue; \
	} \
} else ((void) 0)
107 
/*
 * IF_NEED_REFILL_AND_EOF_BREAK(extralen)
 *
 * This consumes the remainder of the buffer and breaks.
 *
 * If looking 'extralen' bytes past input_buf_ptr would reach or pass
 * copy_buf_len and EOF has already been hit, no more data can arrive: when
 * extralen is nonzero input_buf_ptr is advanced to copy_buf_len, 'result'
 * is set to true, and 'break' leaves the enclosing loop.
 *
 * The macro relies on variables at the expansion site: input_buf_ptr,
 * copy_buf_len, hit_eof and result.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
	{ \
		if (extralen) \
			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
		/* backslash just before EOF, treat as data char */ \
		result = true; \
		break; \
	} \
} else ((void) 0)
121 
/*
 * REFILL_LINEBUF
 *
 * Transfer any approved data to line_buf; must do this to be sure
 * there is some room in input_buf.
 *
 * The bytes of cstate->input_buf between cstate->input_buf_index and the
 * local input_buf_ptr are appended to cstate->line_buf, and
 * cstate->input_buf_index is then advanced to input_buf_ptr.  Nothing
 * happens when input_buf_ptr has not moved past input_buf_index.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
	if (input_buf_ptr > cstate->input_buf_index) \
	{ \
		appendBinaryStringInfo(&cstate->line_buf, \
							 cstate->input_buf + cstate->input_buf_index, \
							   input_buf_ptr - cstate->input_buf_index); \
		cstate->input_buf_index = input_buf_ptr; \
	} \
} else ((void) 0)
137 
/*
 * NO_END_OF_COPY_GOTO
 *
 * Undo any read-ahead and jump out of the block: input_buf_ptr is set to
 * the byte just after prev_raw_ptr, and control transfers to the
 * 'not_end_of_copy' label, which must exist at the expansion site.
 */
#define NO_END_OF_COPY_GOTO \
if (1) \
{ \
	input_buf_ptr = prev_raw_ptr + 1; \
	goto not_end_of_copy; \
} else ((void) 0)
145 
/*
 * Signature expected at the start of binary COPY input.  The array holds
 * exactly the 11 bytes spelled out in the literal (the explicit trailing
 * \0 is the eleventh); the literal's implicit terminator does not fit and
 * is not stored.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
148 
149 
150 /* non-export function prototypes */
151 static bool CopyReadLine(CopyFromState cstate);
152 static bool CopyReadLineText(CopyFromState cstate);
153 static int	CopyReadAttributesText(CopyFromState cstate);
154 static int	CopyReadAttributesCSV(CopyFromState cstate);
155 static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
156 									 Oid typioparam, int32 typmod,
157 									 bool *isnull);
158 
159 
160 /* Low-level communications functions */
161 static int	CopyGetData(CopyFromState cstate, void *databuf,
162 						int minread, int maxread);
163 static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
164 static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
165 static void CopyLoadInputBuf(CopyFromState cstate);
166 static int	CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
167 
168 void
ReceiveCopyBegin(CopyFromState cstate)169 ReceiveCopyBegin(CopyFromState cstate)
170 {
171 	StringInfoData buf;
172 	int			natts = list_length(cstate->attnumlist);
173 	int16		format = (cstate->opts.binary ? 1 : 0);
174 	int			i;
175 
176 	pq_beginmessage(&buf, 'G');
177 	pq_sendbyte(&buf, format);	/* overall format */
178 	pq_sendint16(&buf, natts);
179 	for (i = 0; i < natts; i++)
180 		pq_sendint16(&buf, format); /* per-column formats */
181 	pq_endmessage(&buf);
182 	cstate->copy_src = COPY_FRONTEND;
183 	cstate->fe_msgbuf = makeStringInfo();
184 	/* We *must* flush here to ensure FE knows it can send. */
185 	pq_flush();
186 }
187 
188 void
ReceiveCopyBinaryHeader(CopyFromState cstate)189 ReceiveCopyBinaryHeader(CopyFromState cstate)
190 {
191 	char		readSig[11];
192 	int32		tmp;
193 
194 	/* Signature */
195 	if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
196 		memcmp(readSig, BinarySignature, 11) != 0)
197 		ereport(ERROR,
198 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
199 				 errmsg("COPY file signature not recognized")));
200 	/* Flags field */
201 	if (!CopyGetInt32(cstate, &tmp))
202 		ereport(ERROR,
203 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
204 				 errmsg("invalid COPY file header (missing flags)")));
205 	if ((tmp & (1 << 16)) != 0)
206 		ereport(ERROR,
207 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
208 				 errmsg("invalid COPY file header (WITH OIDS)")));
209 	tmp &= ~(1 << 16);
210 	if ((tmp >> 16) != 0)
211 		ereport(ERROR,
212 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
213 				 errmsg("unrecognized critical flags in COPY file header")));
214 	/* Header extension length */
215 	if (!CopyGetInt32(cstate, &tmp) ||
216 		tmp < 0)
217 		ereport(ERROR,
218 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
219 				 errmsg("invalid COPY file header (missing length)")));
220 	/* Skip extension header, if present */
221 	while (tmp-- > 0)
222 	{
223 		if (CopyReadBinaryData(cstate, readSig, 1) != 1)
224 			ereport(ERROR,
225 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
226 					 errmsg("invalid COPY file header (wrong length)")));
227 	}
228 }
229 
230 /*
231  * CopyGetData reads data from the source (file or frontend)
232  *
233  * We attempt to read at least minread, and at most maxread, bytes from
234  * the source.  The actual number of bytes read is returned; if this is
235  * less than minread, EOF was detected.
236  *
237  * Note: when copying from the frontend, we expect a proper EOF mark per
238  * protocol; if the frontend simply drops the connection, we raise error.
239  * It seems unwise to allow the COPY IN to complete normally in that case.
240  *
241  * NB: no data conversion is applied here.
242  */
243 static int
CopyGetData(CopyFromState cstate,void * databuf,int minread,int maxread)244 CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
245 {
246 	int			bytesread = 0;
247 
248 	switch (cstate->copy_src)
249 	{
250 		case COPY_FILE:
251 			bytesread = fread(databuf, 1, maxread, cstate->copy_file);
252 			if (ferror(cstate->copy_file))
253 				ereport(ERROR,
254 						(errcode_for_file_access(),
255 						 errmsg("could not read from COPY file: %m")));
256 			if (bytesread == 0)
257 				cstate->raw_reached_eof = true;
258 			break;
259 		case COPY_FRONTEND:
260 			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
261 			{
262 				int			avail;
263 
264 				while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
265 				{
266 					/* Try to receive another message */
267 					int			mtype;
268 					int			maxmsglen;
269 
270 			readmessage:
271 					HOLD_CANCEL_INTERRUPTS();
272 					pq_startmsgread();
273 					mtype = pq_getbyte();
274 					if (mtype == EOF)
275 						ereport(ERROR,
276 								(errcode(ERRCODE_CONNECTION_FAILURE),
277 								 errmsg("unexpected EOF on client connection with an open transaction")));
278 					/* Validate message type and set packet size limit */
279 					switch (mtype)
280 					{
281 						case 'd':	/* CopyData */
282 							maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
283 							break;
284 						case 'c':	/* CopyDone */
285 						case 'f':	/* CopyFail */
286 						case 'H':	/* Flush */
287 						case 'S':	/* Sync */
288 							maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
289 							break;
290 						default:
291 							ereport(ERROR,
292 									(errcode(ERRCODE_PROTOCOL_VIOLATION),
293 									 errmsg("unexpected message type 0x%02X during COPY from stdin",
294 											mtype)));
295 							maxmsglen = 0;	/* keep compiler quiet */
296 							break;
297 					}
298 					/* Now collect the message body */
299 					if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
300 						ereport(ERROR,
301 								(errcode(ERRCODE_CONNECTION_FAILURE),
302 								 errmsg("unexpected EOF on client connection with an open transaction")));
303 					RESUME_CANCEL_INTERRUPTS();
304 					/* ... and process it */
305 					switch (mtype)
306 					{
307 						case 'd':	/* CopyData */
308 							break;
309 						case 'c':	/* CopyDone */
310 							/* COPY IN correctly terminated by frontend */
311 							cstate->raw_reached_eof = true;
312 							return bytesread;
313 						case 'f':	/* CopyFail */
314 							ereport(ERROR,
315 									(errcode(ERRCODE_QUERY_CANCELED),
316 									 errmsg("COPY from stdin failed: %s",
317 											pq_getmsgstring(cstate->fe_msgbuf))));
318 							break;
319 						case 'H':	/* Flush */
320 						case 'S':	/* Sync */
321 
322 							/*
323 							 * Ignore Flush/Sync for the convenience of client
324 							 * libraries (such as libpq) that may send those
325 							 * without noticing that the command they just
326 							 * sent was COPY.
327 							 */
328 							goto readmessage;
329 						default:
330 							Assert(false);	/* NOT REACHED */
331 					}
332 				}
333 				avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
334 				if (avail > maxread)
335 					avail = maxread;
336 				pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
337 				databuf = (void *) ((char *) databuf + avail);
338 				maxread -= avail;
339 				bytesread += avail;
340 			}
341 			break;
342 		case COPY_CALLBACK:
343 			bytesread = cstate->data_source_cb(databuf, minread, maxread);
344 			break;
345 	}
346 
347 	return bytesread;
348 }
349 
350 
351 /*
352  * These functions do apply some data conversion
353  */
354 
355 /*
356  * CopyGetInt32 reads an int32 that appears in network byte order
357  *
358  * Returns true if OK, false if EOF
359  */
360 static inline bool
CopyGetInt32(CopyFromState cstate,int32 * val)361 CopyGetInt32(CopyFromState cstate, int32 *val)
362 {
363 	uint32		buf;
364 
365 	if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
366 	{
367 		*val = 0;				/* suppress compiler warning */
368 		return false;
369 	}
370 	*val = (int32) pg_ntoh32(buf);
371 	return true;
372 }
373 
374 /*
375  * CopyGetInt16 reads an int16 that appears in network byte order
376  */
377 static inline bool
CopyGetInt16(CopyFromState cstate,int16 * val)378 CopyGetInt16(CopyFromState cstate, int16 *val)
379 {
380 	uint16		buf;
381 
382 	if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
383 	{
384 		*val = 0;				/* suppress compiler warning */
385 		return false;
386 	}
387 	*val = (int16) pg_ntoh16(buf);
388 	return true;
389 }
390 
391 
392 /*
393  * Perform encoding conversion on data in 'raw_buf', writing the converted
394  * data into 'input_buf'.
395  *
396  * On entry, there must be some data to convert in 'raw_buf'.
397  */
398 static void
CopyConvertBuf(CopyFromState cstate)399 CopyConvertBuf(CopyFromState cstate)
400 {
401 	/*
402 	 * If the file and server encoding are the same, no encoding conversion is
403 	 * required.  However, we still need to verify that the input is valid for
404 	 * the encoding.
405 	 */
406 	if (!cstate->need_transcoding)
407 	{
408 		/*
409 		 * When conversion is not required, input_buf and raw_buf are the
410 		 * same.  raw_buf_len is the total number of bytes in the buffer, and
411 		 * input_buf_len tracks how many of those bytes have already been
412 		 * verified.
413 		 */
414 		int			preverifiedlen = cstate->input_buf_len;
415 		int			unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
416 		int			nverified;
417 
418 		if (unverifiedlen == 0)
419 		{
420 			/*
421 			 * If no more raw data is coming, report the EOF to the caller.
422 			 */
423 			if (cstate->raw_reached_eof)
424 				cstate->input_reached_eof = true;
425 			return;
426 		}
427 
428 		/*
429 		 * Verify the new data, including any residual unverified bytes from
430 		 * previous round.
431 		 */
432 		nverified = pg_encoding_verifymbstr(cstate->file_encoding,
433 											cstate->raw_buf + preverifiedlen,
434 											unverifiedlen);
435 		if (nverified == 0)
436 		{
437 			/*
438 			 * Could not verify anything.
439 			 *
440 			 * If there is no more raw input data coming, it means that there
441 			 * was an incomplete multi-byte sequence at the end.  Also, if
442 			 * there's "enough" input left, we should be able to verify at
443 			 * least one character, and a failure to do so means that we've
444 			 * hit an invalid byte sequence.
445 			 */
446 			if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length())
447 				cstate->input_reached_error = true;
448 			return;
449 		}
450 		cstate->input_buf_len += nverified;
451 	}
452 	else
453 	{
454 		/*
455 		 * Encoding conversion is needed.
456 		 */
457 		int			nbytes;
458 		unsigned char *src;
459 		int			srclen;
460 		unsigned char *dst;
461 		int			dstlen;
462 		int			convertedlen;
463 
464 		if (RAW_BUF_BYTES(cstate) == 0)
465 		{
466 			/*
467 			 * If no more raw data is coming, report the EOF to the caller.
468 			 */
469 			if (cstate->raw_reached_eof)
470 				cstate->input_reached_eof = true;
471 			return;
472 		}
473 
474 		/*
475 		 * First, copy down any unprocessed data.
476 		 */
477 		nbytes = INPUT_BUF_BYTES(cstate);
478 		if (nbytes > 0 && cstate->input_buf_index > 0)
479 			memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
480 					nbytes);
481 		cstate->input_buf_index = 0;
482 		cstate->input_buf_len = nbytes;
483 		cstate->input_buf[nbytes] = '\0';
484 
485 		src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
486 		srclen = cstate->raw_buf_len - cstate->raw_buf_index;
487 		dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
488 		dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
489 
490 		/*
491 		 * Do the conversion.  This might stop short, if there is an invalid
492 		 * byte sequence in the input.  We'll convert as much as we can in
493 		 * that case.
494 		 *
495 		 * Note: Even if we hit an invalid byte sequence, we don't report the
496 		 * error until all the valid bytes have been consumed.  The input
497 		 * might contain an end-of-input marker (\.), and we don't want to
498 		 * report an error if the invalid byte sequence is after the
499 		 * end-of-input marker.  We might unnecessarily convert some data
500 		 * after the end-of-input marker as long as it's valid for the
501 		 * encoding, but that's harmless.
502 		 */
503 		convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
504 													 cstate->file_encoding,
505 													 GetDatabaseEncoding(),
506 													 src, srclen,
507 													 dst, dstlen,
508 													 true);
509 		if (convertedlen == 0)
510 		{
511 			/*
512 			 * Could not convert anything.  If there is no more raw input data
513 			 * coming, it means that there was an incomplete multi-byte
514 			 * sequence at the end.  Also, if there is plenty of input left,
515 			 * we should be able to convert at least one character, so a
516 			 * failure to do so must mean that we've hit a byte sequence
517 			 * that's invalid.
518 			 */
519 			if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
520 				cstate->input_reached_error = true;
521 			return;
522 		}
523 		cstate->raw_buf_index += convertedlen;
524 		cstate->input_buf_len += strlen((char *) dst);
525 	}
526 }
527 
528 /*
529  * Report an encoding or conversion error.
530  */
531 static void
CopyConversionError(CopyFromState cstate)532 CopyConversionError(CopyFromState cstate)
533 {
534 	Assert(cstate->raw_buf_len > 0);
535 	Assert(cstate->input_reached_error);
536 
537 	if (!cstate->need_transcoding)
538 	{
539 		/*
540 		 * Everything up to input_buf_len was successfully verified, and
541 		 * input_buf_len points to the invalid or incomplete character.
542 		 */
543 		report_invalid_encoding(cstate->file_encoding,
544 								cstate->raw_buf + cstate->input_buf_len,
545 								cstate->raw_buf_len - cstate->input_buf_len);
546 	}
547 	else
548 	{
549 		/*
550 		 * raw_buf_index points to the invalid or untranslatable character. We
551 		 * let the conversion routine report the error, because it can provide
552 		 * a more specific error message than we could here.  An earlier call
553 		 * to the conversion routine in CopyConvertBuf() detected that there
554 		 * is an error, now we call the conversion routine again with
555 		 * noError=false, to have it throw the error.
556 		 */
557 		unsigned char *src;
558 		int			srclen;
559 		unsigned char *dst;
560 		int			dstlen;
561 
562 		src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
563 		srclen = cstate->raw_buf_len - cstate->raw_buf_index;
564 		dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
565 		dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
566 
567 		(void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
568 											 cstate->file_encoding,
569 											 GetDatabaseEncoding(),
570 											 src, srclen,
571 											 dst, dstlen,
572 											 false);
573 
574 		/*
575 		 * The conversion routine should have reported an error, so this
576 		 * should not be reached.
577 		 */
578 		elog(ERROR, "encoding conversion failed without error");
579 	}
580 }
581 
582 /*
583  * Load more data from data source to raw_buf.
584  *
585  * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
586  * beginning of the buffer, and we load new data after that.
587  */
588 static void
CopyLoadRawBuf(CopyFromState cstate)589 CopyLoadRawBuf(CopyFromState cstate)
590 {
591 	int			nbytes;
592 	int			inbytes;
593 
594 	/*
595 	 * In text mode, if encoding conversion is not required, raw_buf and
596 	 * input_buf point to the same buffer.  Their len/index better agree, too.
597 	 */
598 	if (cstate->raw_buf == cstate->input_buf)
599 	{
600 		Assert(!cstate->need_transcoding);
601 		Assert(cstate->raw_buf_index == cstate->input_buf_index);
602 		Assert(cstate->input_buf_len <= cstate->raw_buf_len);
603 	}
604 
605 	/*
606 	 * Copy down the unprocessed data if any.
607 	 */
608 	nbytes = RAW_BUF_BYTES(cstate);
609 	if (nbytes > 0 && cstate->raw_buf_index > 0)
610 		memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
611 				nbytes);
612 	cstate->raw_buf_len -= cstate->raw_buf_index;
613 	cstate->raw_buf_index = 0;
614 
615 	/*
616 	 * If raw_buf and input_buf are in fact the same buffer, adjust the
617 	 * input_buf variables, too.
618 	 */
619 	if (cstate->raw_buf == cstate->input_buf)
620 	{
621 		cstate->input_buf_len -= cstate->input_buf_index;
622 		cstate->input_buf_index = 0;
623 	}
624 
625 	/* Load more data */
626 	inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
627 						  1, RAW_BUF_SIZE - cstate->raw_buf_len);
628 	nbytes += inbytes;
629 	cstate->raw_buf[nbytes] = '\0';
630 	cstate->raw_buf_len = nbytes;
631 
632 	cstate->bytes_processed += inbytes;
633 	pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
634 
635 	if (inbytes == 0)
636 		cstate->raw_reached_eof = true;
637 }
638 
639 /*
640  * CopyLoadInputBuf loads some more data into input_buf
641  *
642  * On return, at least one more input character is loaded into
643  * input_buf, or input_reached_eof is set.
644  *
645  * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
646  * of the buffer and then we load more data after that.
647  */
648 static void
CopyLoadInputBuf(CopyFromState cstate)649 CopyLoadInputBuf(CopyFromState cstate)
650 {
651 	int			nbytes = INPUT_BUF_BYTES(cstate);
652 
653 	/*
654 	 * The caller has updated input_buf_index to indicate how much of the
655 	 * input has been consumed and isn't needed anymore.  If input_buf is the
656 	 * same physical area as raw_buf, update raw_buf_index accordingly.
657 	 */
658 	if (cstate->raw_buf == cstate->input_buf)
659 	{
660 		Assert(!cstate->need_transcoding);
661 		Assert(cstate->input_buf_index >= cstate->raw_buf_index);
662 		cstate->raw_buf_index = cstate->input_buf_index;
663 	}
664 
665 	for (;;)
666 	{
667 		/* If we now have some unconverted data, try to convert it */
668 		CopyConvertBuf(cstate);
669 
670 		/* If we now have some more input bytes ready, return them */
671 		if (INPUT_BUF_BYTES(cstate) > nbytes)
672 			return;
673 
674 		/*
675 		 * If we reached an invalid byte sequence, or we're at an incomplete
676 		 * multi-byte character but there is no more raw input data, report
677 		 * conversion error.
678 		 */
679 		if (cstate->input_reached_error)
680 			CopyConversionError(cstate);
681 
682 		/* no more input, and everything has been converted */
683 		if (cstate->input_reached_eof)
684 			break;
685 
686 		/* Try to load more raw data */
687 		Assert(!cstate->raw_reached_eof);
688 		CopyLoadRawBuf(cstate);
689 	}
690 }
691 
692 /*
693  * CopyReadBinaryData
694  *
695  * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
696  * and writes them to 'dest'.  Returns the number of bytes read (which
697  * would be less than 'nbytes' only if we reach EOF).
698  */
699 static int
CopyReadBinaryData(CopyFromState cstate,char * dest,int nbytes)700 CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
701 {
702 	int			copied_bytes = 0;
703 
704 	if (RAW_BUF_BYTES(cstate) >= nbytes)
705 	{
706 		/* Enough bytes are present in the buffer. */
707 		memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
708 		cstate->raw_buf_index += nbytes;
709 		copied_bytes = nbytes;
710 	}
711 	else
712 	{
713 		/*
714 		 * Not enough bytes in the buffer, so must read from the file.  Need
715 		 * to loop since 'nbytes' could be larger than the buffer size.
716 		 */
717 		do
718 		{
719 			int			copy_bytes;
720 
721 			/* Load more data if buffer is empty. */
722 			if (RAW_BUF_BYTES(cstate) == 0)
723 			{
724 				CopyLoadRawBuf(cstate);
725 				if (cstate->raw_reached_eof)
726 					break;		/* EOF */
727 			}
728 
729 			/* Transfer some bytes. */
730 			copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
731 			memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
732 			cstate->raw_buf_index += copy_bytes;
733 			dest += copy_bytes;
734 			copied_bytes += copy_bytes;
735 		} while (copied_bytes < nbytes);
736 	}
737 
738 	return copied_bytes;
739 }
740 
741 /*
742  * Read raw fields in the next line for COPY FROM in text or csv mode.
743  * Return false if no more lines.
744  *
745  * An internal temporary buffer is returned via 'fields'. It is valid until
746  * the next call of the function. Since the function returns all raw fields
747  * in the input file, 'nfields' could be different from the number of columns
748  * in the relation.
749  *
750  * NOTE: force_not_null option are not applied to the returned fields.
751  */
752 bool
NextCopyFromRawFields(CopyFromState cstate,char *** fields,int * nfields)753 NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
754 {
755 	int			fldct;
756 	bool		done;
757 
758 	/* only available for text or csv input */
759 	Assert(!cstate->opts.binary);
760 
761 	/* on input just throw the header line away */
762 	if (cstate->cur_lineno == 0 && cstate->opts.header_line)
763 	{
764 		cstate->cur_lineno++;
765 		if (CopyReadLine(cstate))
766 			return false;		/* done */
767 	}
768 
769 	cstate->cur_lineno++;
770 
771 	/* Actually read the line into memory here */
772 	done = CopyReadLine(cstate);
773 
774 	/*
775 	 * EOF at start of line means we're done.  If we see EOF after some
776 	 * characters, we act as though it was newline followed by EOF, ie,
777 	 * process the line and then exit loop on next iteration.
778 	 */
779 	if (done && cstate->line_buf.len == 0)
780 		return false;
781 
782 	/* Parse the line into de-escaped field values */
783 	if (cstate->opts.csv_mode)
784 		fldct = CopyReadAttributesCSV(cstate);
785 	else
786 		fldct = CopyReadAttributesText(cstate);
787 
788 	*fields = cstate->raw_fields;
789 	*nfields = fldct;
790 	return true;
791 }
792 
793 /*
794  * Read next tuple from file for COPY FROM. Return false if no more tuples.
795  *
796  * 'econtext' is used to evaluate default expression for each columns not
797  * read from the file. It can be NULL when no default values are used, i.e.
798  * when all columns are read from the file.
799  *
800  * 'values' and 'nulls' arrays must be the same length as columns of the
801  * relation passed to BeginCopyFrom. This function fills the arrays.
802  */
803 bool
NextCopyFrom(CopyFromState cstate,ExprContext * econtext,Datum * values,bool * nulls)804 NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
805 			 Datum *values, bool *nulls)
806 {
807 	TupleDesc	tupDesc;
808 	AttrNumber	num_phys_attrs,
809 				attr_count,
810 				num_defaults = cstate->num_defaults;
811 	FmgrInfo   *in_functions = cstate->in_functions;
812 	Oid		   *typioparams = cstate->typioparams;
813 	int			i;
814 	int		   *defmap = cstate->defmap;
815 	ExprState **defexprs = cstate->defexprs;
816 
817 	tupDesc = RelationGetDescr(cstate->rel);
818 	num_phys_attrs = tupDesc->natts;
819 	attr_count = list_length(cstate->attnumlist);
820 
821 	/* Initialize all values for row to NULL */
822 	MemSet(values, 0, num_phys_attrs * sizeof(Datum));
823 	MemSet(nulls, true, num_phys_attrs * sizeof(bool));
824 
825 	if (!cstate->opts.binary)
826 	{
827 		char	  **field_strings;
828 		ListCell   *cur;
829 		int			fldct;
830 		int			fieldno;
831 		char	   *string;
832 
833 		/* read raw fields in the next line */
834 		if (!NextCopyFromRawFields(cstate, &field_strings, &fldct))
835 			return false;
836 
837 		/* check for overflowing fields */
838 		if (attr_count > 0 && fldct > attr_count)
839 			ereport(ERROR,
840 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
841 					 errmsg("extra data after last expected column")));
842 
843 		fieldno = 0;
844 
845 		/* Loop to read the user attributes on the line. */
846 		foreach(cur, cstate->attnumlist)
847 		{
848 			int			attnum = lfirst_int(cur);
849 			int			m = attnum - 1;
850 			Form_pg_attribute att = TupleDescAttr(tupDesc, m);
851 
852 			if (fieldno >= fldct)
853 				ereport(ERROR,
854 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
855 						 errmsg("missing data for column \"%s\"",
856 								NameStr(att->attname))));
857 			string = field_strings[fieldno++];
858 
859 			if (cstate->convert_select_flags &&
860 				!cstate->convert_select_flags[m])
861 			{
862 				/* ignore input field, leaving column as NULL */
863 				continue;
864 			}
865 
866 			if (cstate->opts.csv_mode)
867 			{
868 				if (string == NULL &&
869 					cstate->opts.force_notnull_flags[m])
870 				{
871 					/*
872 					 * FORCE_NOT_NULL option is set and column is NULL -
873 					 * convert it to the NULL string.
874 					 */
875 					string = cstate->opts.null_print;
876 				}
877 				else if (string != NULL && cstate->opts.force_null_flags[m]
878 						 && strcmp(string, cstate->opts.null_print) == 0)
879 				{
880 					/*
881 					 * FORCE_NULL option is set and column matches the NULL
882 					 * string. It must have been quoted, or otherwise the
883 					 * string would already have been set to NULL. Convert it
884 					 * to NULL as specified.
885 					 */
886 					string = NULL;
887 				}
888 			}
889 
890 			cstate->cur_attname = NameStr(att->attname);
891 			cstate->cur_attval = string;
892 			values[m] = InputFunctionCall(&in_functions[m],
893 										  string,
894 										  typioparams[m],
895 										  att->atttypmod);
896 			if (string != NULL)
897 				nulls[m] = false;
898 			cstate->cur_attname = NULL;
899 			cstate->cur_attval = NULL;
900 		}
901 
902 		Assert(fieldno == attr_count);
903 	}
904 	else
905 	{
906 		/* binary */
907 		int16		fld_count;
908 		ListCell   *cur;
909 
910 		cstate->cur_lineno++;
911 
912 		if (!CopyGetInt16(cstate, &fld_count))
913 		{
914 			/* EOF detected (end of file, or protocol-level EOF) */
915 			return false;
916 		}
917 
918 		if (fld_count == -1)
919 		{
920 			/*
921 			 * Received EOF marker.  Wait for the protocol-level EOF, and
922 			 * complain if it doesn't come immediately.  In COPY FROM STDIN,
923 			 * this ensures that we correctly handle CopyFail, if client
924 			 * chooses to send that now.  When copying from file, we could
925 			 * ignore the rest of the file like in text mode, but we choose to
926 			 * be consistent with the COPY FROM STDIN case.
927 			 */
928 			char		dummy;
929 
930 			if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
931 				ereport(ERROR,
932 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
933 						 errmsg("received copy data after EOF marker")));
934 			return false;
935 		}
936 
937 		if (fld_count != attr_count)
938 			ereport(ERROR,
939 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
940 					 errmsg("row field count is %d, expected %d",
941 							(int) fld_count, attr_count)));
942 
943 		foreach(cur, cstate->attnumlist)
944 		{
945 			int			attnum = lfirst_int(cur);
946 			int			m = attnum - 1;
947 			Form_pg_attribute att = TupleDescAttr(tupDesc, m);
948 
949 			cstate->cur_attname = NameStr(att->attname);
950 			values[m] = CopyReadBinaryAttribute(cstate,
951 												&in_functions[m],
952 												typioparams[m],
953 												att->atttypmod,
954 												&nulls[m]);
955 			cstate->cur_attname = NULL;
956 		}
957 	}
958 
959 	/*
960 	 * Now compute and insert any defaults available for the columns not
961 	 * provided by the input data.  Anything not processed here or above will
962 	 * remain NULL.
963 	 */
964 	for (i = 0; i < num_defaults; i++)
965 	{
966 		/*
967 		 * The caller must supply econtext and have switched into the
968 		 * per-tuple memory context in it.
969 		 */
970 		Assert(econtext != NULL);
971 		Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
972 
973 		values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext,
974 										 &nulls[defmap[i]]);
975 	}
976 
977 	return true;
978 }
979 
980 /*
981  * Read the next input line and stash it in line_buf.
982  *
983  * Result is true if read was terminated by EOF, false if terminated
984  * by newline.  The terminating newline or EOF marker is not included
985  * in the final value of line_buf.
986  */
987 static bool
CopyReadLine(CopyFromState cstate)988 CopyReadLine(CopyFromState cstate)
989 {
990 	bool		result;
991 
992 	resetStringInfo(&cstate->line_buf);
993 	cstate->line_buf_valid = false;
994 
995 	/* Parse data and transfer into line_buf */
996 	result = CopyReadLineText(cstate);
997 
998 	if (result)
999 	{
1000 		/*
1001 		 * Reached EOF.  In protocol version 3, we should ignore anything
1002 		 * after \. up to the protocol end of copy data.  (XXX maybe better
1003 		 * not to treat \. as special?)
1004 		 */
1005 		if (cstate->copy_src == COPY_FRONTEND)
1006 		{
1007 			int			inbytes;
1008 
1009 			do
1010 			{
1011 				inbytes = CopyGetData(cstate, cstate->input_buf,
1012 									  1, INPUT_BUF_SIZE);
1013 			} while (inbytes > 0);
1014 			cstate->input_buf_index = 0;
1015 			cstate->input_buf_len = 0;
1016 			cstate->raw_buf_index = 0;
1017 			cstate->raw_buf_len = 0;
1018 		}
1019 	}
1020 	else
1021 	{
1022 		/*
1023 		 * If we didn't hit EOF, then we must have transferred the EOL marker
1024 		 * to line_buf along with the data.  Get rid of it.
1025 		 */
1026 		switch (cstate->eol_type)
1027 		{
1028 			case EOL_NL:
1029 				Assert(cstate->line_buf.len >= 1);
1030 				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1031 				cstate->line_buf.len--;
1032 				cstate->line_buf.data[cstate->line_buf.len] = '\0';
1033 				break;
1034 			case EOL_CR:
1035 				Assert(cstate->line_buf.len >= 1);
1036 				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1037 				cstate->line_buf.len--;
1038 				cstate->line_buf.data[cstate->line_buf.len] = '\0';
1039 				break;
1040 			case EOL_CRNL:
1041 				Assert(cstate->line_buf.len >= 2);
1042 				Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1043 				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1044 				cstate->line_buf.len -= 2;
1045 				cstate->line_buf.data[cstate->line_buf.len] = '\0';
1046 				break;
1047 			case EOL_UNKNOWN:
1048 				/* shouldn't get here */
1049 				Assert(false);
1050 				break;
1051 		}
1052 	}
1053 
1054 	/* Now it's safe to use the buffer in error messages */
1055 	cstate->line_buf_valid = true;
1056 
1057 	return result;
1058 }
1059 
/*
 * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Scans input_buf starting at input_buf_index, looking for the end of the
 * current line, and moves the scanned bytes into line_buf.  Handles both
 * plain text and CSV format (selected by cstate->opts.csv_mode).
 *
 * Returns true if the scan ended because we ran out of input or recognized
 * the end-of-copy marker (\.), and false if it ended at a line terminator.
 * The first terminator seen fixes cstate->eol_type; a later terminator of
 * a different style is reported as an error.
 *
 * NOTE(review): REFILL_LINEBUF, IF_NEED_REFILL_AND_NOT_EOF_CONTINUE,
 * IF_NEED_REFILL_AND_EOF_BREAK and NO_END_OF_COPY_GOTO are macros defined
 * elsewhere in this file.  Judging by their names and the comments at their
 * call sites they can flush pending bytes to line_buf, "continue" or
 * "break" the scan loop, or jump to the not_end_of_copy label; they
 * presumably also read or set the locals need_data, hit_eof, input_buf_ptr
 * and copy_buf_len -- confirm against the macro definitions.
 */
static bool
CopyReadLineText(CopyFromState cstate)
{
	char	   *copy_input_buf;
	int			input_buf_ptr;
	int			copy_buf_len;
	bool		need_data = false;
	bool		hit_eof = false;
	bool		result = false;

	/* CSV variables */
	bool		first_char_in_line = true;
	bool		in_quote = false,
				last_was_esc = false;
	char		quotec = '\0';
	char		escapec = '\0';

	if (cstate->opts.csv_mode)
	{
		quotec = cstate->opts.quote[0];
		escapec = cstate->opts.escape[0];
		/* ignore special escape processing if it's the same as quotec */
		if (quotec == escapec)
			escapec = '\0';
	}

	/*
	 * The objective of this loop is to transfer the entire next input line
	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
	 * \n) and the end-of-copy marker (\.).
	 *
	 * In CSV mode, \r and \n inside a quoted field are just part of the data
	 * value and are put in line_buf.  We keep just enough state to know if we
	 * are currently in a quoted field or not.
	 *
	 * These four characters, and the CSV escape and quote characters, are
	 * assumed the same in frontend and backend encodings.
	 *
	 * The input has already been converted to the database encoding.  All
	 * supported server encodings have the property that all bytes in a
	 * multi-byte sequence have the high bit set, so a multibyte character
	 * cannot contain any newline or escape characters embedded in the
	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
	 * regardless of the encoding.
	 *
	 * For speed, we try to move data from input_buf to line_buf in chunks
	 * rather than one character at a time.  input_buf_ptr points to the next
	 * character to examine; any characters from input_buf_index to
	 * input_buf_ptr have been determined to be part of the line, but not yet
	 * transferred to line_buf.
	 *
	 * For a little extra speed within the loop, we copy input_buf and
	 * input_buf_len into local variables.
	 */
	copy_input_buf = cstate->input_buf;
	input_buf_ptr = cstate->input_buf_index;
	copy_buf_len = cstate->input_buf_len;

	for (;;)
	{
		int			prev_raw_ptr;
		char		c;

		/*
		 * Load more data if needed.  Ideally we would just force four bytes
		 * of read-ahead and avoid the many calls to
		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
		 * does not allow us to read too far ahead or we might read into the
		 * next data, so we read-ahead only as far as we know we can.  One
		 * optimization would be to read-ahead four bytes here if
		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
		 * considering the size of the buffer.
		 */
		if (input_buf_ptr >= copy_buf_len || need_data)
		{
			REFILL_LINEBUF;

			CopyLoadInputBuf(cstate);
			/* update our local variables */
			hit_eof = cstate->input_reached_eof;
			input_buf_ptr = cstate->input_buf_index;
			copy_buf_len = cstate->input_buf_len;

			/*
			 * If we are completely out of data, break out of the loop,
			 * reporting EOF.
			 */
			if (INPUT_BUF_BYTES(cstate) <= 0)
			{
				result = true;
				break;
			}
			need_data = false;
		}

		/*
		 * OK to fetch a character.  Remember where it was, so that if it
		 * turns out to start an end-of-copy marker we know where the real
		 * data stops.
		 */
		prev_raw_ptr = input_buf_ptr;
		c = copy_input_buf[input_buf_ptr++];

		if (cstate->opts.csv_mode)
		{
			/*
			 * If character is '\\' or '\r', we may need to look ahead below.
			 * Force fetch of the next character if we don't already have it.
			 * We need to do this before changing CSV state, in case one of
			 * these characters is also the quote or escape character.
			 *
			 * Note: old-protocol does not like forced prefetch, but it's OK
			 * here since we cannot validly be at EOF.
			 */
			if (c == '\\' || c == '\r')
			{
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			}

			/*
			 * Dealing with quotes and escapes here is mildly tricky. If the
			 * quote char is also the escape char, there's no problem - we
			 * just use the char as a toggle. If they are different, we need
			 * to ensure that we only take account of an escape inside a
			 * quoted field and immediately preceding a quote char, and not
			 * the second in an escape-escape sequence.
			 */
			if (in_quote && c == escapec)
				last_was_esc = !last_was_esc;
			if (c == quotec && !last_was_esc)
				in_quote = !in_quote;
			if (c != escapec)
				last_was_esc = false;

			/*
			 * Updating the line count for embedded CR and/or LF chars is
			 * necessarily a little fragile - this test is probably about the
			 * best we can do.  (XXX it's arguable whether we should do this
			 * at all --- is cur_lineno a physical or logical count?)
			 */
			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
				cstate->cur_lineno++;
		}

		/* Process \r (in CSV mode, only when it is outside a quoted field) */
		if (c == '\r' && (!cstate->opts.csv_mode || !in_quote))
		{
			/* Check for \r\n on first line, _and_ handle \r\n. */
			if (cstate->eol_type == EOL_UNKNOWN ||
				cstate->eol_type == EOL_CRNL)
			{
				/*
				 * If need more data, go back to loop top to load it.
				 *
				 * Note that if we are at EOF, c will wind up as '\0' because
				 * of the guaranteed pad of input_buf.
				 */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);

				/* get next char */
				c = copy_input_buf[input_buf_ptr];

				if (c == '\n')
				{
					input_buf_ptr++;	/* eat newline */
					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
				}
				else
				{
					/* found \r, but no \n */
					if (cstate->eol_type == EOL_CRNL)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 !cstate->opts.csv_mode ?
								 errmsg("literal carriage return found in data") :
								 errmsg("unquoted carriage return found in data"),
								 !cstate->opts.csv_mode ?
								 errhint("Use \"\\r\" to represent carriage return.") :
								 errhint("Use quoted CSV field to represent carriage return.")));

					/*
					 * if we got here, it is the first line and we didn't find
					 * \n, so don't consume the peeked character
					 */
					cstate->eol_type = EOL_CR;
				}
			}
			else if (cstate->eol_type == EOL_NL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !cstate->opts.csv_mode ?
						 errmsg("literal carriage return found in data") :
						 errmsg("unquoted carriage return found in data"),
						 !cstate->opts.csv_mode ?
						 errhint("Use \"\\r\" to represent carriage return.") :
						 errhint("Use quoted CSV field to represent carriage return.")));
			/* If reach here, we have found the line terminator */
			break;
		}

		/* Process \n (in CSV mode, only when it is outside a quoted field) */
		if (c == '\n' && (!cstate->opts.csv_mode || !in_quote))
		{
			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !cstate->opts.csv_mode ?
						 errmsg("literal newline found in data") :
						 errmsg("unquoted newline found in data"),
						 !cstate->opts.csv_mode ?
						 errhint("Use \"\\n\" to represent newline.") :
						 errhint("Use quoted CSV field to represent newline.")));
			cstate->eol_type = EOL_NL;	/* in case not set yet */
			/* If reach here, we have found the line terminator */
			break;
		}

		/*
		 * In CSV mode, we only recognize \. alone on a line.  This is because
		 * \. is a valid CSV data value.
		 */
		if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line))
		{
			char		c2;

			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			IF_NEED_REFILL_AND_EOF_BREAK(0);

			/* -----
			 * get next character
			 * Note: we do not change c so if it isn't \., we can fall
			 * through and continue processing.
			 * -----
			 */
			c2 = copy_input_buf[input_buf_ptr];

			if (c2 == '.')
			{
				input_buf_ptr++;	/* consume the '.' */

				/*
				 * Note: if we loop back for more data here, it does not
				 * matter that the CSV state change checks are re-executed; we
				 * will come back here with no important state changed.
				 */
				if (cstate->eol_type == EOL_CRNL)
				{
					/* Get the next character */
					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
					/* if hit_eof, c2 will become '\0' */
					c2 = copy_input_buf[input_buf_ptr++];

					/*
					 * With CRNL line endings the marker must be followed by
					 * \r here; the \n is checked further down.
					 */
					if (c2 == '\n')
					{
						if (!cstate->opts.csv_mode)
							ereport(ERROR,
									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
									 errmsg("end-of-copy marker does not match previous newline style")));
						else
							NO_END_OF_COPY_GOTO;
					}
					else if (c2 != '\r')
					{
						if (!cstate->opts.csv_mode)
							ereport(ERROR,
									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
									 errmsg("end-of-copy marker corrupt")));
						else
							NO_END_OF_COPY_GOTO;
					}
				}

				/* Get the next character */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
				/* if hit_eof, c2 will become '\0' */
				c2 = copy_input_buf[input_buf_ptr++];

				/* Anything but a newline character here means it's not \. alone */
				if (c2 != '\r' && c2 != '\n')
				{
					if (!cstate->opts.csv_mode)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker corrupt")));
					else
						NO_END_OF_COPY_GOTO;
				}

				/* The marker's terminator must match the established EOL style */
				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
					(cstate->eol_type == EOL_CR && c2 != '\r'))
				{
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker does not match previous newline style")));
				}

				/*
				 * Transfer only the data before the \. into line_buf, then
				 * discard the data and the \. sequence.
				 */
				if (prev_raw_ptr > cstate->input_buf_index)
					appendBinaryStringInfo(&cstate->line_buf,
										   cstate->input_buf + cstate->input_buf_index,
										   prev_raw_ptr - cstate->input_buf_index);
				cstate->input_buf_index = input_buf_ptr;
				result = true;	/* report EOF */
				break;
			}
			else if (!cstate->opts.csv_mode)
			{
				/*
				 * If we are here, it means we found a backslash followed by
				 * something other than a period.  In non-CSV mode, anything
				 * after a backslash is special, so we skip over that second
				 * character too.  If we didn't do that \\. would be
				 * considered an end-of-copy marker, while in non-CSV mode it
				 * is a literal backslash followed by a period.  In CSV mode,
				 * backslashes are not special, so we want to process the
				 * character after the backslash just like a normal character,
				 * so we don't increment in those cases.
				 */
				input_buf_ptr++;
			}
		}

		/*
		 * This label is for CSV cases where \. appears at the start of a
		 * line, but there is more text after it, meaning it was a data value.
		 * We are more strict for \. in CSV mode because \. could be a data
		 * value, while in non-CSV mode, \. cannot be a data value.
		 */
not_end_of_copy:
		first_char_in_line = false;
	}							/* end of outer loop */

	/*
	 * Transfer any still-uncopied data to line_buf.
	 */
	REFILL_LINEBUF;

	return result;
}
1401 
/*
 * GetDecimalFromHex - numeric value (0..15) of one hexadecimal digit.
 *
 * The argument is expected to be a valid hex digit in either case; callers
 * check that with isxdigit() before calling.
 */
static int
GetDecimalFromHex(char hex)
{
	/* <ctype.h> functions must be handed an unsigned char value */
	unsigned char uc = (unsigned char) hex;

	return isdigit(uc) ? (hex - '0') : (tolower(uc) - 'a' + 10);
}
1413 
1414 /*
1415  * Parse the current line into separate attributes (fields),
1416  * performing de-escaping as needed.
1417  *
1418  * The input is in line_buf.  We use attribute_buf to hold the result
1419  * strings.  cstate->raw_fields[k] is set to point to the k'th attribute
1420  * string, or NULL when the input matches the null marker string.
1421  * This array is expanded as necessary.
1422  *
1423  * (Note that the caller cannot check for nulls since the returned
1424  * string would be the post-de-escaping equivalent, which may look
1425  * the same as some valid data string.)
1426  *
1427  * delim is the column delimiter string (must be just one byte for now).
1428  * null_print is the null marker string.  Note that this is compared to
1429  * the pre-de-escaped input string.
1430  *
1431  * The return value is the number of fields actually read.
1432  */
1433 static int
CopyReadAttributesText(CopyFromState cstate)1434 CopyReadAttributesText(CopyFromState cstate)
1435 {
1436 	char		delimc = cstate->opts.delim[0];
1437 	int			fieldno;
1438 	char	   *output_ptr;
1439 	char	   *cur_ptr;
1440 	char	   *line_end_ptr;
1441 
1442 	/*
1443 	 * We need a special case for zero-column tables: check that the input
1444 	 * line is empty, and return.
1445 	 */
1446 	if (cstate->max_fields <= 0)
1447 	{
1448 		if (cstate->line_buf.len != 0)
1449 			ereport(ERROR,
1450 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1451 					 errmsg("extra data after last expected column")));
1452 		return 0;
1453 	}
1454 
1455 	resetStringInfo(&cstate->attribute_buf);
1456 
1457 	/*
1458 	 * The de-escaped attributes will certainly not be longer than the input
1459 	 * data line, so we can just force attribute_buf to be large enough and
1460 	 * then transfer data without any checks for enough space.  We need to do
1461 	 * it this way because enlarging attribute_buf mid-stream would invalidate
1462 	 * pointers already stored into cstate->raw_fields[].
1463 	 */
1464 	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1465 		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1466 	output_ptr = cstate->attribute_buf.data;
1467 
1468 	/* set pointer variables for loop */
1469 	cur_ptr = cstate->line_buf.data;
1470 	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1471 
1472 	/* Outer loop iterates over fields */
1473 	fieldno = 0;
1474 	for (;;)
1475 	{
1476 		bool		found_delim = false;
1477 		char	   *start_ptr;
1478 		char	   *end_ptr;
1479 		int			input_len;
1480 		bool		saw_non_ascii = false;
1481 
1482 		/* Make sure there is enough space for the next value */
1483 		if (fieldno >= cstate->max_fields)
1484 		{
1485 			cstate->max_fields *= 2;
1486 			cstate->raw_fields =
1487 				repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1488 		}
1489 
1490 		/* Remember start of field on both input and output sides */
1491 		start_ptr = cur_ptr;
1492 		cstate->raw_fields[fieldno] = output_ptr;
1493 
1494 		/*
1495 		 * Scan data for field.
1496 		 *
1497 		 * Note that in this loop, we are scanning to locate the end of field
1498 		 * and also speculatively performing de-escaping.  Once we find the
1499 		 * end-of-field, we can match the raw field contents against the null
1500 		 * marker string.  Only after that comparison fails do we know that
1501 		 * de-escaping is actually the right thing to do; therefore we *must
1502 		 * not* throw any syntax errors before we've done the null-marker
1503 		 * check.
1504 		 */
1505 		for (;;)
1506 		{
1507 			char		c;
1508 
1509 			end_ptr = cur_ptr;
1510 			if (cur_ptr >= line_end_ptr)
1511 				break;
1512 			c = *cur_ptr++;
1513 			if (c == delimc)
1514 			{
1515 				found_delim = true;
1516 				break;
1517 			}
1518 			if (c == '\\')
1519 			{
1520 				if (cur_ptr >= line_end_ptr)
1521 					break;
1522 				c = *cur_ptr++;
1523 				switch (c)
1524 				{
1525 					case '0':
1526 					case '1':
1527 					case '2':
1528 					case '3':
1529 					case '4':
1530 					case '5':
1531 					case '6':
1532 					case '7':
1533 						{
1534 							/* handle \013 */
1535 							int			val;
1536 
1537 							val = OCTVALUE(c);
1538 							if (cur_ptr < line_end_ptr)
1539 							{
1540 								c = *cur_ptr;
1541 								if (ISOCTAL(c))
1542 								{
1543 									cur_ptr++;
1544 									val = (val << 3) + OCTVALUE(c);
1545 									if (cur_ptr < line_end_ptr)
1546 									{
1547 										c = *cur_ptr;
1548 										if (ISOCTAL(c))
1549 										{
1550 											cur_ptr++;
1551 											val = (val << 3) + OCTVALUE(c);
1552 										}
1553 									}
1554 								}
1555 							}
1556 							c = val & 0377;
1557 							if (c == '\0' || IS_HIGHBIT_SET(c))
1558 								saw_non_ascii = true;
1559 						}
1560 						break;
1561 					case 'x':
1562 						/* Handle \x3F */
1563 						if (cur_ptr < line_end_ptr)
1564 						{
1565 							char		hexchar = *cur_ptr;
1566 
1567 							if (isxdigit((unsigned char) hexchar))
1568 							{
1569 								int			val = GetDecimalFromHex(hexchar);
1570 
1571 								cur_ptr++;
1572 								if (cur_ptr < line_end_ptr)
1573 								{
1574 									hexchar = *cur_ptr;
1575 									if (isxdigit((unsigned char) hexchar))
1576 									{
1577 										cur_ptr++;
1578 										val = (val << 4) + GetDecimalFromHex(hexchar);
1579 									}
1580 								}
1581 								c = val & 0xff;
1582 								if (c == '\0' || IS_HIGHBIT_SET(c))
1583 									saw_non_ascii = true;
1584 							}
1585 						}
1586 						break;
1587 					case 'b':
1588 						c = '\b';
1589 						break;
1590 					case 'f':
1591 						c = '\f';
1592 						break;
1593 					case 'n':
1594 						c = '\n';
1595 						break;
1596 					case 'r':
1597 						c = '\r';
1598 						break;
1599 					case 't':
1600 						c = '\t';
1601 						break;
1602 					case 'v':
1603 						c = '\v';
1604 						break;
1605 
1606 						/*
1607 						 * in all other cases, take the char after '\'
1608 						 * literally
1609 						 */
1610 				}
1611 			}
1612 
1613 			/* Add c to output string */
1614 			*output_ptr++ = c;
1615 		}
1616 
1617 		/* Check whether raw input matched null marker */
1618 		input_len = end_ptr - start_ptr;
1619 		if (input_len == cstate->opts.null_print_len &&
1620 			strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1621 			cstate->raw_fields[fieldno] = NULL;
1622 		else
1623 		{
1624 			/*
1625 			 * At this point we know the field is supposed to contain data.
1626 			 *
1627 			 * If we de-escaped any non-7-bit-ASCII chars, make sure the
1628 			 * resulting string is valid data for the db encoding.
1629 			 */
1630 			if (saw_non_ascii)
1631 			{
1632 				char	   *fld = cstate->raw_fields[fieldno];
1633 
1634 				pg_verifymbstr(fld, output_ptr - fld, false);
1635 			}
1636 		}
1637 
1638 		/* Terminate attribute value in output area */
1639 		*output_ptr++ = '\0';
1640 
1641 		fieldno++;
1642 		/* Done if we hit EOL instead of a delim */
1643 		if (!found_delim)
1644 			break;
1645 	}
1646 
1647 	/* Clean up state of attribute_buf */
1648 	output_ptr--;
1649 	Assert(*output_ptr == '\0');
1650 	cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1651 
1652 	return fieldno;
1653 }
1654 
1655 /*
1656  * Parse the current line into separate attributes (fields),
1657  * performing de-escaping as needed.  This has exactly the same API as
1658  * CopyReadAttributesText, except we parse the fields according to
1659  * "standard" (i.e. common) CSV usage.
1660  */
1661 static int
CopyReadAttributesCSV(CopyFromState cstate)1662 CopyReadAttributesCSV(CopyFromState cstate)
1663 {
1664 	char		delimc = cstate->opts.delim[0];
1665 	char		quotec = cstate->opts.quote[0];
1666 	char		escapec = cstate->opts.escape[0];
1667 	int			fieldno;
1668 	char	   *output_ptr;
1669 	char	   *cur_ptr;
1670 	char	   *line_end_ptr;
1671 
1672 	/*
1673 	 * We need a special case for zero-column tables: check that the input
1674 	 * line is empty, and return.
1675 	 */
1676 	if (cstate->max_fields <= 0)
1677 	{
1678 		if (cstate->line_buf.len != 0)
1679 			ereport(ERROR,
1680 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1681 					 errmsg("extra data after last expected column")));
1682 		return 0;
1683 	}
1684 
1685 	resetStringInfo(&cstate->attribute_buf);
1686 
1687 	/*
1688 	 * The de-escaped attributes will certainly not be longer than the input
1689 	 * data line, so we can just force attribute_buf to be large enough and
1690 	 * then transfer data without any checks for enough space.  We need to do
1691 	 * it this way because enlarging attribute_buf mid-stream would invalidate
1692 	 * pointers already stored into cstate->raw_fields[].
1693 	 */
1694 	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1695 		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1696 	output_ptr = cstate->attribute_buf.data;
1697 
1698 	/* set pointer variables for loop */
1699 	cur_ptr = cstate->line_buf.data;
1700 	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1701 
1702 	/* Outer loop iterates over fields */
1703 	fieldno = 0;
1704 	for (;;)
1705 	{
1706 		bool		found_delim = false;
1707 		bool		saw_quote = false;
1708 		char	   *start_ptr;
1709 		char	   *end_ptr;
1710 		int			input_len;
1711 
1712 		/* Make sure there is enough space for the next value */
1713 		if (fieldno >= cstate->max_fields)
1714 		{
1715 			cstate->max_fields *= 2;
1716 			cstate->raw_fields =
1717 				repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1718 		}
1719 
1720 		/* Remember start of field on both input and output sides */
1721 		start_ptr = cur_ptr;
1722 		cstate->raw_fields[fieldno] = output_ptr;
1723 
1724 		/*
1725 		 * Scan data for field,
1726 		 *
1727 		 * The loop starts in "not quote" mode and then toggles between that
1728 		 * and "in quote" mode. The loop exits normally if it is in "not
1729 		 * quote" mode and a delimiter or line end is seen.
1730 		 */
1731 		for (;;)
1732 		{
1733 			char		c;
1734 
1735 			/* Not in quote */
1736 			for (;;)
1737 			{
1738 				end_ptr = cur_ptr;
1739 				if (cur_ptr >= line_end_ptr)
1740 					goto endfield;
1741 				c = *cur_ptr++;
1742 				/* unquoted field delimiter */
1743 				if (c == delimc)
1744 				{
1745 					found_delim = true;
1746 					goto endfield;
1747 				}
1748 				/* start of quoted field (or part of field) */
1749 				if (c == quotec)
1750 				{
1751 					saw_quote = true;
1752 					break;
1753 				}
1754 				/* Add c to output string */
1755 				*output_ptr++ = c;
1756 			}
1757 
1758 			/* In quote */
1759 			for (;;)
1760 			{
1761 				end_ptr = cur_ptr;
1762 				if (cur_ptr >= line_end_ptr)
1763 					ereport(ERROR,
1764 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1765 							 errmsg("unterminated CSV quoted field")));
1766 
1767 				c = *cur_ptr++;
1768 
1769 				/* escape within a quoted field */
1770 				if (c == escapec)
1771 				{
1772 					/*
1773 					 * peek at the next char if available, and escape it if it
1774 					 * is an escape char or a quote char
1775 					 */
1776 					if (cur_ptr < line_end_ptr)
1777 					{
1778 						char		nextc = *cur_ptr;
1779 
1780 						if (nextc == escapec || nextc == quotec)
1781 						{
1782 							*output_ptr++ = nextc;
1783 							cur_ptr++;
1784 							continue;
1785 						}
1786 					}
1787 				}
1788 
1789 				/*
1790 				 * end of quoted field. Must do this test after testing for
1791 				 * escape in case quote char and escape char are the same
1792 				 * (which is the common case).
1793 				 */
1794 				if (c == quotec)
1795 					break;
1796 
1797 				/* Add c to output string */
1798 				*output_ptr++ = c;
1799 			}
1800 		}
1801 endfield:
1802 
1803 		/* Terminate attribute value in output area */
1804 		*output_ptr++ = '\0';
1805 
1806 		/* Check whether raw input matched null marker */
1807 		input_len = end_ptr - start_ptr;
1808 		if (!saw_quote && input_len == cstate->opts.null_print_len &&
1809 			strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1810 			cstate->raw_fields[fieldno] = NULL;
1811 
1812 		fieldno++;
1813 		/* Done if we hit EOL instead of a delim */
1814 		if (!found_delim)
1815 			break;
1816 	}
1817 
1818 	/* Clean up state of attribute_buf */
1819 	output_ptr--;
1820 	Assert(*output_ptr == '\0');
1821 	cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1822 
1823 	return fieldno;
1824 }
1825 
1826 
1827 /*
1828  * Read a binary attribute
1829  */
1830 static Datum
CopyReadBinaryAttribute(CopyFromState cstate,FmgrInfo * flinfo,Oid typioparam,int32 typmod,bool * isnull)1831 CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
1832 						Oid typioparam, int32 typmod,
1833 						bool *isnull)
1834 {
1835 	int32		fld_size;
1836 	Datum		result;
1837 
1838 	if (!CopyGetInt32(cstate, &fld_size))
1839 		ereport(ERROR,
1840 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1841 				 errmsg("unexpected EOF in COPY data")));
1842 	if (fld_size == -1)
1843 	{
1844 		*isnull = true;
1845 		return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
1846 	}
1847 	if (fld_size < 0)
1848 		ereport(ERROR,
1849 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1850 				 errmsg("invalid field size")));
1851 
1852 	/* reset attribute_buf to empty, and load raw data in it */
1853 	resetStringInfo(&cstate->attribute_buf);
1854 
1855 	enlargeStringInfo(&cstate->attribute_buf, fld_size);
1856 	if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
1857 						   fld_size) != fld_size)
1858 		ereport(ERROR,
1859 				(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1860 				 errmsg("unexpected EOF in COPY data")));
1861 
1862 	cstate->attribute_buf.len = fld_size;
1863 	cstate->attribute_buf.data[fld_size] = '\0';
1864 
1865 	/* Call the column type's binary input converter */
1866 	result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
1867 								 typioparam, typmod);
1868 
1869 	/* Trouble if it didn't eat the whole buffer */
1870 	if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
1871 		ereport(ERROR,
1872 				(errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
1873 				 errmsg("incorrect binary data format")));
1874 
1875 	*isnull = false;
1876 	return result;
1877 }
1878