1 /*-------------------------------------------------------------------------
2 *
3 * copyfromparse.c
4 * Parse CSV/text/binary format for COPY FROM.
5 *
6 * This file contains routines to parse the text, CSV and binary input
7 * formats. The main entry point is NextCopyFrom(), which parses the
8 * next input line and returns it as Datums.
9 *
10 * In text/CSV mode, the parsing happens in multiple stages:
11 *
12 * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 * 1. 2. 3. 4.
14 *
15 * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 * places it into 'raw_buf'.
17 *
18 * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 * the data in 'raw_buf' from client to server encoding, placing the
20 * converted result in 'input_buf'.
21 *
22 * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 * It is responsible for finding the next newline marker, taking quote and
24 * escape characters into account according to the COPY options. The line
25 * is copied into 'line_buf', with quotes and escape characters still
26 * intact.
27 *
28 * 4. CopyReadAttributesText/CSV() function takes the input line from
29 * 'line_buf', and splits it into fields, unescaping the data as required.
30 * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 * pointers to each field.
32 *
33 * If encoding conversion is not required, a shortcut is taken in step 2 to
34 * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 * the data is valid in the current encoding.
38 *
 * In binary mode, the pipeline is much simpler. Input is loaded into
 * 'raw_buf', and encoding conversion is done in the datatype-specific
 * receive functions, if required. 'input_buf' and 'line_buf' are not used,
 * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
 * data when it's passed to the receive function.
44 *
45 * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 * and 'attribute_buf' are expanded on demand, to hold the longest line
48 * encountered so far.
49 *
50 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
51 * Portions Copyright (c) 1994, Regents of the University of California
52 *
53 *
54 * IDENTIFICATION
55 * src/backend/commands/copyfromparse.c
56 *
57 *-------------------------------------------------------------------------
58 */
59 #include "postgres.h"
60
61 #include <ctype.h>
62 #include <unistd.h>
63 #include <sys/stat.h>
64
65 #include "commands/copy.h"
66 #include "commands/copyfrom_internal.h"
67 #include "commands/progress.h"
68 #include "executor/executor.h"
69 #include "libpq/libpq.h"
70 #include "libpq/pqformat.h"
71 #include "mb/pg_wchar.h"
72 #include "miscadmin.h"
73 #include "pgstat.h"
74 #include "port/pg_bswap.h"
75 #include "utils/memutils.h"
76 #include "utils/rel.h"
77
/*
 * Octal-digit helpers.
 *
 * ISOCTAL(c) is nonzero when c is one of the characters '0' through '7';
 * note that it evaluates its argument twice.  OCTVALUE(c) yields the
 * numeric value of such a digit character.
 */
#define ISOCTAL(c) ((c) >= '0' && (c) <= '7')
#define OCTVALUE(c) ((c) - '0')
80
81 /*
82 * These macros centralize code used to process line_buf and input_buf buffers.
83 * They are macros because they often do continue/break control and to avoid
84 * function call overhead in tight COPY loops.
85 *
86 * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
87 * prevent the continue/break processing from working. We end the "if (1)"
88 * with "else ((void) 0)" to ensure the "if" does not unintentionally match
89 * any "else" in the calling code, and to avoid any compiler warnings about
90 * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
91 */
92
/*
 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
 *
 * This keeps the character read at the top of the loop in the buffer
 * even if there is more than one read-ahead.
 *
 * If looking 'extralen' bytes past input_buf_ptr would reach or pass
 * copy_buf_len while hit_eof is still false, rewind input_buf_ptr to
 * prev_raw_ptr, set need_data, and 'continue' the enclosing loop.
 * Otherwise the macro does nothing.
 *
 * The macro reads and writes variables of the scope it is expanded in
 * (input_buf_ptr, prev_raw_ptr, copy_buf_len, hit_eof, need_data), and must
 * be expanded directly inside the loop that the 'continue' should affect.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
	{ \
		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
		need_data = true; \
		continue; \
	} \
} else ((void) 0)
107
/*
 * IF_NEED_REFILL_AND_EOF_BREAK(extralen)
 *
 * This consumes the remainder of the buffer and breaks.
 *
 * If looking 'extralen' bytes past input_buf_ptr would reach or pass
 * copy_buf_len and hit_eof is already true, set result to true and 'break'
 * out of the enclosing loop or switch.  When 'extralen' is nonzero,
 * input_buf_ptr is first advanced to copy_buf_len.  Otherwise the macro
 * does nothing.
 *
 * The macro reads and writes variables of the scope it is expanded in
 * (input_buf_ptr, copy_buf_len, hit_eof, result).
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
	{ \
		if (extralen) \
			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
		/* backslash just before EOF, treat as data char */ \
		result = true; \
		break; \
	} \
} else ((void) 0)
121
/*
 * REFILL_LINEBUF
 *
 * Transfer any approved data to line_buf; must do this to be sure
 * there is some room in input_buf.
 *
 * Appends the bytes of cstate->input_buf lying between
 * cstate->input_buf_index and the local input_buf_ptr to cstate->line_buf,
 * then advances cstate->input_buf_index to input_buf_ptr.  Does nothing if
 * input_buf_ptr is not ahead of cstate->input_buf_index.
 *
 * Requires 'cstate' and 'input_buf_ptr' in the expanding scope.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
	if (input_buf_ptr > cstate->input_buf_index) \
	{ \
		appendBinaryStringInfo(&cstate->line_buf, \
							 cstate->input_buf + cstate->input_buf_index, \
							   input_buf_ptr - cstate->input_buf_index); \
		cstate->input_buf_index = input_buf_ptr; \
	} \
} else ((void) 0)
137
/*
 * NO_END_OF_COPY_GOTO
 *
 * Undo any read-ahead and jump out of the block.
 *
 * Resets input_buf_ptr to the byte just after prev_raw_ptr and jumps to the
 * label 'not_end_of_copy', which the expanding function must define.
 * Requires 'input_buf_ptr' and 'prev_raw_ptr' in the expanding scope.
 */
#define NO_END_OF_COPY_GOTO \
if (1) \
{ \
	input_buf_ptr = prev_raw_ptr + 1; \
	goto not_end_of_copy; \
} else ((void) 0)
145
/*
 * Signature expected at the start of binary-format COPY input;
 * ReceiveCopyBinaryHeader() compares the first 11 input bytes against it
 * with memcmp().  The array is sized to exactly 11 bytes, so the string
 * literal's implicit terminating NUL is not stored.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
148
149
/* non-export function prototypes */

/* Line reading and field splitting for text/CSV input */
static bool CopyReadLine(CopyFromState cstate);
static bool CopyReadLineText(CopyFromState cstate);
static int	CopyReadAttributesText(CopyFromState cstate);
static int	CopyReadAttributesCSV(CopyFromState cstate);

/* Reads one attribute's value from binary-format input */
static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
									 Oid typioparam, int32 typmod,
									 bool *isnull);


/* Low-level communications functions */

/* Fetch between minread and maxread bytes from the COPY data source */
static int	CopyGetData(CopyFromState cstate, void *databuf,
						int minread, int maxread);
/* Read network-byte-order integers; these return false on EOF */
static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
/* Buffer management for the text/CSV and binary pipelines */
static void CopyLoadInputBuf(CopyFromState cstate);
static int	CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
167
168 void
ReceiveCopyBegin(CopyFromState cstate)169 ReceiveCopyBegin(CopyFromState cstate)
170 {
171 StringInfoData buf;
172 int natts = list_length(cstate->attnumlist);
173 int16 format = (cstate->opts.binary ? 1 : 0);
174 int i;
175
176 pq_beginmessage(&buf, 'G');
177 pq_sendbyte(&buf, format); /* overall format */
178 pq_sendint16(&buf, natts);
179 for (i = 0; i < natts; i++)
180 pq_sendint16(&buf, format); /* per-column formats */
181 pq_endmessage(&buf);
182 cstate->copy_src = COPY_FRONTEND;
183 cstate->fe_msgbuf = makeStringInfo();
184 /* We *must* flush here to ensure FE knows it can send. */
185 pq_flush();
186 }
187
188 void
ReceiveCopyBinaryHeader(CopyFromState cstate)189 ReceiveCopyBinaryHeader(CopyFromState cstate)
190 {
191 char readSig[11];
192 int32 tmp;
193
194 /* Signature */
195 if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
196 memcmp(readSig, BinarySignature, 11) != 0)
197 ereport(ERROR,
198 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
199 errmsg("COPY file signature not recognized")));
200 /* Flags field */
201 if (!CopyGetInt32(cstate, &tmp))
202 ereport(ERROR,
203 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
204 errmsg("invalid COPY file header (missing flags)")));
205 if ((tmp & (1 << 16)) != 0)
206 ereport(ERROR,
207 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
208 errmsg("invalid COPY file header (WITH OIDS)")));
209 tmp &= ~(1 << 16);
210 if ((tmp >> 16) != 0)
211 ereport(ERROR,
212 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
213 errmsg("unrecognized critical flags in COPY file header")));
214 /* Header extension length */
215 if (!CopyGetInt32(cstate, &tmp) ||
216 tmp < 0)
217 ereport(ERROR,
218 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
219 errmsg("invalid COPY file header (missing length)")));
220 /* Skip extension header, if present */
221 while (tmp-- > 0)
222 {
223 if (CopyReadBinaryData(cstate, readSig, 1) != 1)
224 ereport(ERROR,
225 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
226 errmsg("invalid COPY file header (wrong length)")));
227 }
228 }
229
/*
 * CopyGetData reads data from the source (file or frontend)
 *
 * We attempt to read at least minread, and at most maxread, bytes from
 * the source.  The actual number of bytes read is returned; if this is
 * less than minread, EOF was detected.
 *
 * Note: when copying from the frontend, we expect a proper EOF mark per
 * protocol; if the frontend simply drops the connection, we raise error.
 * It seems unwise to allow the COPY IN to complete normally in that case.
 *
 * NB: no data conversion is applied here.
 *
 * Per-source behavior, as implemented below:
 *	COPY_FILE: a single fread() of up to maxread bytes; minread is not
 *		consulted.  A read of zero bytes sets cstate->raw_reached_eof.
 *	COPY_FRONTEND: bytes are taken from cstate->fe_msgbuf, fetching further
 *		protocol messages as needed, until minread bytes have been
 *		collected, maxread is exhausted, or CopyDone arrives (which sets
 *		cstate->raw_reached_eof).
 *	COPY_CALLBACK: the request is passed to cstate->data_source_cb.
 *
 * 'databuf' must have room for maxread bytes.
 */
static int
CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
{
	int			bytesread = 0;

	switch (cstate->copy_src)
	{
		case COPY_FILE:
			bytesread = fread(databuf, 1, maxread, cstate->copy_file);
			if (ferror(cstate->copy_file))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read from COPY file: %m")));
			if (bytesread == 0)
				cstate->raw_reached_eof = true;
			break;
		case COPY_FRONTEND:
			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
			{
				int			avail;

				/* Current message exhausted?  Then fetch another one. */
				while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
				{
					/* Try to receive another message */
					int			mtype;
					int			maxmsglen;

			readmessage:

					/*
					 * Cancel interrupts stay held from here until the whole
					 * message has been collected by pq_getmessage() below.
					 */
					HOLD_CANCEL_INTERRUPTS();
					pq_startmsgread();
					mtype = pq_getbyte();
					if (mtype == EOF)
						ereport(ERROR,
								(errcode(ERRCODE_CONNECTION_FAILURE),
								 errmsg("unexpected EOF on client connection with an open transaction")));
					/* Validate message type and set packet size limit */
					switch (mtype)
					{
						case 'd':	/* CopyData */
							maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
							break;
						case 'c':	/* CopyDone */
						case 'f':	/* CopyFail */
						case 'H':	/* Flush */
						case 'S':	/* Sync */
							maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
							break;
						default:
							ereport(ERROR,
									(errcode(ERRCODE_PROTOCOL_VIOLATION),
									 errmsg("unexpected message type 0x%02X during COPY from stdin",
											mtype)));
							maxmsglen = 0;	/* keep compiler quiet */
							break;
					}
					/* Now collect the message body */
					if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
						ereport(ERROR,
								(errcode(ERRCODE_CONNECTION_FAILURE),
								 errmsg("unexpected EOF on client connection with an open transaction")));
					RESUME_CANCEL_INTERRUPTS();
					/* ... and process it */
					switch (mtype)
					{
						case 'd':	/* CopyData */
							break;
						case 'c':	/* CopyDone */
							/* COPY IN correctly terminated by frontend */
							cstate->raw_reached_eof = true;
							/* return whatever was gathered before CopyDone */
							return bytesread;
						case 'f':	/* CopyFail */
							ereport(ERROR,
									(errcode(ERRCODE_QUERY_CANCELED),
									 errmsg("COPY from stdin failed: %s",
											pq_getmsgstring(cstate->fe_msgbuf))));
							break;
						case 'H':	/* Flush */
						case 'S':	/* Sync */

							/*
							 * Ignore Flush/Sync for the convenience of client
							 * libraries (such as libpq) that may send those
							 * without noticing that the command they just
							 * sent was COPY.
							 */
							goto readmessage;
						default:
							Assert(false);	/* NOT REACHED */
					}
				}

				/* Hand over as much of the current message as still fits */
				avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
				if (avail > maxread)
					avail = maxread;
				pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
				databuf = (void *) ((char *) databuf + avail);
				maxread -= avail;
				bytesread += avail;
			}
			break;
		case COPY_CALLBACK:
			bytesread = cstate->data_source_cb(databuf, minread, maxread);
			break;
	}

	return bytesread;
}
349
350
351 /*
352 * These functions do apply some data conversion
353 */
354
355 /*
356 * CopyGetInt32 reads an int32 that appears in network byte order
357 *
358 * Returns true if OK, false if EOF
359 */
360 static inline bool
CopyGetInt32(CopyFromState cstate,int32 * val)361 CopyGetInt32(CopyFromState cstate, int32 *val)
362 {
363 uint32 buf;
364
365 if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
366 {
367 *val = 0; /* suppress compiler warning */
368 return false;
369 }
370 *val = (int32) pg_ntoh32(buf);
371 return true;
372 }
373
374 /*
375 * CopyGetInt16 reads an int16 that appears in network byte order
376 */
377 static inline bool
CopyGetInt16(CopyFromState cstate,int16 * val)378 CopyGetInt16(CopyFromState cstate, int16 *val)
379 {
380 uint16 buf;
381
382 if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
383 {
384 *val = 0; /* suppress compiler warning */
385 return false;
386 }
387 *val = (int16) pg_ntoh16(buf);
388 return true;
389 }
390
391
392 /*
393 * Perform encoding conversion on data in 'raw_buf', writing the converted
394 * data into 'input_buf'.
395 *
396 * On entry, there must be some data to convert in 'raw_buf'.
397 */
398 static void
CopyConvertBuf(CopyFromState cstate)399 CopyConvertBuf(CopyFromState cstate)
400 {
401 /*
402 * If the file and server encoding are the same, no encoding conversion is
403 * required. However, we still need to verify that the input is valid for
404 * the encoding.
405 */
406 if (!cstate->need_transcoding)
407 {
408 /*
409 * When conversion is not required, input_buf and raw_buf are the
410 * same. raw_buf_len is the total number of bytes in the buffer, and
411 * input_buf_len tracks how many of those bytes have already been
412 * verified.
413 */
414 int preverifiedlen = cstate->input_buf_len;
415 int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
416 int nverified;
417
418 if (unverifiedlen == 0)
419 {
420 /*
421 * If no more raw data is coming, report the EOF to the caller.
422 */
423 if (cstate->raw_reached_eof)
424 cstate->input_reached_eof = true;
425 return;
426 }
427
428 /*
429 * Verify the new data, including any residual unverified bytes from
430 * previous round.
431 */
432 nverified = pg_encoding_verifymbstr(cstate->file_encoding,
433 cstate->raw_buf + preverifiedlen,
434 unverifiedlen);
435 if (nverified == 0)
436 {
437 /*
438 * Could not verify anything.
439 *
440 * If there is no more raw input data coming, it means that there
441 * was an incomplete multi-byte sequence at the end. Also, if
442 * there's "enough" input left, we should be able to verify at
443 * least one character, and a failure to do so means that we've
444 * hit an invalid byte sequence.
445 */
446 if (cstate->raw_reached_eof || unverifiedlen >= pg_database_encoding_max_length())
447 cstate->input_reached_error = true;
448 return;
449 }
450 cstate->input_buf_len += nverified;
451 }
452 else
453 {
454 /*
455 * Encoding conversion is needed.
456 */
457 int nbytes;
458 unsigned char *src;
459 int srclen;
460 unsigned char *dst;
461 int dstlen;
462 int convertedlen;
463
464 if (RAW_BUF_BYTES(cstate) == 0)
465 {
466 /*
467 * If no more raw data is coming, report the EOF to the caller.
468 */
469 if (cstate->raw_reached_eof)
470 cstate->input_reached_eof = true;
471 return;
472 }
473
474 /*
475 * First, copy down any unprocessed data.
476 */
477 nbytes = INPUT_BUF_BYTES(cstate);
478 if (nbytes > 0 && cstate->input_buf_index > 0)
479 memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
480 nbytes);
481 cstate->input_buf_index = 0;
482 cstate->input_buf_len = nbytes;
483 cstate->input_buf[nbytes] = '\0';
484
485 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
486 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
487 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
488 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
489
490 /*
491 * Do the conversion. This might stop short, if there is an invalid
492 * byte sequence in the input. We'll convert as much as we can in
493 * that case.
494 *
495 * Note: Even if we hit an invalid byte sequence, we don't report the
496 * error until all the valid bytes have been consumed. The input
497 * might contain an end-of-input marker (\.), and we don't want to
498 * report an error if the invalid byte sequence is after the
499 * end-of-input marker. We might unnecessarily convert some data
500 * after the end-of-input marker as long as it's valid for the
501 * encoding, but that's harmless.
502 */
503 convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
504 cstate->file_encoding,
505 GetDatabaseEncoding(),
506 src, srclen,
507 dst, dstlen,
508 true);
509 if (convertedlen == 0)
510 {
511 /*
512 * Could not convert anything. If there is no more raw input data
513 * coming, it means that there was an incomplete multi-byte
514 * sequence at the end. Also, if there is plenty of input left,
515 * we should be able to convert at least one character, so a
516 * failure to do so must mean that we've hit a byte sequence
517 * that's invalid.
518 */
519 if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
520 cstate->input_reached_error = true;
521 return;
522 }
523 cstate->raw_buf_index += convertedlen;
524 cstate->input_buf_len += strlen((char *) dst);
525 }
526 }
527
528 /*
529 * Report an encoding or conversion error.
530 */
531 static void
CopyConversionError(CopyFromState cstate)532 CopyConversionError(CopyFromState cstate)
533 {
534 Assert(cstate->raw_buf_len > 0);
535 Assert(cstate->input_reached_error);
536
537 if (!cstate->need_transcoding)
538 {
539 /*
540 * Everything up to input_buf_len was successfully verified, and
541 * input_buf_len points to the invalid or incomplete character.
542 */
543 report_invalid_encoding(cstate->file_encoding,
544 cstate->raw_buf + cstate->input_buf_len,
545 cstate->raw_buf_len - cstate->input_buf_len);
546 }
547 else
548 {
549 /*
550 * raw_buf_index points to the invalid or untranslatable character. We
551 * let the conversion routine report the error, because it can provide
552 * a more specific error message than we could here. An earlier call
553 * to the conversion routine in CopyConvertBuf() detected that there
554 * is an error, now we call the conversion routine again with
555 * noError=false, to have it throw the error.
556 */
557 unsigned char *src;
558 int srclen;
559 unsigned char *dst;
560 int dstlen;
561
562 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
563 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
564 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
565 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
566
567 (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
568 cstate->file_encoding,
569 GetDatabaseEncoding(),
570 src, srclen,
571 dst, dstlen,
572 false);
573
574 /*
575 * The conversion routine should have reported an error, so this
576 * should not be reached.
577 */
578 elog(ERROR, "encoding conversion failed without error");
579 }
580 }
581
582 /*
583 * Load more data from data source to raw_buf.
584 *
585 * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
586 * beginning of the buffer, and we load new data after that.
587 */
588 static void
CopyLoadRawBuf(CopyFromState cstate)589 CopyLoadRawBuf(CopyFromState cstate)
590 {
591 int nbytes;
592 int inbytes;
593
594 /*
595 * In text mode, if encoding conversion is not required, raw_buf and
596 * input_buf point to the same buffer. Their len/index better agree, too.
597 */
598 if (cstate->raw_buf == cstate->input_buf)
599 {
600 Assert(!cstate->need_transcoding);
601 Assert(cstate->raw_buf_index == cstate->input_buf_index);
602 Assert(cstate->input_buf_len <= cstate->raw_buf_len);
603 }
604
605 /*
606 * Copy down the unprocessed data if any.
607 */
608 nbytes = RAW_BUF_BYTES(cstate);
609 if (nbytes > 0 && cstate->raw_buf_index > 0)
610 memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
611 nbytes);
612 cstate->raw_buf_len -= cstate->raw_buf_index;
613 cstate->raw_buf_index = 0;
614
615 /*
616 * If raw_buf and input_buf are in fact the same buffer, adjust the
617 * input_buf variables, too.
618 */
619 if (cstate->raw_buf == cstate->input_buf)
620 {
621 cstate->input_buf_len -= cstate->input_buf_index;
622 cstate->input_buf_index = 0;
623 }
624
625 /* Load more data */
626 inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
627 1, RAW_BUF_SIZE - cstate->raw_buf_len);
628 nbytes += inbytes;
629 cstate->raw_buf[nbytes] = '\0';
630 cstate->raw_buf_len = nbytes;
631
632 cstate->bytes_processed += inbytes;
633 pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
634
635 if (inbytes == 0)
636 cstate->raw_reached_eof = true;
637 }
638
639 /*
640 * CopyLoadInputBuf loads some more data into input_buf
641 *
642 * On return, at least one more input character is loaded into
643 * input_buf, or input_reached_eof is set.
644 *
645 * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
646 * of the buffer and then we load more data after that.
647 */
648 static void
CopyLoadInputBuf(CopyFromState cstate)649 CopyLoadInputBuf(CopyFromState cstate)
650 {
651 int nbytes = INPUT_BUF_BYTES(cstate);
652
653 /*
654 * The caller has updated input_buf_index to indicate how much of the
655 * input has been consumed and isn't needed anymore. If input_buf is the
656 * same physical area as raw_buf, update raw_buf_index accordingly.
657 */
658 if (cstate->raw_buf == cstate->input_buf)
659 {
660 Assert(!cstate->need_transcoding);
661 Assert(cstate->input_buf_index >= cstate->raw_buf_index);
662 cstate->raw_buf_index = cstate->input_buf_index;
663 }
664
665 for (;;)
666 {
667 /* If we now have some unconverted data, try to convert it */
668 CopyConvertBuf(cstate);
669
670 /* If we now have some more input bytes ready, return them */
671 if (INPUT_BUF_BYTES(cstate) > nbytes)
672 return;
673
674 /*
675 * If we reached an invalid byte sequence, or we're at an incomplete
676 * multi-byte character but there is no more raw input data, report
677 * conversion error.
678 */
679 if (cstate->input_reached_error)
680 CopyConversionError(cstate);
681
682 /* no more input, and everything has been converted */
683 if (cstate->input_reached_eof)
684 break;
685
686 /* Try to load more raw data */
687 Assert(!cstate->raw_reached_eof);
688 CopyLoadRawBuf(cstate);
689 }
690 }
691
692 /*
693 * CopyReadBinaryData
694 *
695 * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
696 * and writes them to 'dest'. Returns the number of bytes read (which
697 * would be less than 'nbytes' only if we reach EOF).
698 */
699 static int
CopyReadBinaryData(CopyFromState cstate,char * dest,int nbytes)700 CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
701 {
702 int copied_bytes = 0;
703
704 if (RAW_BUF_BYTES(cstate) >= nbytes)
705 {
706 /* Enough bytes are present in the buffer. */
707 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
708 cstate->raw_buf_index += nbytes;
709 copied_bytes = nbytes;
710 }
711 else
712 {
713 /*
714 * Not enough bytes in the buffer, so must read from the file. Need
715 * to loop since 'nbytes' could be larger than the buffer size.
716 */
717 do
718 {
719 int copy_bytes;
720
721 /* Load more data if buffer is empty. */
722 if (RAW_BUF_BYTES(cstate) == 0)
723 {
724 CopyLoadRawBuf(cstate);
725 if (cstate->raw_reached_eof)
726 break; /* EOF */
727 }
728
729 /* Transfer some bytes. */
730 copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
731 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
732 cstate->raw_buf_index += copy_bytes;
733 dest += copy_bytes;
734 copied_bytes += copy_bytes;
735 } while (copied_bytes < nbytes);
736 }
737
738 return copied_bytes;
739 }
740
741 /*
742 * Read raw fields in the next line for COPY FROM in text or csv mode.
743 * Return false if no more lines.
744 *
745 * An internal temporary buffer is returned via 'fields'. It is valid until
746 * the next call of the function. Since the function returns all raw fields
747 * in the input file, 'nfields' could be different from the number of columns
748 * in the relation.
749 *
750 * NOTE: force_not_null option are not applied to the returned fields.
751 */
752 bool
NextCopyFromRawFields(CopyFromState cstate,char *** fields,int * nfields)753 NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
754 {
755 int fldct;
756 bool done;
757
758 /* only available for text or csv input */
759 Assert(!cstate->opts.binary);
760
761 /* on input just throw the header line away */
762 if (cstate->cur_lineno == 0 && cstate->opts.header_line)
763 {
764 cstate->cur_lineno++;
765 if (CopyReadLine(cstate))
766 return false; /* done */
767 }
768
769 cstate->cur_lineno++;
770
771 /* Actually read the line into memory here */
772 done = CopyReadLine(cstate);
773
774 /*
775 * EOF at start of line means we're done. If we see EOF after some
776 * characters, we act as though it was newline followed by EOF, ie,
777 * process the line and then exit loop on next iteration.
778 */
779 if (done && cstate->line_buf.len == 0)
780 return false;
781
782 /* Parse the line into de-escaped field values */
783 if (cstate->opts.csv_mode)
784 fldct = CopyReadAttributesCSV(cstate);
785 else
786 fldct = CopyReadAttributesText(cstate);
787
788 *fields = cstate->raw_fields;
789 *nfields = fldct;
790 return true;
791 }
792
793 /*
794 * Read next tuple from file for COPY FROM. Return false if no more tuples.
795 *
796 * 'econtext' is used to evaluate default expression for each columns not
797 * read from the file. It can be NULL when no default values are used, i.e.
798 * when all columns are read from the file.
799 *
800 * 'values' and 'nulls' arrays must be the same length as columns of the
801 * relation passed to BeginCopyFrom. This function fills the arrays.
802 */
803 bool
NextCopyFrom(CopyFromState cstate,ExprContext * econtext,Datum * values,bool * nulls)804 NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
805 Datum *values, bool *nulls)
806 {
807 TupleDesc tupDesc;
808 AttrNumber num_phys_attrs,
809 attr_count,
810 num_defaults = cstate->num_defaults;
811 FmgrInfo *in_functions = cstate->in_functions;
812 Oid *typioparams = cstate->typioparams;
813 int i;
814 int *defmap = cstate->defmap;
815 ExprState **defexprs = cstate->defexprs;
816
817 tupDesc = RelationGetDescr(cstate->rel);
818 num_phys_attrs = tupDesc->natts;
819 attr_count = list_length(cstate->attnumlist);
820
821 /* Initialize all values for row to NULL */
822 MemSet(values, 0, num_phys_attrs * sizeof(Datum));
823 MemSet(nulls, true, num_phys_attrs * sizeof(bool));
824
825 if (!cstate->opts.binary)
826 {
827 char **field_strings;
828 ListCell *cur;
829 int fldct;
830 int fieldno;
831 char *string;
832
833 /* read raw fields in the next line */
834 if (!NextCopyFromRawFields(cstate, &field_strings, &fldct))
835 return false;
836
837 /* check for overflowing fields */
838 if (attr_count > 0 && fldct > attr_count)
839 ereport(ERROR,
840 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
841 errmsg("extra data after last expected column")));
842
843 fieldno = 0;
844
845 /* Loop to read the user attributes on the line. */
846 foreach(cur, cstate->attnumlist)
847 {
848 int attnum = lfirst_int(cur);
849 int m = attnum - 1;
850 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
851
852 if (fieldno >= fldct)
853 ereport(ERROR,
854 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
855 errmsg("missing data for column \"%s\"",
856 NameStr(att->attname))));
857 string = field_strings[fieldno++];
858
859 if (cstate->convert_select_flags &&
860 !cstate->convert_select_flags[m])
861 {
862 /* ignore input field, leaving column as NULL */
863 continue;
864 }
865
866 if (cstate->opts.csv_mode)
867 {
868 if (string == NULL &&
869 cstate->opts.force_notnull_flags[m])
870 {
871 /*
872 * FORCE_NOT_NULL option is set and column is NULL -
873 * convert it to the NULL string.
874 */
875 string = cstate->opts.null_print;
876 }
877 else if (string != NULL && cstate->opts.force_null_flags[m]
878 && strcmp(string, cstate->opts.null_print) == 0)
879 {
880 /*
881 * FORCE_NULL option is set and column matches the NULL
882 * string. It must have been quoted, or otherwise the
883 * string would already have been set to NULL. Convert it
884 * to NULL as specified.
885 */
886 string = NULL;
887 }
888 }
889
890 cstate->cur_attname = NameStr(att->attname);
891 cstate->cur_attval = string;
892 values[m] = InputFunctionCall(&in_functions[m],
893 string,
894 typioparams[m],
895 att->atttypmod);
896 if (string != NULL)
897 nulls[m] = false;
898 cstate->cur_attname = NULL;
899 cstate->cur_attval = NULL;
900 }
901
902 Assert(fieldno == attr_count);
903 }
904 else
905 {
906 /* binary */
907 int16 fld_count;
908 ListCell *cur;
909
910 cstate->cur_lineno++;
911
912 if (!CopyGetInt16(cstate, &fld_count))
913 {
914 /* EOF detected (end of file, or protocol-level EOF) */
915 return false;
916 }
917
918 if (fld_count == -1)
919 {
920 /*
921 * Received EOF marker. Wait for the protocol-level EOF, and
922 * complain if it doesn't come immediately. In COPY FROM STDIN,
923 * this ensures that we correctly handle CopyFail, if client
924 * chooses to send that now. When copying from file, we could
925 * ignore the rest of the file like in text mode, but we choose to
926 * be consistent with the COPY FROM STDIN case.
927 */
928 char dummy;
929
930 if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
931 ereport(ERROR,
932 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
933 errmsg("received copy data after EOF marker")));
934 return false;
935 }
936
937 if (fld_count != attr_count)
938 ereport(ERROR,
939 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
940 errmsg("row field count is %d, expected %d",
941 (int) fld_count, attr_count)));
942
943 foreach(cur, cstate->attnumlist)
944 {
945 int attnum = lfirst_int(cur);
946 int m = attnum - 1;
947 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
948
949 cstate->cur_attname = NameStr(att->attname);
950 values[m] = CopyReadBinaryAttribute(cstate,
951 &in_functions[m],
952 typioparams[m],
953 att->atttypmod,
954 &nulls[m]);
955 cstate->cur_attname = NULL;
956 }
957 }
958
959 /*
960 * Now compute and insert any defaults available for the columns not
961 * provided by the input data. Anything not processed here or above will
962 * remain NULL.
963 */
964 for (i = 0; i < num_defaults; i++)
965 {
966 /*
967 * The caller must supply econtext and have switched into the
968 * per-tuple memory context in it.
969 */
970 Assert(econtext != NULL);
971 Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
972
973 values[defmap[i]] = ExecEvalExpr(defexprs[i], econtext,
974 &nulls[defmap[i]]);
975 }
976
977 return true;
978 }
979
980 /*
981 * Read the next input line and stash it in line_buf.
982 *
983 * Result is true if read was terminated by EOF, false if terminated
984 * by newline. The terminating newline or EOF marker is not included
985 * in the final value of line_buf.
986 */
987 static bool
CopyReadLine(CopyFromState cstate)988 CopyReadLine(CopyFromState cstate)
989 {
990 bool result;
991
992 resetStringInfo(&cstate->line_buf);
993 cstate->line_buf_valid = false;
994
995 /* Parse data and transfer into line_buf */
996 result = CopyReadLineText(cstate);
997
998 if (result)
999 {
1000 /*
1001 * Reached EOF. In protocol version 3, we should ignore anything
1002 * after \. up to the protocol end of copy data. (XXX maybe better
1003 * not to treat \. as special?)
1004 */
1005 if (cstate->copy_src == COPY_FRONTEND)
1006 {
1007 int inbytes;
1008
1009 do
1010 {
1011 inbytes = CopyGetData(cstate, cstate->input_buf,
1012 1, INPUT_BUF_SIZE);
1013 } while (inbytes > 0);
1014 cstate->input_buf_index = 0;
1015 cstate->input_buf_len = 0;
1016 cstate->raw_buf_index = 0;
1017 cstate->raw_buf_len = 0;
1018 }
1019 }
1020 else
1021 {
1022 /*
1023 * If we didn't hit EOF, then we must have transferred the EOL marker
1024 * to line_buf along with the data. Get rid of it.
1025 */
1026 switch (cstate->eol_type)
1027 {
1028 case EOL_NL:
1029 Assert(cstate->line_buf.len >= 1);
1030 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1031 cstate->line_buf.len--;
1032 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1033 break;
1034 case EOL_CR:
1035 Assert(cstate->line_buf.len >= 1);
1036 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1037 cstate->line_buf.len--;
1038 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1039 break;
1040 case EOL_CRNL:
1041 Assert(cstate->line_buf.len >= 2);
1042 Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1043 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1044 cstate->line_buf.len -= 2;
1045 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1046 break;
1047 case EOL_UNKNOWN:
1048 /* shouldn't get here */
1049 Assert(false);
1050 break;
1051 }
1052 }
1053
1054 /* Now it's safe to use the buffer in error messages */
1055 cstate->line_buf_valid = true;
1056
1057 return result;
1058 }
1059
/*
 * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Scans input_buf for the end of the next input line.  Despite the name,
 * this handles both the text and CSV formats; CSV-specific behavior is
 * selected by cstate->opts.csv_mode.
 *
 * Returns true if the scan ended at EOF (either no input bytes were left
 * after a reload, or an end-of-copy marker \. was recognized), and false if
 * it ended at a line terminator.
 *
 * Side effects visible in this function: cstate->eol_type is set the first
 * time a line terminator is seen, cstate->cur_lineno is bumped for newlines
 * embedded in quoted CSV fields, and cstate->input_buf_index is advanced
 * past an end-of-copy marker.  Mismatched newline styles and corrupt
 * end-of-copy markers are reported with ereport(ERROR).
 *
 * NOTE(review): REFILL_LINEBUF, IF_NEED_REFILL_AND_NOT_EOF_CONTINUE,
 * IF_NEED_REFILL_AND_EOF_BREAK and NO_END_OF_COPY_GOTO are macros defined
 * outside this function.  They presumably refer to the local variables
 * (input_buf_ptr, copy_buf_len, need_data, hit_eof, result) and to the
 * not_end_of_copy label by name, so those must not be renamed -- confirm
 * against the macro definitions before changing anything here.
 */
static bool
CopyReadLineText(CopyFromState cstate)
{
	char	   *copy_input_buf;
	int			input_buf_ptr;
	int			copy_buf_len;
	bool		need_data = false;	/* forces a reload at the loop top */
	bool		hit_eof = false;	/* copy of cstate->input_reached_eof */
	bool		result = false;

	/* CSV variables */
	bool		first_char_in_line = true;
	bool		in_quote = false,
				last_was_esc = false;
	char		quotec = '\0';
	char		escapec = '\0';

	if (cstate->opts.csv_mode)
	{
		quotec = cstate->opts.quote[0];
		escapec = cstate->opts.escape[0];
		/* ignore special escape processing if it's the same as quotec */
		if (quotec == escapec)
			escapec = '\0';
	}

	/*
	 * The objective of this loop is to transfer the entire next input line
	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
	 * \n) and the end-of-copy marker (\.).
	 *
	 * In CSV mode, \r and \n inside a quoted field are just part of the data
	 * value and are put in line_buf.  We keep just enough state to know if we
	 * are currently in a quoted field or not.
	 *
	 * These four characters, and the CSV escape and quote characters, are
	 * assumed the same in frontend and backend encodings.
	 *
	 * The input has already been converted to the database encoding.  All
	 * supported server encodings have the property that all bytes in a
	 * multi-byte sequence have the high bit set, so a multibyte character
	 * cannot contain any newline or escape characters embedded in the
	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
	 * regardless of the encoding.
	 *
	 * For speed, we try to move data from input_buf to line_buf in chunks
	 * rather than one character at a time.  input_buf_ptr points to the next
	 * character to examine; any characters from input_buf_index to
	 * input_buf_ptr have been determined to be part of the line, but not yet
	 * transferred to line_buf.
	 *
	 * For a little extra speed within the loop, we copy input_buf and
	 * input_buf_len into local variables.
	 */
	copy_input_buf = cstate->input_buf;
	input_buf_ptr = cstate->input_buf_index;
	copy_buf_len = cstate->input_buf_len;

	for (;;)
	{
		int			prev_raw_ptr;
		char		c;

		/*
		 * Load more data if needed.  Ideally we would just force four bytes
		 * of read-ahead and avoid the many calls to
		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
		 * does not allow us to read too far ahead or we might read into the
		 * next data, so we read-ahead only as far as we know we can.  One
		 * optimization would be to read-ahead four bytes here if
		 * cstate->copy_src != COPY_OLD_FE, but it hardly seems worth it,
		 * considering the size of the buffer.
		 */
		if (input_buf_ptr >= copy_buf_len || need_data)
		{
			REFILL_LINEBUF;

			CopyLoadInputBuf(cstate);
			/* update our local variables */
			hit_eof = cstate->input_reached_eof;
			input_buf_ptr = cstate->input_buf_index;
			copy_buf_len = cstate->input_buf_len;

			/*
			 * If we are completely out of data, break out of the loop,
			 * reporting EOF.
			 */
			if (INPUT_BUF_BYTES(cstate) <= 0)
			{
				result = true;
				break;
			}
			need_data = false;
		}

		/*
		 * OK to fetch a character.  prev_raw_ptr remembers where it started,
		 * so that an end-of-copy marker beginning here can be left out of
		 * line_buf.
		 */
		prev_raw_ptr = input_buf_ptr;
		c = copy_input_buf[input_buf_ptr++];

		if (cstate->opts.csv_mode)
		{
			/*
			 * If character is '\\' or '\r', we may need to look ahead below.
			 * Force fetch of the next character if we don't already have it.
			 * We need to do this before changing CSV state, in case one of
			 * these characters is also the quote or escape character.
			 *
			 * Note: old-protocol does not like forced prefetch, but it's OK
			 * here since we cannot validly be at EOF.
			 */
			if (c == '\\' || c == '\r')
			{
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			}

			/*
			 * Dealing with quotes and escapes here is mildly tricky. If the
			 * quote char is also the escape char, there's no problem - we
			 * just use the char as a toggle. If they are different, we need
			 * to ensure that we only take account of an escape inside a
			 * quoted field and immediately preceding a quote char, and not
			 * the second in an escape-escape sequence.
			 */
			if (in_quote && c == escapec)
				last_was_esc = !last_was_esc;
			if (c == quotec && !last_was_esc)
				in_quote = !in_quote;
			if (c != escapec)
				last_was_esc = false;

			/*
			 * Updating the line count for embedded CR and/or LF chars is
			 * necessarily a little fragile - this test is probably about the
			 * best we can do.  (XXX it's arguable whether we should do this
			 * at all --- is cur_lineno a physical or logical count?)
			 */
			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
				cstate->cur_lineno++;
		}

		/* Process \r */
		if (c == '\r' && (!cstate->opts.csv_mode || !in_quote))
		{
			/* Check for \r\n on first line, _and_ handle \r\n. */
			if (cstate->eol_type == EOL_UNKNOWN ||
				cstate->eol_type == EOL_CRNL)
			{
				/*
				 * If need more data, go back to loop top to load it.
				 *
				 * Note that if we are at EOF, c will wind up as '\0' because
				 * of the guaranteed pad of input_buf.
				 */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);

				/* get next char */
				c = copy_input_buf[input_buf_ptr];

				if (c == '\n')
				{
					input_buf_ptr++;	/* eat newline */
					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
				}
				else
				{
					/* found \r, but no \n */
					if (cstate->eol_type == EOL_CRNL)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 !cstate->opts.csv_mode ?
								 errmsg("literal carriage return found in data") :
								 errmsg("unquoted carriage return found in data"),
								 !cstate->opts.csv_mode ?
								 errhint("Use \"\\r\" to represent carriage return.") :
								 errhint("Use quoted CSV field to represent carriage return.")));

					/*
					 * if we got here, it is the first line and we didn't find
					 * \n, so don't consume the peeked character
					 */
					cstate->eol_type = EOL_CR;
				}
			}
			else if (cstate->eol_type == EOL_NL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !cstate->opts.csv_mode ?
						 errmsg("literal carriage return found in data") :
						 errmsg("unquoted carriage return found in data"),
						 !cstate->opts.csv_mode ?
						 errhint("Use \"\\r\" to represent carriage return.") :
						 errhint("Use quoted CSV field to represent carriage return.")));
			/* If reach here, we have found the line terminator */
			break;
		}

		/* Process \n */
		if (c == '\n' && (!cstate->opts.csv_mode || !in_quote))
		{
			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !cstate->opts.csv_mode ?
						 errmsg("literal newline found in data") :
						 errmsg("unquoted newline found in data"),
						 !cstate->opts.csv_mode ?
						 errhint("Use \"\\n\" to represent newline.") :
						 errhint("Use quoted CSV field to represent newline.")));
			cstate->eol_type = EOL_NL;	/* in case not set yet */
			/* If reach here, we have found the line terminator */
			break;
		}

		/*
		 * In CSV mode, we only recognize \. alone on a line.  This is because
		 * \. is a valid CSV data value.
		 */
		if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line))
		{
			char		c2;

			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			IF_NEED_REFILL_AND_EOF_BREAK(0);

			/* -----
			 * get next character
			 * Note: we do not change c so if it isn't \., we can fall
			 * through and continue processing.
			 * -----
			 */
			c2 = copy_input_buf[input_buf_ptr];

			if (c2 == '.')
			{
				input_buf_ptr++;	/* consume the '.' */

				/*
				 * Note: if we loop back for more data here, it does not
				 * matter that the CSV state change checks are re-executed; we
				 * will come back here with no important state changed.
				 */
				if (cstate->eol_type == EOL_CRNL)
				{
					/* Get the next character */
					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
					/* if hit_eof, c2 will become '\0' */
					c2 = copy_input_buf[input_buf_ptr++];

					if (c2 == '\n')
					{
						if (!cstate->opts.csv_mode)
							ereport(ERROR,
									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
									 errmsg("end-of-copy marker does not match previous newline style")));
						else
							NO_END_OF_COPY_GOTO;
					}
					else if (c2 != '\r')
					{
						if (!cstate->opts.csv_mode)
							ereport(ERROR,
									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
									 errmsg("end-of-copy marker corrupt")));
						else
							NO_END_OF_COPY_GOTO;
					}
				}

				/* Get the next character */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
				/* if hit_eof, c2 will become '\0' */
				c2 = copy_input_buf[input_buf_ptr++];

				if (c2 != '\r' && c2 != '\n')
				{
					if (!cstate->opts.csv_mode)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker corrupt")));
					else
						NO_END_OF_COPY_GOTO;
				}

				/* The marker's terminator must match the established style */
				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
					(cstate->eol_type == EOL_CR && c2 != '\r'))
				{
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker does not match previous newline style")));
				}

				/*
				 * Transfer only the data before the \. into line_buf, then
				 * discard the data and the \. sequence.
				 */
				if (prev_raw_ptr > cstate->input_buf_index)
					appendBinaryStringInfo(&cstate->line_buf,
										   cstate->input_buf + cstate->input_buf_index,
										   prev_raw_ptr - cstate->input_buf_index);
				cstate->input_buf_index = input_buf_ptr;
				result = true;	/* report EOF */
				break;
			}
			else if (!cstate->opts.csv_mode)
			{
				/*
				 * If we are here, it means we found a backslash followed by
				 * something other than a period.  In non-CSV mode, anything
				 * after a backslash is special, so we skip over that second
				 * character too.  If we didn't do that \\. would be
				 * considered an end-of-copy marker, while in non-CSV mode it
				 * is a literal backslash followed by a period.  In CSV mode,
				 * backslashes are not special, so we want to process the
				 * character after the backslash just like a normal character,
				 * so we don't increment in those cases.
				 */
				input_buf_ptr++;
			}
		}

		/*
		 * This label is for CSV cases where \. appears at the start of a
		 * line, but there is more text after it, meaning it was a data value.
		 * We are more strict for \. in CSV mode because \. could be a data
		 * value, while in non-CSV mode, \. cannot be a data value.
		 */
not_end_of_copy:
		first_char_in_line = false;
	}							/* end of outer loop */

	/*
	 * Transfer any still-uncopied data to line_buf.
	 */
	REFILL_LINEBUF;

	return result;
}
1401
/*
 * GetDecimalFromHex
 *		Map one hexadecimal digit character to its numeric value (0..15).
 *
 * Decimal digits map to 0..9; any other character is treated as a letter,
 * case-insensitively, with 'a' mapping to 10.  No validation is done here:
 * the callers in CopyReadAttributesText check isxdigit() before calling.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char uch = (unsigned char) hex;

	return isdigit(uch) ? (hex - '0') : (tolower(uch) - 'a' + 10);
}
1413
1414 /*
1415 * Parse the current line into separate attributes (fields),
1416 * performing de-escaping as needed.
1417 *
1418 * The input is in line_buf. We use attribute_buf to hold the result
1419 * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1420 * string, or NULL when the input matches the null marker string.
1421 * This array is expanded as necessary.
1422 *
1423 * (Note that the caller cannot check for nulls since the returned
1424 * string would be the post-de-escaping equivalent, which may look
1425 * the same as some valid data string.)
1426 *
1427 * delim is the column delimiter string (must be just one byte for now).
1428 * null_print is the null marker string. Note that this is compared to
1429 * the pre-de-escaped input string.
1430 *
1431 * The return value is the number of fields actually read.
1432 */
1433 static int
CopyReadAttributesText(CopyFromState cstate)1434 CopyReadAttributesText(CopyFromState cstate)
1435 {
1436 char delimc = cstate->opts.delim[0];
1437 int fieldno;
1438 char *output_ptr;
1439 char *cur_ptr;
1440 char *line_end_ptr;
1441
1442 /*
1443 * We need a special case for zero-column tables: check that the input
1444 * line is empty, and return.
1445 */
1446 if (cstate->max_fields <= 0)
1447 {
1448 if (cstate->line_buf.len != 0)
1449 ereport(ERROR,
1450 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1451 errmsg("extra data after last expected column")));
1452 return 0;
1453 }
1454
1455 resetStringInfo(&cstate->attribute_buf);
1456
1457 /*
1458 * The de-escaped attributes will certainly not be longer than the input
1459 * data line, so we can just force attribute_buf to be large enough and
1460 * then transfer data without any checks for enough space. We need to do
1461 * it this way because enlarging attribute_buf mid-stream would invalidate
1462 * pointers already stored into cstate->raw_fields[].
1463 */
1464 if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1465 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1466 output_ptr = cstate->attribute_buf.data;
1467
1468 /* set pointer variables for loop */
1469 cur_ptr = cstate->line_buf.data;
1470 line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1471
1472 /* Outer loop iterates over fields */
1473 fieldno = 0;
1474 for (;;)
1475 {
1476 bool found_delim = false;
1477 char *start_ptr;
1478 char *end_ptr;
1479 int input_len;
1480 bool saw_non_ascii = false;
1481
1482 /* Make sure there is enough space for the next value */
1483 if (fieldno >= cstate->max_fields)
1484 {
1485 cstate->max_fields *= 2;
1486 cstate->raw_fields =
1487 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1488 }
1489
1490 /* Remember start of field on both input and output sides */
1491 start_ptr = cur_ptr;
1492 cstate->raw_fields[fieldno] = output_ptr;
1493
1494 /*
1495 * Scan data for field.
1496 *
1497 * Note that in this loop, we are scanning to locate the end of field
1498 * and also speculatively performing de-escaping. Once we find the
1499 * end-of-field, we can match the raw field contents against the null
1500 * marker string. Only after that comparison fails do we know that
1501 * de-escaping is actually the right thing to do; therefore we *must
1502 * not* throw any syntax errors before we've done the null-marker
1503 * check.
1504 */
1505 for (;;)
1506 {
1507 char c;
1508
1509 end_ptr = cur_ptr;
1510 if (cur_ptr >= line_end_ptr)
1511 break;
1512 c = *cur_ptr++;
1513 if (c == delimc)
1514 {
1515 found_delim = true;
1516 break;
1517 }
1518 if (c == '\\')
1519 {
1520 if (cur_ptr >= line_end_ptr)
1521 break;
1522 c = *cur_ptr++;
1523 switch (c)
1524 {
1525 case '0':
1526 case '1':
1527 case '2':
1528 case '3':
1529 case '4':
1530 case '5':
1531 case '6':
1532 case '7':
1533 {
1534 /* handle \013 */
1535 int val;
1536
1537 val = OCTVALUE(c);
1538 if (cur_ptr < line_end_ptr)
1539 {
1540 c = *cur_ptr;
1541 if (ISOCTAL(c))
1542 {
1543 cur_ptr++;
1544 val = (val << 3) + OCTVALUE(c);
1545 if (cur_ptr < line_end_ptr)
1546 {
1547 c = *cur_ptr;
1548 if (ISOCTAL(c))
1549 {
1550 cur_ptr++;
1551 val = (val << 3) + OCTVALUE(c);
1552 }
1553 }
1554 }
1555 }
1556 c = val & 0377;
1557 if (c == '\0' || IS_HIGHBIT_SET(c))
1558 saw_non_ascii = true;
1559 }
1560 break;
1561 case 'x':
1562 /* Handle \x3F */
1563 if (cur_ptr < line_end_ptr)
1564 {
1565 char hexchar = *cur_ptr;
1566
1567 if (isxdigit((unsigned char) hexchar))
1568 {
1569 int val = GetDecimalFromHex(hexchar);
1570
1571 cur_ptr++;
1572 if (cur_ptr < line_end_ptr)
1573 {
1574 hexchar = *cur_ptr;
1575 if (isxdigit((unsigned char) hexchar))
1576 {
1577 cur_ptr++;
1578 val = (val << 4) + GetDecimalFromHex(hexchar);
1579 }
1580 }
1581 c = val & 0xff;
1582 if (c == '\0' || IS_HIGHBIT_SET(c))
1583 saw_non_ascii = true;
1584 }
1585 }
1586 break;
1587 case 'b':
1588 c = '\b';
1589 break;
1590 case 'f':
1591 c = '\f';
1592 break;
1593 case 'n':
1594 c = '\n';
1595 break;
1596 case 'r':
1597 c = '\r';
1598 break;
1599 case 't':
1600 c = '\t';
1601 break;
1602 case 'v':
1603 c = '\v';
1604 break;
1605
1606 /*
1607 * in all other cases, take the char after '\'
1608 * literally
1609 */
1610 }
1611 }
1612
1613 /* Add c to output string */
1614 *output_ptr++ = c;
1615 }
1616
1617 /* Check whether raw input matched null marker */
1618 input_len = end_ptr - start_ptr;
1619 if (input_len == cstate->opts.null_print_len &&
1620 strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1621 cstate->raw_fields[fieldno] = NULL;
1622 else
1623 {
1624 /*
1625 * At this point we know the field is supposed to contain data.
1626 *
1627 * If we de-escaped any non-7-bit-ASCII chars, make sure the
1628 * resulting string is valid data for the db encoding.
1629 */
1630 if (saw_non_ascii)
1631 {
1632 char *fld = cstate->raw_fields[fieldno];
1633
1634 pg_verifymbstr(fld, output_ptr - fld, false);
1635 }
1636 }
1637
1638 /* Terminate attribute value in output area */
1639 *output_ptr++ = '\0';
1640
1641 fieldno++;
1642 /* Done if we hit EOL instead of a delim */
1643 if (!found_delim)
1644 break;
1645 }
1646
1647 /* Clean up state of attribute_buf */
1648 output_ptr--;
1649 Assert(*output_ptr == '\0');
1650 cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1651
1652 return fieldno;
1653 }
1654
1655 /*
1656 * Parse the current line into separate attributes (fields),
1657 * performing de-escaping as needed. This has exactly the same API as
1658 * CopyReadAttributesText, except we parse the fields according to
1659 * "standard" (i.e. common) CSV usage.
1660 */
1661 static int
CopyReadAttributesCSV(CopyFromState cstate)1662 CopyReadAttributesCSV(CopyFromState cstate)
1663 {
1664 char delimc = cstate->opts.delim[0];
1665 char quotec = cstate->opts.quote[0];
1666 char escapec = cstate->opts.escape[0];
1667 int fieldno;
1668 char *output_ptr;
1669 char *cur_ptr;
1670 char *line_end_ptr;
1671
1672 /*
1673 * We need a special case for zero-column tables: check that the input
1674 * line is empty, and return.
1675 */
1676 if (cstate->max_fields <= 0)
1677 {
1678 if (cstate->line_buf.len != 0)
1679 ereport(ERROR,
1680 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1681 errmsg("extra data after last expected column")));
1682 return 0;
1683 }
1684
1685 resetStringInfo(&cstate->attribute_buf);
1686
1687 /*
1688 * The de-escaped attributes will certainly not be longer than the input
1689 * data line, so we can just force attribute_buf to be large enough and
1690 * then transfer data without any checks for enough space. We need to do
1691 * it this way because enlarging attribute_buf mid-stream would invalidate
1692 * pointers already stored into cstate->raw_fields[].
1693 */
1694 if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1695 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1696 output_ptr = cstate->attribute_buf.data;
1697
1698 /* set pointer variables for loop */
1699 cur_ptr = cstate->line_buf.data;
1700 line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1701
1702 /* Outer loop iterates over fields */
1703 fieldno = 0;
1704 for (;;)
1705 {
1706 bool found_delim = false;
1707 bool saw_quote = false;
1708 char *start_ptr;
1709 char *end_ptr;
1710 int input_len;
1711
1712 /* Make sure there is enough space for the next value */
1713 if (fieldno >= cstate->max_fields)
1714 {
1715 cstate->max_fields *= 2;
1716 cstate->raw_fields =
1717 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1718 }
1719
1720 /* Remember start of field on both input and output sides */
1721 start_ptr = cur_ptr;
1722 cstate->raw_fields[fieldno] = output_ptr;
1723
1724 /*
1725 * Scan data for field,
1726 *
1727 * The loop starts in "not quote" mode and then toggles between that
1728 * and "in quote" mode. The loop exits normally if it is in "not
1729 * quote" mode and a delimiter or line end is seen.
1730 */
1731 for (;;)
1732 {
1733 char c;
1734
1735 /* Not in quote */
1736 for (;;)
1737 {
1738 end_ptr = cur_ptr;
1739 if (cur_ptr >= line_end_ptr)
1740 goto endfield;
1741 c = *cur_ptr++;
1742 /* unquoted field delimiter */
1743 if (c == delimc)
1744 {
1745 found_delim = true;
1746 goto endfield;
1747 }
1748 /* start of quoted field (or part of field) */
1749 if (c == quotec)
1750 {
1751 saw_quote = true;
1752 break;
1753 }
1754 /* Add c to output string */
1755 *output_ptr++ = c;
1756 }
1757
1758 /* In quote */
1759 for (;;)
1760 {
1761 end_ptr = cur_ptr;
1762 if (cur_ptr >= line_end_ptr)
1763 ereport(ERROR,
1764 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1765 errmsg("unterminated CSV quoted field")));
1766
1767 c = *cur_ptr++;
1768
1769 /* escape within a quoted field */
1770 if (c == escapec)
1771 {
1772 /*
1773 * peek at the next char if available, and escape it if it
1774 * is an escape char or a quote char
1775 */
1776 if (cur_ptr < line_end_ptr)
1777 {
1778 char nextc = *cur_ptr;
1779
1780 if (nextc == escapec || nextc == quotec)
1781 {
1782 *output_ptr++ = nextc;
1783 cur_ptr++;
1784 continue;
1785 }
1786 }
1787 }
1788
1789 /*
1790 * end of quoted field. Must do this test after testing for
1791 * escape in case quote char and escape char are the same
1792 * (which is the common case).
1793 */
1794 if (c == quotec)
1795 break;
1796
1797 /* Add c to output string */
1798 *output_ptr++ = c;
1799 }
1800 }
1801 endfield:
1802
1803 /* Terminate attribute value in output area */
1804 *output_ptr++ = '\0';
1805
1806 /* Check whether raw input matched null marker */
1807 input_len = end_ptr - start_ptr;
1808 if (!saw_quote && input_len == cstate->opts.null_print_len &&
1809 strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1810 cstate->raw_fields[fieldno] = NULL;
1811
1812 fieldno++;
1813 /* Done if we hit EOL instead of a delim */
1814 if (!found_delim)
1815 break;
1816 }
1817
1818 /* Clean up state of attribute_buf */
1819 output_ptr--;
1820 Assert(*output_ptr == '\0');
1821 cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1822
1823 return fieldno;
1824 }
1825
1826
1827 /*
1828 * Read a binary attribute
1829 */
1830 static Datum
CopyReadBinaryAttribute(CopyFromState cstate,FmgrInfo * flinfo,Oid typioparam,int32 typmod,bool * isnull)1831 CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
1832 Oid typioparam, int32 typmod,
1833 bool *isnull)
1834 {
1835 int32 fld_size;
1836 Datum result;
1837
1838 if (!CopyGetInt32(cstate, &fld_size))
1839 ereport(ERROR,
1840 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1841 errmsg("unexpected EOF in COPY data")));
1842 if (fld_size == -1)
1843 {
1844 *isnull = true;
1845 return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
1846 }
1847 if (fld_size < 0)
1848 ereport(ERROR,
1849 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1850 errmsg("invalid field size")));
1851
1852 /* reset attribute_buf to empty, and load raw data in it */
1853 resetStringInfo(&cstate->attribute_buf);
1854
1855 enlargeStringInfo(&cstate->attribute_buf, fld_size);
1856 if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
1857 fld_size) != fld_size)
1858 ereport(ERROR,
1859 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1860 errmsg("unexpected EOF in COPY data")));
1861
1862 cstate->attribute_buf.len = fld_size;
1863 cstate->attribute_buf.data[fld_size] = '\0';
1864
1865 /* Call the column type's binary input converter */
1866 result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
1867 typioparam, typmod);
1868
1869 /* Trouble if it didn't eat the whole buffer */
1870 if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
1871 ereport(ERROR,
1872 (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
1873 errmsg("incorrect binary data format")));
1874
1875 *isnull = false;
1876 return result;
1877 }
1878