1 /*-------------------------------------------------------------------------
2 *
3 * varlena.c
4 * Functions for the variable-length built-in types.
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/varlena.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <stdbool.h>
18 #include <ctype.h>
19 #include <limits.h>
20
21 #include "access/hash.h"
22 #include "access/tuptoaster.h"
23 #include "catalog/pg_collation.h"
24 #include "catalog/pg_type.h"
25 #include "lib/hyperloglog.h"
26 #include "libpq/md5.h"
27 #include "libpq/pqformat.h"
28 #include "miscadmin.h"
29 #include "parser/scansup.h"
30 #include "port/pg_bswap.h"
31 #include "regex/regex.h"
32 #include "utils/builtins.h"
33 #include "utils/bytea.h"
34 #include "utils/lsyscache.h"
35 #include "utils/memutils.h"
36 #include "utils/pg_locale.h"
37 #include "utils/sortsupport.h"
38
39
40 /* GUC variable */
41 int bytea_output = BYTEA_OUTPUT_HEX;
42
43 typedef struct varlena unknown;
44 typedef struct varlena VarString;
45
46 typedef struct
47 {
48 bool use_wchar; /* T if multibyte encoding */
49 char *str1; /* use these if not use_wchar */
50 char *str2; /* note: these point to original texts */
51 pg_wchar *wstr1; /* use these if use_wchar */
52 pg_wchar *wstr2; /* note: these are palloc'd */
53 int len1; /* string lengths in logical characters */
54 int len2;
55 /* Skip table for Boyer-Moore-Horspool search algorithm: */
56 int skiptablemask; /* mask for ANDing with skiptable subscripts */
57 int skiptable[256]; /* skip distance for given mismatched char */
58 } TextPositionState;
59
60 typedef struct
61 {
62 char *buf1; /* 1st string, or abbreviation original string
63 * buf */
64 char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
65 int buflen1;
66 int buflen2;
67 int last_len1; /* Length of last buf1 string/strxfrm() input */
68 int last_len2; /* Length of last buf2 string/strxfrm() blob */
69 int last_returned; /* Last comparison result (cache) */
70 bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
71 bool collate_c;
72 bool bpchar; /* Sorting pbchar, not varchar/text/bytea? */
73 hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
74 hyperLogLogState full_card; /* Full key cardinality state */
75 double prop_card; /* Required cardinality proportion */
76 #ifdef HAVE_LOCALE_T
77 pg_locale_t locale;
78 #endif
79 } VarStringSortSupport;
80
/*
 * Size of on-stack string buffers: big enough that most strings fit, yet
 * small enough that we are comfortable placing it on the stack.
 */
#define TEXTBUFLEN      1024

/* Datum access and fmgr argument/result macros for type "unknown" */
#define DatumGetUnknownP(datum)         ((unknown *) PG_DETOAST_DATUM(datum))
#define DatumGetUnknownPCopy(datum)     ((unknown *) PG_DETOAST_DATUM_COPY(datum))
#define PG_GETARG_UNKNOWN_P(argno)      DatumGetUnknownP(PG_GETARG_DATUM(argno))
#define PG_GETARG_UNKNOWN_P_COPY(argno) DatumGetUnknownPCopy(PG_GETARG_DATUM(argno))
#define PG_RETURN_UNKNOWN_P(ptr)        PG_RETURN_POINTER(ptr)

/* Datum access macros for VarString, fully detoasted or still packed */
#define DatumGetVarStringP(datum)       ((VarString *) PG_DETOAST_DATUM(datum))
#define DatumGetVarStringPP(datum)      ((VarString *) PG_DETOAST_DATUM_PACKED(datum))
95
96 static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
97 static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
98 static int varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup);
99 static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
100 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
101 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
102 static int32 text_length(Datum str);
103 static text *text_catenate(text *t1, text *t2);
104 static text *text_substring(Datum str,
105 int32 start,
106 int32 length,
107 bool length_not_specified);
108 static text *text_overlay(text *t1, text *t2, int sp, int sl);
109 static int text_position(text *t1, text *t2);
110 static void text_position_setup(text *t1, text *t2, TextPositionState *state);
111 static int text_position_next(int start_pos, TextPositionState *state);
112 static void text_position_cleanup(TextPositionState *state);
113 static int text_cmp(text *arg1, text *arg2, Oid collid);
114 static bytea *bytea_catenate(bytea *t1, bytea *t2);
115 static bytea *bytea_substring(Datum str,
116 int S,
117 int L,
118 bool length_not_specified);
119 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
120 static void appendStringInfoText(StringInfo str, const text *t);
121 static Datum text_to_array_internal(PG_FUNCTION_ARGS);
122 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
123 const char *fldsep, const char *null_string);
124 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
125 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
126 int *value);
127 static const char *text_format_parse_format(const char *start_ptr,
128 const char *end_ptr,
129 int *argpos, int *widthpos,
130 int *flags, int *width);
131 static void text_format_string_conversion(StringInfo buf, char conversion,
132 FmgrInfo *typOutputInfo,
133 Datum value, bool isNull,
134 int flags, int width);
135 static void text_format_append_string(StringInfo buf, const char *str,
136 int flags, int width);
137
138
139 /*****************************************************************************
140 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
141 *****************************************************************************/
142
143 /*
144 * cstring_to_text
145 *
146 * Create a text value from a null-terminated C string.
147 *
148 * The new text value is freshly palloc'd with a full-size VARHDR.
149 */
150 text *
cstring_to_text(const char * s)151 cstring_to_text(const char *s)
152 {
153 return cstring_to_text_with_len(s, strlen(s));
154 }
155
156 /*
157 * cstring_to_text_with_len
158 *
159 * Same as cstring_to_text except the caller specifies the string length;
160 * the string need not be null_terminated.
161 */
162 text *
cstring_to_text_with_len(const char * s,int len)163 cstring_to_text_with_len(const char *s, int len)
164 {
165 text *result = (text *) palloc(len + VARHDRSZ);
166
167 SET_VARSIZE(result, len + VARHDRSZ);
168 memcpy(VARDATA(result), s, len);
169
170 return result;
171 }
172
173 /*
174 * text_to_cstring
175 *
176 * Create a palloc'd, null-terminated C string from a text value.
177 *
178 * We support being passed a compressed or toasted text value.
179 * This is a bit bogus since such values shouldn't really be referred to as
180 * "text *", but it seems useful for robustness. If we didn't handle that
181 * case here, we'd need another routine that did, anyway.
182 */
183 char *
text_to_cstring(const text * t)184 text_to_cstring(const text *t)
185 {
186 /* must cast away the const, unfortunately */
187 text *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
188 int len = VARSIZE_ANY_EXHDR(tunpacked);
189 char *result;
190
191 result = (char *) palloc(len + 1);
192 memcpy(result, VARDATA_ANY(tunpacked), len);
193 result[len] = '\0';
194
195 if (tunpacked != t)
196 pfree(tunpacked);
197
198 return result;
199 }
200
201 /*
202 * text_to_cstring_buffer
203 *
204 * Copy a text value into a caller-supplied buffer of size dst_len.
205 *
206 * The text string is truncated if necessary to fit. The result is
207 * guaranteed null-terminated (unless dst_len == 0).
208 *
209 * We support being passed a compressed or toasted text value.
210 * This is a bit bogus since such values shouldn't really be referred to as
211 * "text *", but it seems useful for robustness. If we didn't handle that
212 * case here, we'd need another routine that did, anyway.
213 */
214 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)215 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
216 {
217 /* must cast away the const, unfortunately */
218 text *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
219 size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
220
221 if (dst_len > 0)
222 {
223 dst_len--;
224 if (dst_len >= src_len)
225 dst_len = src_len;
226 else /* ensure truncation is encoding-safe */
227 dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
228 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
229 dst[dst_len] = '\0';
230 }
231
232 if (srcunpacked != src)
233 pfree(srcunpacked);
234 }
235
236
237 /*****************************************************************************
238 * USER I/O ROUTINES *
239 *****************************************************************************/
240
241
/* Convert between an ASCII digit character and its numeric value */
#define VAL(ch)     ((ch) - '0')
#define DIG(num)    ((num) + '0')
244
245 /*
246 * byteain - converts from printable representation of byte array
247 *
248 * Non-printable characters must be passed as '\nnn' (octal) and are
249 * converted to internal form. '\' must be passed as '\\'.
250 * ereport(ERROR, ...) if bad form.
251 *
252 * BUGS:
253 * The input is scanned twice.
254 * The error checking of input is minimal.
255 */
256 Datum
byteain(PG_FUNCTION_ARGS)257 byteain(PG_FUNCTION_ARGS)
258 {
259 char *inputText = PG_GETARG_CSTRING(0);
260 char *tp;
261 char *rp;
262 int bc;
263 bytea *result;
264
265 /* Recognize hex input */
266 if (inputText[0] == '\\' && inputText[1] == 'x')
267 {
268 size_t len = strlen(inputText);
269
270 bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
271 result = palloc(bc);
272 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
273 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
274
275 PG_RETURN_BYTEA_P(result);
276 }
277
278 /* Else, it's the traditional escaped style */
279 for (bc = 0, tp = inputText; *tp != '\0'; bc++)
280 {
281 if (tp[0] != '\\')
282 tp++;
283 else if ((tp[0] == '\\') &&
284 (tp[1] >= '0' && tp[1] <= '3') &&
285 (tp[2] >= '0' && tp[2] <= '7') &&
286 (tp[3] >= '0' && tp[3] <= '7'))
287 tp += 4;
288 else if ((tp[0] == '\\') &&
289 (tp[1] == '\\'))
290 tp += 2;
291 else
292 {
293 /*
294 * one backslash, not followed by another or ### valid octal
295 */
296 ereport(ERROR,
297 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
298 errmsg("invalid input syntax for type bytea")));
299 }
300 }
301
302 bc += VARHDRSZ;
303
304 result = (bytea *) palloc(bc);
305 SET_VARSIZE(result, bc);
306
307 tp = inputText;
308 rp = VARDATA(result);
309 while (*tp != '\0')
310 {
311 if (tp[0] != '\\')
312 *rp++ = *tp++;
313 else if ((tp[0] == '\\') &&
314 (tp[1] >= '0' && tp[1] <= '3') &&
315 (tp[2] >= '0' && tp[2] <= '7') &&
316 (tp[3] >= '0' && tp[3] <= '7'))
317 {
318 bc = VAL(tp[1]);
319 bc <<= 3;
320 bc += VAL(tp[2]);
321 bc <<= 3;
322 *rp++ = bc + VAL(tp[3]);
323
324 tp += 4;
325 }
326 else if ((tp[0] == '\\') &&
327 (tp[1] == '\\'))
328 {
329 *rp++ = '\\';
330 tp += 2;
331 }
332 else
333 {
334 /*
335 * We should never get here. The first pass should not allow it.
336 */
337 ereport(ERROR,
338 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
339 errmsg("invalid input syntax for type bytea")));
340 }
341 }
342
343 PG_RETURN_BYTEA_P(result);
344 }
345
346 /*
347 * byteaout - converts to printable representation of byte array
348 *
349 * In the traditional escaped format, non-printable characters are
350 * printed as '\nnn' (octal) and '\' as '\\'.
351 */
352 Datum
byteaout(PG_FUNCTION_ARGS)353 byteaout(PG_FUNCTION_ARGS)
354 {
355 bytea *vlena = PG_GETARG_BYTEA_PP(0);
356 char *result;
357 char *rp;
358
359 if (bytea_output == BYTEA_OUTPUT_HEX)
360 {
361 /* Print hex format */
362 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
363 *rp++ = '\\';
364 *rp++ = 'x';
365 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
366 }
367 else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
368 {
369 /* Print traditional escaped format */
370 char *vp;
371 int len;
372 int i;
373
374 len = 1; /* empty string has 1 char */
375 vp = VARDATA_ANY(vlena);
376 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
377 {
378 if (*vp == '\\')
379 len += 2;
380 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
381 len += 4;
382 else
383 len++;
384 }
385 rp = result = (char *) palloc(len);
386 vp = VARDATA_ANY(vlena);
387 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
388 {
389 if (*vp == '\\')
390 {
391 *rp++ = '\\';
392 *rp++ = '\\';
393 }
394 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
395 {
396 int val; /* holds unprintable chars */
397
398 val = *vp;
399 rp[0] = '\\';
400 rp[3] = DIG(val & 07);
401 val >>= 3;
402 rp[2] = DIG(val & 07);
403 val >>= 3;
404 rp[1] = DIG(val & 03);
405 rp += 4;
406 }
407 else
408 *rp++ = *vp;
409 }
410 }
411 else
412 {
413 elog(ERROR, "unrecognized bytea_output setting: %d",
414 bytea_output);
415 rp = result = NULL; /* keep compiler quiet */
416 }
417 *rp = '\0';
418 PG_RETURN_CSTRING(result);
419 }
420
421 /*
422 * bytearecv - converts external binary format to bytea
423 */
424 Datum
bytearecv(PG_FUNCTION_ARGS)425 bytearecv(PG_FUNCTION_ARGS)
426 {
427 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
428 bytea *result;
429 int nbytes;
430
431 nbytes = buf->len - buf->cursor;
432 result = (bytea *) palloc(nbytes + VARHDRSZ);
433 SET_VARSIZE(result, nbytes + VARHDRSZ);
434 pq_copymsgbytes(buf, VARDATA(result), nbytes);
435 PG_RETURN_BYTEA_P(result);
436 }
437
438 /*
439 * byteasend - converts bytea to binary format
440 *
441 * This is a special case: just copy the input...
442 */
443 Datum
byteasend(PG_FUNCTION_ARGS)444 byteasend(PG_FUNCTION_ARGS)
445 {
446 bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
447
448 PG_RETURN_BYTEA_P(vlena);
449 }
450
451 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)452 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
453 {
454 StringInfo state;
455
456 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
457
458 /* Append the value unless null. */
459 if (!PG_ARGISNULL(1))
460 {
461 bytea *value = PG_GETARG_BYTEA_PP(1);
462
463 /* On the first time through, we ignore the delimiter. */
464 if (state == NULL)
465 state = makeStringAggState(fcinfo);
466 else if (!PG_ARGISNULL(2))
467 {
468 bytea *delim = PG_GETARG_BYTEA_PP(2);
469
470 appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
471 }
472
473 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
474 }
475
476 /*
477 * The transition type for string_agg() is declared to be "internal",
478 * which is a pass-by-value type the same size as a pointer.
479 */
480 PG_RETURN_POINTER(state);
481 }
482
483 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)484 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
485 {
486 StringInfo state;
487
488 /* cannot be called directly because of internal-type argument */
489 Assert(AggCheckCallContext(fcinfo, NULL));
490
491 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
492
493 if (state != NULL)
494 {
495 bytea *result;
496
497 result = (bytea *) palloc(state->len + VARHDRSZ);
498 SET_VARSIZE(result, state->len + VARHDRSZ);
499 memcpy(VARDATA(result), state->data, state->len);
500 PG_RETURN_BYTEA_P(result);
501 }
502 else
503 PG_RETURN_NULL();
504 }
505
506 /*
507 * textin - converts "..." to internal representation
508 */
509 Datum
textin(PG_FUNCTION_ARGS)510 textin(PG_FUNCTION_ARGS)
511 {
512 char *inputText = PG_GETARG_CSTRING(0);
513
514 PG_RETURN_TEXT_P(cstring_to_text(inputText));
515 }
516
517 /*
518 * textout - converts internal representation to "..."
519 */
520 Datum
textout(PG_FUNCTION_ARGS)521 textout(PG_FUNCTION_ARGS)
522 {
523 Datum txt = PG_GETARG_DATUM(0);
524
525 PG_RETURN_CSTRING(TextDatumGetCString(txt));
526 }
527
528 /*
529 * textrecv - converts external binary format to text
530 */
531 Datum
textrecv(PG_FUNCTION_ARGS)532 textrecv(PG_FUNCTION_ARGS)
533 {
534 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
535 text *result;
536 char *str;
537 int nbytes;
538
539 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
540
541 result = cstring_to_text_with_len(str, nbytes);
542 pfree(str);
543 PG_RETURN_TEXT_P(result);
544 }
545
546 /*
547 * textsend - converts text to binary format
548 */
549 Datum
textsend(PG_FUNCTION_ARGS)550 textsend(PG_FUNCTION_ARGS)
551 {
552 text *t = PG_GETARG_TEXT_PP(0);
553 StringInfoData buf;
554
555 pq_begintypsend(&buf);
556 pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
557 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
558 }
559
560
561 /*
562 * unknownin - converts "..." to internal representation
563 */
564 Datum
unknownin(PG_FUNCTION_ARGS)565 unknownin(PG_FUNCTION_ARGS)
566 {
567 char *str = PG_GETARG_CSTRING(0);
568
569 /* representation is same as cstring */
570 PG_RETURN_CSTRING(pstrdup(str));
571 }
572
573 /*
574 * unknownout - converts internal representation to "..."
575 */
576 Datum
unknownout(PG_FUNCTION_ARGS)577 unknownout(PG_FUNCTION_ARGS)
578 {
579 /* representation is same as cstring */
580 char *str = PG_GETARG_CSTRING(0);
581
582 PG_RETURN_CSTRING(pstrdup(str));
583 }
584
585 /*
586 * unknownrecv - converts external binary format to unknown
587 */
588 Datum
unknownrecv(PG_FUNCTION_ARGS)589 unknownrecv(PG_FUNCTION_ARGS)
590 {
591 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
592 char *str;
593 int nbytes;
594
595 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
596 /* representation is same as cstring */
597 PG_RETURN_CSTRING(str);
598 }
599
600 /*
601 * unknownsend - converts unknown to binary format
602 */
603 Datum
unknownsend(PG_FUNCTION_ARGS)604 unknownsend(PG_FUNCTION_ARGS)
605 {
606 /* representation is same as cstring */
607 char *str = PG_GETARG_CSTRING(0);
608 StringInfoData buf;
609
610 pq_begintypsend(&buf);
611 pq_sendtext(&buf, str, strlen(str));
612 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
613 }
614
615
616 /* ========== PUBLIC ROUTINES ========== */
617
618 /*
619 * textlen -
620 * returns the logical length of a text*
621 * (which is less than the VARSIZE of the text*)
622 */
623 Datum
textlen(PG_FUNCTION_ARGS)624 textlen(PG_FUNCTION_ARGS)
625 {
626 Datum str = PG_GETARG_DATUM(0);
627
628 /* try to avoid decompressing argument */
629 PG_RETURN_INT32(text_length(str));
630 }
631
632 /*
633 * text_length -
634 * Does the real work for textlen()
635 *
636 * This is broken out so it can be called directly by other string processing
637 * functions. Note that the argument is passed as a Datum, to indicate that
638 * it may still be in compressed form. We can avoid decompressing it at all
639 * in some cases.
640 */
641 static int32
text_length(Datum str)642 text_length(Datum str)
643 {
644 /* fastpath when max encoding length is one */
645 if (pg_database_encoding_max_length() == 1)
646 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
647 else
648 {
649 text *t = DatumGetTextPP(str);
650
651 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
652 VARSIZE_ANY_EXHDR(t)));
653 }
654 }
655
656 /*
657 * textoctetlen -
658 * returns the physical length of a text*
659 * (which is less than the VARSIZE of the text*)
660 */
661 Datum
textoctetlen(PG_FUNCTION_ARGS)662 textoctetlen(PG_FUNCTION_ARGS)
663 {
664 Datum str = PG_GETARG_DATUM(0);
665
666 /* We need not detoast the input at all */
667 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
668 }
669
670 /*
671 * textcat -
672 * takes two text* and returns a text* that is the concatenation of
673 * the two.
674 *
675 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
676 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
677 * Allocate space for output in all cases.
678 * XXX - thomas 1997-07-10
679 */
680 Datum
textcat(PG_FUNCTION_ARGS)681 textcat(PG_FUNCTION_ARGS)
682 {
683 text *t1 = PG_GETARG_TEXT_PP(0);
684 text *t2 = PG_GETARG_TEXT_PP(1);
685
686 PG_RETURN_TEXT_P(text_catenate(t1, t2));
687 }
688
689 /*
690 * text_catenate
691 * Guts of textcat(), broken out so it can be used by other functions
692 *
693 * Arguments can be in short-header form, but not compressed or out-of-line
694 */
695 static text *
text_catenate(text * t1,text * t2)696 text_catenate(text *t1, text *t2)
697 {
698 text *result;
699 int len1,
700 len2,
701 len;
702 char *ptr;
703
704 len1 = VARSIZE_ANY_EXHDR(t1);
705 len2 = VARSIZE_ANY_EXHDR(t2);
706
707 /* paranoia ... probably should throw error instead? */
708 if (len1 < 0)
709 len1 = 0;
710 if (len2 < 0)
711 len2 = 0;
712
713 len = len1 + len2 + VARHDRSZ;
714 result = (text *) palloc(len);
715
716 /* Set size of result string... */
717 SET_VARSIZE(result, len);
718
719 /* Fill data field of result string... */
720 ptr = VARDATA(result);
721 if (len1 > 0)
722 memcpy(ptr, VARDATA_ANY(t1), len1);
723 if (len2 > 0)
724 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
725
726 return result;
727 }
728
/*
 * charlen_to_bytelen()
 *  Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must make sure that n characters really are present; the
 * string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
    const char *cursor = p;

    /* Optimization for single-byte encodings: one byte per character */
    if (pg_database_encoding_max_length() == 1)
        return n;

    /* Otherwise step over the characters one at a time */
    while (n-- > 0)
        cursor += pg_mblen(cursor);

    return cursor - p;
}
754
755 /*
756 * text_substr()
757 * Return a substring starting at the specified position.
758 * - thomas 1997-12-31
759 *
760 * Input:
761 * - string
762 * - starting position (is one-based)
763 * - string length
764 *
765 * If the starting position is zero or less, then return from the start of the string
766 * adjusting the length to be consistent with the "negative start" per SQL.
767 * If the length is less than zero, return the remaining string.
768 *
769 * Added multibyte support.
770 * - Tatsuo Ishii 1998-4-21
771 * Changed behavior if starting position is less than one to conform to SQL behavior.
772 * Formerly returned the entire string; now returns a portion.
773 * - Thomas Lockhart 1998-12-10
774 * Now uses faster TOAST-slicing interface
775 * - John Gray 2002-02-22
776 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
777 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
778 * error; if E < 1, return '', not entire string). Fixed MB related bug when
779 * S > LC and < LC + 4 sometimes garbage characters are returned.
780 * - Joe Conway 2002-08-10
781 */
782 Datum
text_substr(PG_FUNCTION_ARGS)783 text_substr(PG_FUNCTION_ARGS)
784 {
785 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
786 PG_GETARG_INT32(1),
787 PG_GETARG_INT32(2),
788 false));
789 }
790
791 /*
792 * text_substr_no_len -
793 * Wrapper to avoid opr_sanity failure due to
794 * one function accepting a different number of args.
795 */
796 Datum
text_substr_no_len(PG_FUNCTION_ARGS)797 text_substr_no_len(PG_FUNCTION_ARGS)
798 {
799 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
800 PG_GETARG_INT32(1),
801 -1, true));
802 }
803
804 /*
805 * text_substring -
806 * Does the real work for text_substr() and text_substr_no_len()
807 *
808 * This is broken out so it can be called directly by other string processing
809 * functions. Note that the argument is passed as a Datum, to indicate that
810 * it may still be in compressed/toasted form. We can avoid detoasting all
811 * of it in some cases.
812 *
813 * The result is always a freshly palloc'd datum.
814 */
815 static text *
text_substring(Datum str,int32 start,int32 length,bool length_not_specified)816 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
817 {
818 int32 eml = pg_database_encoding_max_length();
819 int32 S = start; /* start position */
820 int32 S1; /* adjusted start position */
821 int32 L1; /* adjusted substring length */
822
823 /* life is easy if the encoding max length is 1 */
824 if (eml == 1)
825 {
826 S1 = Max(S, 1);
827
828 if (length_not_specified) /* special case - get length to end of
829 * string */
830 L1 = -1;
831 else
832 {
833 /* end position */
834 int E = S + length;
835
836 /*
837 * A negative value for L is the only way for the end position to
838 * be before the start. SQL99 says to throw an error.
839 */
840 if (E < S)
841 ereport(ERROR,
842 (errcode(ERRCODE_SUBSTRING_ERROR),
843 errmsg("negative substring length not allowed")));
844
845 /*
846 * A zero or negative value for the end position can happen if the
847 * start was negative or one. SQL99 says to return a zero-length
848 * string.
849 */
850 if (E < 1)
851 return cstring_to_text("");
852
853 L1 = E - S1;
854 }
855
856 /*
857 * If the start position is past the end of the string, SQL99 says to
858 * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
859 * that for us. Convert to zero-based starting position
860 */
861 return DatumGetTextPSlice(str, S1 - 1, L1);
862 }
863 else if (eml > 1)
864 {
865 /*
866 * When encoding max length is > 1, we can't get LC without
867 * detoasting, so we'll grab a conservatively large slice now and go
868 * back later to do the right thing
869 */
870 int32 slice_start;
871 int32 slice_size;
872 int32 slice_strlen;
873 text *slice;
874 int32 E1;
875 int32 i;
876 char *p;
877 char *s;
878 text *ret;
879
880 /*
881 * if S is past the end of the string, the tuple toaster will return a
882 * zero-length string to us
883 */
884 S1 = Max(S, 1);
885
886 /*
887 * We need to start at position zero because there is no way to know
888 * in advance which byte offset corresponds to the supplied start
889 * position.
890 */
891 slice_start = 0;
892
893 if (length_not_specified) /* special case - get length to end of
894 * string */
895 slice_size = L1 = -1;
896 else
897 {
898 int E = S + length;
899
900 /*
901 * A negative value for L is the only way for the end position to
902 * be before the start. SQL99 says to throw an error.
903 */
904 if (E < S)
905 ereport(ERROR,
906 (errcode(ERRCODE_SUBSTRING_ERROR),
907 errmsg("negative substring length not allowed")));
908
909 /*
910 * A zero or negative value for the end position can happen if the
911 * start was negative or one. SQL99 says to return a zero-length
912 * string.
913 */
914 if (E < 1)
915 return cstring_to_text("");
916
917 /*
918 * if E is past the end of the string, the tuple toaster will
919 * truncate the length for us
920 */
921 L1 = E - S1;
922
923 /*
924 * Total slice size in bytes can't be any longer than the start
925 * position plus substring length times the encoding max length.
926 */
927 slice_size = (S1 + L1) * eml;
928 }
929
930 /*
931 * If we're working with an untoasted source, no need to do an extra
932 * copying step.
933 */
934 if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
935 VARATT_IS_EXTERNAL(DatumGetPointer(str)))
936 slice = DatumGetTextPSlice(str, slice_start, slice_size);
937 else
938 slice = (text *) DatumGetPointer(str);
939
940 /* see if we got back an empty string */
941 if (VARSIZE_ANY_EXHDR(slice) == 0)
942 {
943 if (slice != (text *) DatumGetPointer(str))
944 pfree(slice);
945 return cstring_to_text("");
946 }
947
948 /* Now we can get the actual length of the slice in MB characters */
949 slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
950 VARSIZE_ANY_EXHDR(slice));
951
952 /*
953 * Check that the start position wasn't > slice_strlen. If so, SQL99
954 * says to return a zero-length string.
955 */
956 if (S1 > slice_strlen)
957 {
958 if (slice != (text *) DatumGetPointer(str))
959 pfree(slice);
960 return cstring_to_text("");
961 }
962
963 /*
964 * Adjust L1 and E1 now that we know the slice string length. Again
965 * remember that S1 is one based, and slice_start is zero based.
966 */
967 if (L1 > -1)
968 E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
969 else
970 E1 = slice_start + 1 + slice_strlen;
971
972 /*
973 * Find the start position in the slice; remember S1 is not zero based
974 */
975 p = VARDATA_ANY(slice);
976 for (i = 0; i < S1 - 1; i++)
977 p += pg_mblen(p);
978
979 /* hang onto a pointer to our start position */
980 s = p;
981
982 /*
983 * Count the actual bytes used by the substring of the requested
984 * length.
985 */
986 for (i = S1; i < E1; i++)
987 p += pg_mblen(p);
988
989 ret = (text *) palloc(VARHDRSZ + (p - s));
990 SET_VARSIZE(ret, VARHDRSZ + (p - s));
991 memcpy(VARDATA(ret), s, (p - s));
992
993 if (slice != (text *) DatumGetPointer(str))
994 pfree(slice);
995
996 return ret;
997 }
998 else
999 elog(ERROR, "invalid backend encoding: encoding max length < 1");
1000
1001 /* not reached: suppress compiler warning */
1002 return NULL;
1003 }
1004
1005 /*
1006 * textoverlay
1007 * Replace specified substring of first string with second
1008 *
1009 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1010 * This code is a direct implementation of what the standard says.
1011 */
1012 Datum
textoverlay(PG_FUNCTION_ARGS)1013 textoverlay(PG_FUNCTION_ARGS)
1014 {
1015 text *t1 = PG_GETARG_TEXT_PP(0);
1016 text *t2 = PG_GETARG_TEXT_PP(1);
1017 int sp = PG_GETARG_INT32(2); /* substring start position */
1018 int sl = PG_GETARG_INT32(3); /* substring length */
1019
1020 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1021 }
1022
1023 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1024 textoverlay_no_len(PG_FUNCTION_ARGS)
1025 {
1026 text *t1 = PG_GETARG_TEXT_PP(0);
1027 text *t2 = PG_GETARG_TEXT_PP(1);
1028 int sp = PG_GETARG_INT32(2); /* substring start position */
1029 int sl;
1030
1031 sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1032 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1033 }
1034
1035 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1036 text_overlay(text *t1, text *t2, int sp, int sl)
1037 {
1038 text *result;
1039 text *s1;
1040 text *s2;
1041 int sp_pl_sl;
1042
1043 /*
1044 * Check for possible integer-overflow cases. For negative sp, throw a
1045 * "substring length" error because that's what should be expected
1046 * according to the spec's definition of OVERLAY().
1047 */
1048 if (sp <= 0)
1049 ereport(ERROR,
1050 (errcode(ERRCODE_SUBSTRING_ERROR),
1051 errmsg("negative substring length not allowed")));
1052 sp_pl_sl = sp + sl;
1053 if (sp_pl_sl <= sl)
1054 ereport(ERROR,
1055 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1056 errmsg("integer out of range")));
1057
1058 s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1059 s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1060 result = text_catenate(s1, t2);
1061 result = text_catenate(result, s2);
1062
1063 return result;
1064 }
1065
1066 /*
1067 * textpos -
1068 * Return the position of the specified substring.
1069 * Implements the SQL POSITION() function.
1070 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1071 * - thomas 1997-07-27
1072 */
1073 Datum
textpos(PG_FUNCTION_ARGS)1074 textpos(PG_FUNCTION_ARGS)
1075 {
1076 text *str = PG_GETARG_TEXT_PP(0);
1077 text *search_str = PG_GETARG_TEXT_PP(1);
1078
1079 PG_RETURN_INT32((int32) text_position(str, search_str));
1080 }
1081
1082 /*
1083 * text_position -
1084 * Does the real work for textpos()
1085 *
1086 * Inputs:
1087 * t1 - string to be searched
1088 * t2 - pattern to match within t1
1089 * Result:
1090 * Character index of the first matched char, starting from 1,
1091 * or 0 if no match.
1092 *
1093 * This is broken out so it can be called directly by other string processing
1094 * functions.
1095 */
1096 static int
text_position(text * t1,text * t2)1097 text_position(text *t1, text *t2)
1098 {
1099 TextPositionState state;
1100 int result;
1101
1102 text_position_setup(t1, t2, &state);
1103 result = text_position_next(1, &state);
1104 text_position_cleanup(&state);
1105 return result;
1106 }
1107
1108
1109 /*
1110 * text_position_setup, text_position_next, text_position_cleanup -
1111 * Component steps of text_position()
1112 *
1113 * These are broken out so that a string can be efficiently searched for
1114 * multiple occurrences of the same pattern. text_position_next may be
1115 * called multiple times with increasing values of start_pos, which is
1116 * the 1-based character position to start the search from. The "state"
1117 * variable is normally just a local variable in the caller.
1118 */
1119
1120 static void
text_position_setup(text * t1,text * t2,TextPositionState * state)1121 text_position_setup(text *t1, text *t2, TextPositionState *state)
1122 {
1123 int len1 = VARSIZE_ANY_EXHDR(t1);
1124 int len2 = VARSIZE_ANY_EXHDR(t2);
1125
1126 if (pg_database_encoding_max_length() == 1)
1127 {
1128 /* simple case - single byte encoding */
1129 state->use_wchar = false;
1130 state->str1 = VARDATA_ANY(t1);
1131 state->str2 = VARDATA_ANY(t2);
1132 state->len1 = len1;
1133 state->len2 = len2;
1134 }
1135 else
1136 {
1137 /* not as simple - multibyte encoding */
1138 pg_wchar *p1,
1139 *p2;
1140
1141 p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1142 len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1143 p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1144 len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1145
1146 state->use_wchar = true;
1147 state->wstr1 = p1;
1148 state->wstr2 = p2;
1149 state->len1 = len1;
1150 state->len2 = len2;
1151 }
1152
1153 /*
1154 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1155 * notes we use the terminology that the "haystack" is the string to be
1156 * searched (t1) and the "needle" is the pattern being sought (t2).
1157 *
1158 * If the needle is empty or bigger than the haystack then there is no
1159 * point in wasting cycles initializing the table. We also choose not to
1160 * use B-M-H for needles of length 1, since the skip table can't possibly
1161 * save anything in that case.
1162 */
1163 if (len1 >= len2 && len2 > 1)
1164 {
1165 int searchlength = len1 - len2;
1166 int skiptablemask;
1167 int last;
1168 int i;
1169
1170 /*
1171 * First we must determine how much of the skip table to use. The
1172 * declaration of TextPositionState allows up to 256 elements, but for
1173 * short search problems we don't really want to have to initialize so
1174 * many elements --- it would take too long in comparison to the
1175 * actual search time. So we choose a useful skip table size based on
1176 * the haystack length minus the needle length. The closer the needle
1177 * length is to the haystack length the less useful skipping becomes.
1178 *
1179 * Note: since we use bit-masking to select table elements, the skip
1180 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1181 */
1182 if (searchlength < 16)
1183 skiptablemask = 3;
1184 else if (searchlength < 64)
1185 skiptablemask = 7;
1186 else if (searchlength < 128)
1187 skiptablemask = 15;
1188 else if (searchlength < 512)
1189 skiptablemask = 31;
1190 else if (searchlength < 2048)
1191 skiptablemask = 63;
1192 else if (searchlength < 4096)
1193 skiptablemask = 127;
1194 else
1195 skiptablemask = 255;
1196 state->skiptablemask = skiptablemask;
1197
1198 /*
1199 * Initialize the skip table. We set all elements to the needle
1200 * length, since this is the correct skip distance for any character
1201 * not found in the needle.
1202 */
1203 for (i = 0; i <= skiptablemask; i++)
1204 state->skiptable[i] = len2;
1205
1206 /*
1207 * Now examine the needle. For each character except the last one,
1208 * set the corresponding table element to the appropriate skip
1209 * distance. Note that when two characters share the same skip table
1210 * entry, the one later in the needle must determine the skip
1211 * distance.
1212 */
1213 last = len2 - 1;
1214
1215 if (!state->use_wchar)
1216 {
1217 const char *str2 = state->str2;
1218
1219 for (i = 0; i < last; i++)
1220 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1221 }
1222 else
1223 {
1224 const pg_wchar *wstr2 = state->wstr2;
1225
1226 for (i = 0; i < last; i++)
1227 state->skiptable[wstr2[i] & skiptablemask] = last - i;
1228 }
1229 }
1230 }
1231
1232 static int
text_position_next(int start_pos,TextPositionState * state)1233 text_position_next(int start_pos, TextPositionState *state)
1234 {
1235 int haystack_len = state->len1;
1236 int needle_len = state->len2;
1237 int skiptablemask = state->skiptablemask;
1238
1239 Assert(start_pos > 0); /* else caller error */
1240
1241 if (needle_len <= 0)
1242 return start_pos; /* result for empty pattern */
1243
1244 start_pos--; /* adjust for zero based arrays */
1245
1246 /* Done if the needle can't possibly fit */
1247 if (haystack_len < start_pos + needle_len)
1248 return 0;
1249
1250 if (!state->use_wchar)
1251 {
1252 /* simple case - single byte encoding */
1253 const char *haystack = state->str1;
1254 const char *needle = state->str2;
1255 const char *haystack_end = &haystack[haystack_len];
1256 const char *hptr;
1257
1258 if (needle_len == 1)
1259 {
1260 /* No point in using B-M-H for a one-character needle */
1261 char nchar = *needle;
1262
1263 hptr = &haystack[start_pos];
1264 while (hptr < haystack_end)
1265 {
1266 if (*hptr == nchar)
1267 return hptr - haystack + 1;
1268 hptr++;
1269 }
1270 }
1271 else
1272 {
1273 const char *needle_last = &needle[needle_len - 1];
1274
1275 /* Start at startpos plus the length of the needle */
1276 hptr = &haystack[start_pos + needle_len - 1];
1277 while (hptr < haystack_end)
1278 {
1279 /* Match the needle scanning *backward* */
1280 const char *nptr;
1281 const char *p;
1282
1283 nptr = needle_last;
1284 p = hptr;
1285 while (*nptr == *p)
1286 {
1287 /* Matched it all? If so, return 1-based position */
1288 if (nptr == needle)
1289 return p - haystack + 1;
1290 nptr--, p--;
1291 }
1292
1293 /*
1294 * No match, so use the haystack char at hptr to decide how
1295 * far to advance. If the needle had any occurrence of that
1296 * character (or more precisely, one sharing the same
1297 * skiptable entry) before its last character, then we advance
1298 * far enough to align the last such needle character with
1299 * that haystack position. Otherwise we can advance by the
1300 * whole needle length.
1301 */
1302 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1303 }
1304 }
1305 }
1306 else
1307 {
1308 /* The multibyte char version. This works exactly the same way. */
1309 const pg_wchar *haystack = state->wstr1;
1310 const pg_wchar *needle = state->wstr2;
1311 const pg_wchar *haystack_end = &haystack[haystack_len];
1312 const pg_wchar *hptr;
1313
1314 if (needle_len == 1)
1315 {
1316 /* No point in using B-M-H for a one-character needle */
1317 pg_wchar nchar = *needle;
1318
1319 hptr = &haystack[start_pos];
1320 while (hptr < haystack_end)
1321 {
1322 if (*hptr == nchar)
1323 return hptr - haystack + 1;
1324 hptr++;
1325 }
1326 }
1327 else
1328 {
1329 const pg_wchar *needle_last = &needle[needle_len - 1];
1330
1331 /* Start at startpos plus the length of the needle */
1332 hptr = &haystack[start_pos + needle_len - 1];
1333 while (hptr < haystack_end)
1334 {
1335 /* Match the needle scanning *backward* */
1336 const pg_wchar *nptr;
1337 const pg_wchar *p;
1338
1339 nptr = needle_last;
1340 p = hptr;
1341 while (*nptr == *p)
1342 {
1343 /* Matched it all? If so, return 1-based position */
1344 if (nptr == needle)
1345 return p - haystack + 1;
1346 nptr--, p--;
1347 }
1348
1349 /*
1350 * No match, so use the haystack char at hptr to decide how
1351 * far to advance. If the needle had any occurrence of that
1352 * character (or more precisely, one sharing the same
1353 * skiptable entry) before its last character, then we advance
1354 * far enough to align the last such needle character with
1355 * that haystack position. Otherwise we can advance by the
1356 * whole needle length.
1357 */
1358 hptr += state->skiptable[*hptr & skiptablemask];
1359 }
1360 }
1361 }
1362
1363 return 0; /* not found */
1364 }
1365
1366 static void
text_position_cleanup(TextPositionState * state)1367 text_position_cleanup(TextPositionState *state)
1368 {
1369 if (state->use_wchar)
1370 {
1371 pfree(state->wstr1);
1372 pfree(state->wstr2);
1373 }
1374 }
1375
1376 /* varstr_cmp()
1377 * Comparison function for text strings with given lengths.
1378 * Includes locale support, but must copy strings to temporary memory
1379 * to allow null-termination for inputs to strcoll().
1380 * Returns an integer less than, equal to, or greater than zero, indicating
1381 * whether arg1 is less than, equal to, or greater than arg2.
1382 */
1383 int
varstr_cmp(char * arg1,int len1,char * arg2,int len2,Oid collid)1384 varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
1385 {
1386 int result;
1387
1388 /*
1389 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1390 * have to do some memory copying. This turns out to be significantly
1391 * slower, so we optimize the case where LC_COLLATE is C. We also try to
1392 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1393 */
1394 if (lc_collate_is_c(collid))
1395 {
1396 result = memcmp(arg1, arg2, Min(len1, len2));
1397 if ((result == 0) && (len1 != len2))
1398 result = (len1 < len2) ? -1 : 1;
1399 }
1400 else
1401 {
1402 char a1buf[TEXTBUFLEN];
1403 char a2buf[TEXTBUFLEN];
1404 char *a1p,
1405 *a2p;
1406
1407 #ifdef HAVE_LOCALE_T
1408 pg_locale_t mylocale = 0;
1409 #endif
1410
1411 if (collid != DEFAULT_COLLATION_OID)
1412 {
1413 if (!OidIsValid(collid))
1414 {
1415 /*
1416 * This typically means that the parser could not resolve a
1417 * conflict of implicit collations, so report it that way.
1418 */
1419 ereport(ERROR,
1420 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1421 errmsg("could not determine which collation to use for string comparison"),
1422 errhint("Use the COLLATE clause to set the collation explicitly.")));
1423 }
1424 #ifdef HAVE_LOCALE_T
1425 mylocale = pg_newlocale_from_collation(collid);
1426 #endif
1427 }
1428
1429 /*
1430 * memcmp() can't tell us which of two unequal strings sorts first,
1431 * but it's a cheap way to tell if they're equal. Testing shows that
1432 * memcmp() followed by strcoll() is only trivially slower than
1433 * strcoll() by itself, so we don't lose much if this doesn't work out
1434 * very often, and if it does - for example, because there are many
1435 * equal strings in the input - then we win big by avoiding expensive
1436 * collation-aware comparisons.
1437 */
1438 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1439 return 0;
1440
1441 #ifdef WIN32
1442 /* Win32 does not have UTF-8, so we need to map to UTF-16 */
1443 if (GetDatabaseEncoding() == PG_UTF8)
1444 {
1445 int a1len;
1446 int a2len;
1447 int r;
1448
1449 if (len1 >= TEXTBUFLEN / 2)
1450 {
1451 a1len = len1 * 2 + 2;
1452 a1p = palloc(a1len);
1453 }
1454 else
1455 {
1456 a1len = TEXTBUFLEN;
1457 a1p = a1buf;
1458 }
1459 if (len2 >= TEXTBUFLEN / 2)
1460 {
1461 a2len = len2 * 2 + 2;
1462 a2p = palloc(a2len);
1463 }
1464 else
1465 {
1466 a2len = TEXTBUFLEN;
1467 a2p = a2buf;
1468 }
1469
1470 /* stupid Microsloth API does not work for zero-length input */
1471 if (len1 == 0)
1472 r = 0;
1473 else
1474 {
1475 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1476 (LPWSTR) a1p, a1len / 2);
1477 if (!r)
1478 ereport(ERROR,
1479 (errmsg("could not convert string to UTF-16: error code %lu",
1480 GetLastError())));
1481 }
1482 ((LPWSTR) a1p)[r] = 0;
1483
1484 if (len2 == 0)
1485 r = 0;
1486 else
1487 {
1488 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1489 (LPWSTR) a2p, a2len / 2);
1490 if (!r)
1491 ereport(ERROR,
1492 (errmsg("could not convert string to UTF-16: error code %lu",
1493 GetLastError())));
1494 }
1495 ((LPWSTR) a2p)[r] = 0;
1496
1497 errno = 0;
1498 #ifdef HAVE_LOCALE_T
1499 if (mylocale)
1500 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);
1501 else
1502 #endif
1503 result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1504 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw
1505 * headers */
1506 ereport(ERROR,
1507 (errmsg("could not compare Unicode strings: %m")));
1508
1509 /*
1510 * In some locales wcscoll() can claim that nonidentical strings
1511 * are equal. Believing that would be bad news for a number of
1512 * reasons, so we follow Perl's lead and sort "equal" strings
1513 * according to strcmp (on the UTF-8 representation).
1514 */
1515 if (result == 0)
1516 {
1517 result = memcmp(arg1, arg2, Min(len1, len2));
1518 if ((result == 0) && (len1 != len2))
1519 result = (len1 < len2) ? -1 : 1;
1520 }
1521
1522 if (a1p != a1buf)
1523 pfree(a1p);
1524 if (a2p != a2buf)
1525 pfree(a2p);
1526
1527 return result;
1528 }
1529 #endif /* WIN32 */
1530
1531 if (len1 >= TEXTBUFLEN)
1532 a1p = (char *) palloc(len1 + 1);
1533 else
1534 a1p = a1buf;
1535 if (len2 >= TEXTBUFLEN)
1536 a2p = (char *) palloc(len2 + 1);
1537 else
1538 a2p = a2buf;
1539
1540 memcpy(a1p, arg1, len1);
1541 a1p[len1] = '\0';
1542 memcpy(a2p, arg2, len2);
1543 a2p[len2] = '\0';
1544
1545 #ifdef HAVE_LOCALE_T
1546 if (mylocale)
1547 result = strcoll_l(a1p, a2p, mylocale);
1548 else
1549 #endif
1550 result = strcoll(a1p, a2p);
1551
1552 /*
1553 * In some locales strcoll() can claim that nonidentical strings are
1554 * equal. Believing that would be bad news for a number of reasons,
1555 * so we follow Perl's lead and sort "equal" strings according to
1556 * strcmp().
1557 */
1558 if (result == 0)
1559 result = strcmp(a1p, a2p);
1560
1561 if (a1p != a1buf)
1562 pfree(a1p);
1563 if (a2p != a2buf)
1564 pfree(a2p);
1565 }
1566
1567 return result;
1568 }
1569
1570 /* text_cmp()
1571 * Internal comparison function for text strings.
1572 * Returns -1, 0 or 1
1573 */
1574 static int
text_cmp(text * arg1,text * arg2,Oid collid)1575 text_cmp(text *arg1, text *arg2, Oid collid)
1576 {
1577 char *a1p,
1578 *a2p;
1579 int len1,
1580 len2;
1581
1582 a1p = VARDATA_ANY(arg1);
1583 a2p = VARDATA_ANY(arg2);
1584
1585 len1 = VARSIZE_ANY_EXHDR(arg1);
1586 len2 = VARSIZE_ANY_EXHDR(arg2);
1587
1588 return varstr_cmp(a1p, len1, a2p, len2, collid);
1589 }
1590
1591 /*
1592 * Comparison functions for text strings.
1593 *
1594 * Note: btree indexes need these routines not to leak memory; therefore,
1595 * be careful to free working copies of toasted datums. Most places don't
1596 * need to be so careful.
1597 */
1598
1599 Datum
texteq(PG_FUNCTION_ARGS)1600 texteq(PG_FUNCTION_ARGS)
1601 {
1602 Datum arg1 = PG_GETARG_DATUM(0);
1603 Datum arg2 = PG_GETARG_DATUM(1);
1604 bool result;
1605 Size len1,
1606 len2;
1607
1608 /*
1609 * Since we only care about equality or not-equality, we can avoid all the
1610 * expense of strcoll() here, and just do bitwise comparison. In fact, we
1611 * don't even have to do a bitwise comparison if we can show the lengths
1612 * of the strings are unequal; which might save us from having to detoast
1613 * one or both values.
1614 */
1615 len1 = toast_raw_datum_size(arg1);
1616 len2 = toast_raw_datum_size(arg2);
1617 if (len1 != len2)
1618 result = false;
1619 else
1620 {
1621 text *targ1 = DatumGetTextPP(arg1);
1622 text *targ2 = DatumGetTextPP(arg2);
1623
1624 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1625 len1 - VARHDRSZ) == 0);
1626
1627 PG_FREE_IF_COPY(targ1, 0);
1628 PG_FREE_IF_COPY(targ2, 1);
1629 }
1630
1631 PG_RETURN_BOOL(result);
1632 }
1633
1634 Datum
textne(PG_FUNCTION_ARGS)1635 textne(PG_FUNCTION_ARGS)
1636 {
1637 Datum arg1 = PG_GETARG_DATUM(0);
1638 Datum arg2 = PG_GETARG_DATUM(1);
1639 bool result;
1640 Size len1,
1641 len2;
1642
1643 /* See comment in texteq() */
1644 len1 = toast_raw_datum_size(arg1);
1645 len2 = toast_raw_datum_size(arg2);
1646 if (len1 != len2)
1647 result = true;
1648 else
1649 {
1650 text *targ1 = DatumGetTextPP(arg1);
1651 text *targ2 = DatumGetTextPP(arg2);
1652
1653 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1654 len1 - VARHDRSZ) != 0);
1655
1656 PG_FREE_IF_COPY(targ1, 0);
1657 PG_FREE_IF_COPY(targ2, 1);
1658 }
1659
1660 PG_RETURN_BOOL(result);
1661 }
1662
1663 Datum
text_lt(PG_FUNCTION_ARGS)1664 text_lt(PG_FUNCTION_ARGS)
1665 {
1666 text *arg1 = PG_GETARG_TEXT_PP(0);
1667 text *arg2 = PG_GETARG_TEXT_PP(1);
1668 bool result;
1669
1670 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1671
1672 PG_FREE_IF_COPY(arg1, 0);
1673 PG_FREE_IF_COPY(arg2, 1);
1674
1675 PG_RETURN_BOOL(result);
1676 }
1677
1678 Datum
text_le(PG_FUNCTION_ARGS)1679 text_le(PG_FUNCTION_ARGS)
1680 {
1681 text *arg1 = PG_GETARG_TEXT_PP(0);
1682 text *arg2 = PG_GETARG_TEXT_PP(1);
1683 bool result;
1684
1685 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1686
1687 PG_FREE_IF_COPY(arg1, 0);
1688 PG_FREE_IF_COPY(arg2, 1);
1689
1690 PG_RETURN_BOOL(result);
1691 }
1692
1693 Datum
text_gt(PG_FUNCTION_ARGS)1694 text_gt(PG_FUNCTION_ARGS)
1695 {
1696 text *arg1 = PG_GETARG_TEXT_PP(0);
1697 text *arg2 = PG_GETARG_TEXT_PP(1);
1698 bool result;
1699
1700 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1701
1702 PG_FREE_IF_COPY(arg1, 0);
1703 PG_FREE_IF_COPY(arg2, 1);
1704
1705 PG_RETURN_BOOL(result);
1706 }
1707
1708 Datum
text_ge(PG_FUNCTION_ARGS)1709 text_ge(PG_FUNCTION_ARGS)
1710 {
1711 text *arg1 = PG_GETARG_TEXT_PP(0);
1712 text *arg2 = PG_GETARG_TEXT_PP(1);
1713 bool result;
1714
1715 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1716
1717 PG_FREE_IF_COPY(arg1, 0);
1718 PG_FREE_IF_COPY(arg2, 1);
1719
1720 PG_RETURN_BOOL(result);
1721 }
1722
1723 Datum
bttextcmp(PG_FUNCTION_ARGS)1724 bttextcmp(PG_FUNCTION_ARGS)
1725 {
1726 text *arg1 = PG_GETARG_TEXT_PP(0);
1727 text *arg2 = PG_GETARG_TEXT_PP(1);
1728 int32 result;
1729
1730 result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1731
1732 PG_FREE_IF_COPY(arg1, 0);
1733 PG_FREE_IF_COPY(arg2, 1);
1734
1735 PG_RETURN_INT32(result);
1736 }
1737
1738 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1739 bttextsortsupport(PG_FUNCTION_ARGS)
1740 {
1741 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1742 Oid collid = ssup->ssup_collation;
1743 MemoryContext oldcontext;
1744
1745 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1746
1747 /* Use generic string SortSupport */
1748 varstr_sortsupport(ssup, collid, false);
1749
1750 MemoryContextSwitchTo(oldcontext);
1751
1752 PG_RETURN_VOID();
1753 }
1754
1755 /*
1756 * Generic sortsupport interface for character type's operator classes.
1757 * Includes locale support, and support for BpChar semantics (i.e. removing
1758 * trailing spaces before comparison).
1759 *
1760 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1761 * same representation. Callers that always use the C collation (e.g.
1762 * non-collatable type callers like bytea) may have NUL bytes in their strings;
1763 * this will not work with any other collation, though.
1764 */
1765 void
varstr_sortsupport(SortSupport ssup,Oid collid,bool bpchar)1766 varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
1767 {
1768 bool abbreviate = ssup->abbreviate;
1769 bool collate_c = false;
1770 VarStringSortSupport *sss;
1771
1772 #ifdef HAVE_LOCALE_T
1773 pg_locale_t locale = 0;
1774 #endif
1775
1776 /*
1777 * If possible, set ssup->comparator to a function which can be used to
1778 * directly compare two datums. If we can do this, we'll avoid the
1779 * overhead of a trip through the fmgr layer for every comparison, which
1780 * can be substantial.
1781 *
1782 * Most typically, we'll set the comparator to varstrfastcmp_locale, which
1783 * uses strcoll() to perform comparisons and knows about the special
1784 * requirements of BpChar callers. However, if LC_COLLATE = C, we can
1785 * make things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c,
1786 * both of which use memcmp() rather than strcoll().
1787 *
1788 * There is a further exception on Windows. When the database encoding is
1789 * UTF-8 and we are not using the C collation, complex hacks are required.
1790 * We don't currently have a comparator that handles that case, so we fall
1791 * back on the slow method of having the sort code invoke bttextcmp() (in
1792 * the case of text) via the fmgr trampoline.
1793 */
1794 if (lc_collate_is_c(collid))
1795 {
1796 if (!bpchar)
1797 ssup->comparator = varstrfastcmp_c;
1798 else
1799 ssup->comparator = bpcharfastcmp_c;
1800
1801 collate_c = true;
1802 }
1803 #ifdef WIN32
1804 else if (GetDatabaseEncoding() == PG_UTF8)
1805 return;
1806 #endif
1807 else
1808 {
1809 ssup->comparator = varstrfastcmp_locale;
1810
1811 /*
1812 * We need a collation-sensitive comparison. To make things faster,
1813 * we'll figure out the collation based on the locale id and cache the
1814 * result.
1815 */
1816 if (collid != DEFAULT_COLLATION_OID)
1817 {
1818 if (!OidIsValid(collid))
1819 {
1820 /*
1821 * This typically means that the parser could not resolve a
1822 * conflict of implicit collations, so report it that way.
1823 */
1824 ereport(ERROR,
1825 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1826 errmsg("could not determine which collation to use for string comparison"),
1827 errhint("Use the COLLATE clause to set the collation explicitly.")));
1828 }
1829 #ifdef HAVE_LOCALE_T
1830 locale = pg_newlocale_from_collation(collid);
1831 #endif
1832 }
1833 }
1834
1835 /*
1836 * Unfortunately, it seems that abbreviation for non-C collations is
1837 * broken on many common platforms; testing of multiple versions of glibc
1838 * reveals that, for many locales, strcoll() and strxfrm() do not return
1839 * consistent results, which is fatal to this optimization. While no
1840 * other libc other than Cygwin has so far been shown to have a problem,
1841 * we take the conservative course of action for right now and disable
1842 * this categorically. (Users who are certain this isn't a problem on
1843 * their system can define TRUST_STRXFRM.)
1844 *
1845 * Even apart from the risk of broken locales, it's possible that there
1846 * are platforms where the use of abbreviated keys should be disabled at
1847 * compile time. Having only 4 byte datums could make worst-case
1848 * performance drastically more likely, for example. Moreover, Darwin's
1849 * strxfrm() implementations is known to not effectively concentrate a
1850 * significant amount of entropy from the original string in earlier
1851 * transformed blobs. It's possible that other supported platforms are
1852 * similarly encumbered. So, if we ever get past disabling this
1853 * categorically, we may still want or need to disable it for particular
1854 * platforms.
1855 */
1856 #ifndef TRUST_STRXFRM
1857 if (!collate_c)
1858 abbreviate = false;
1859 #endif
1860
1861 /*
1862 * If we're using abbreviated keys, or if we're using a locale-aware
1863 * comparison, we need to initialize a StringSortSupport object. Both
1864 * cases will make use of the temporary buffers we initialize here for
1865 * scratch space (and to detect requirement for BpChar semantics from
1866 * caller), and the abbreviation case requires additional state.
1867 */
1868 if (abbreviate || !collate_c)
1869 {
1870 sss = palloc(sizeof(VarStringSortSupport));
1871 sss->buf1 = palloc(TEXTBUFLEN);
1872 sss->buflen1 = TEXTBUFLEN;
1873 sss->buf2 = palloc(TEXTBUFLEN);
1874 sss->buflen2 = TEXTBUFLEN;
1875 /* Start with invalid values */
1876 sss->last_len1 = -1;
1877 sss->last_len2 = -1;
1878 /* Initialize */
1879 sss->last_returned = 0;
1880 #ifdef HAVE_LOCALE_T
1881 sss->locale = locale;
1882 #endif
1883
1884 /*
1885 * To avoid somehow confusing a strxfrm() blob and an original string,
1886 * constantly keep track of the variety of data that buf1 and buf2
1887 * currently contain.
1888 *
1889 * Comparisons may be interleaved with conversion calls. Frequently,
1890 * conversions and comparisons are batched into two distinct phases,
1891 * but the correctness of caching cannot hinge upon this. For
1892 * comparison caching, buffer state is only trusted if cache_blob is
1893 * found set to false, whereas strxfrm() caching only trusts the state
1894 * when cache_blob is found set to true.
1895 *
1896 * Arbitrarily initialize cache_blob to true.
1897 */
1898 sss->cache_blob = true;
1899 sss->collate_c = collate_c;
1900 sss->bpchar = bpchar;
1901 ssup->ssup_extra = sss;
1902
1903 /*
1904 * If possible, plan to use the abbreviated keys optimization. The
1905 * core code may switch back to authoritative comparator should
1906 * abbreviation be aborted.
1907 */
1908 if (abbreviate)
1909 {
1910 sss->prop_card = 0.20;
1911 initHyperLogLog(&sss->abbr_card, 10);
1912 initHyperLogLog(&sss->full_card, 10);
1913 ssup->abbrev_full_comparator = ssup->comparator;
1914 ssup->comparator = varstrcmp_abbrev;
1915 ssup->abbrev_converter = varstr_abbrev_convert;
1916 ssup->abbrev_abort = varstr_abbrev_abort;
1917 }
1918 }
1919 }
1920
1921 /*
1922 * sortsupport comparison func (for C locale case)
1923 */
1924 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)1925 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1926 {
1927 VarString *arg1 = DatumGetVarStringPP(x);
1928 VarString *arg2 = DatumGetVarStringPP(y);
1929 char *a1p,
1930 *a2p;
1931 int len1,
1932 len2,
1933 result;
1934
1935 a1p = VARDATA_ANY(arg1);
1936 a2p = VARDATA_ANY(arg2);
1937
1938 len1 = VARSIZE_ANY_EXHDR(arg1);
1939 len2 = VARSIZE_ANY_EXHDR(arg2);
1940
1941 result = memcmp(a1p, a2p, Min(len1, len2));
1942 if ((result == 0) && (len1 != len2))
1943 result = (len1 < len2) ? -1 : 1;
1944
1945 /* We can't afford to leak memory here. */
1946 if (PointerGetDatum(arg1) != x)
1947 pfree(arg1);
1948 if (PointerGetDatum(arg2) != y)
1949 pfree(arg2);
1950
1951 return result;
1952 }
1953
1954 /*
1955 * sortsupport comparison func (for BpChar C locale case)
1956 *
1957 * BpChar outsources its sortsupport to this module. Specialization for the
1958 * varstr_sortsupport BpChar case, modeled on
1959 * internal_bpchar_pattern_compare().
1960 */
1961 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)1962 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
1963 {
1964 BpChar *arg1 = DatumGetBpCharPP(x);
1965 BpChar *arg2 = DatumGetBpCharPP(y);
1966 char *a1p,
1967 *a2p;
1968 int len1,
1969 len2,
1970 result;
1971
1972 a1p = VARDATA_ANY(arg1);
1973 a2p = VARDATA_ANY(arg2);
1974
1975 len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
1976 len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
1977
1978 result = memcmp(a1p, a2p, Min(len1, len2));
1979 if ((result == 0) && (len1 != len2))
1980 result = (len1 < len2) ? -1 : 1;
1981
1982 /* We can't afford to leak memory here. */
1983 if (PointerGetDatum(arg1) != x)
1984 pfree(arg1);
1985 if (PointerGetDatum(arg2) != y)
1986 pfree(arg2);
1987
1988 return result;
1989 }
1990
1991 /*
1992 * sortsupport comparison func (for locale case)
1993 */
1994 static int
varstrfastcmp_locale(Datum x,Datum y,SortSupport ssup)1995 varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
1996 {
1997 VarString *arg1 = DatumGetVarStringPP(x);
1998 VarString *arg2 = DatumGetVarStringPP(y);
1999 bool arg1_match;
2000 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2001
2002 /* working state */
2003 char *a1p,
2004 *a2p;
2005 int len1,
2006 len2,
2007 result;
2008
2009 a1p = VARDATA_ANY(arg1);
2010 a2p = VARDATA_ANY(arg2);
2011
2012 len1 = VARSIZE_ANY_EXHDR(arg1);
2013 len2 = VARSIZE_ANY_EXHDR(arg2);
2014
2015 /* Fast pre-check for equality, as discussed in varstr_cmp() */
2016 if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2017 {
2018 /*
2019 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2020 * last_len2. Existing contents of buffers might still be used by
2021 * next call.
2022 *
2023 * It's fine to allow the comparison of BpChar padding bytes here,
2024 * even though that implies that the memcmp() will usually be
2025 * performed for BpChar callers (though multibyte characters could
2026 * still prevent that from occurring). The memcmp() is still very
2027 * cheap, and BpChar's funny semantics have us remove trailing spaces
2028 * (not limited to padding), so we need make no distinction between
2029 * padding space characters and "real" space characters.
2030 */
2031 result = 0;
2032 goto done;
2033 }
2034
2035 if (sss->bpchar)
2036 {
2037 /* Get true number of bytes, ignoring trailing spaces */
2038 len1 = bpchartruelen(a1p, len1);
2039 len2 = bpchartruelen(a2p, len2);
2040 }
2041
2042 if (len1 >= sss->buflen1)
2043 {
2044 pfree(sss->buf1);
2045 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2046 sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2047 }
2048 if (len2 >= sss->buflen2)
2049 {
2050 pfree(sss->buf2);
2051 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2052 sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2053 }
2054
2055 /*
2056 * We're likely to be asked to compare the same strings repeatedly, and
2057 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2058 * comparisons, even though in general there is no reason to think that
2059 * that will work out (every string datum may be unique). Caching does
2060 * not slow things down measurably when it doesn't work out, and can speed
2061 * things up by rather a lot when it does. In part, this is because the
2062 * memcmp() compares data from cachelines that are needed in L1 cache even
2063 * when the last comparison's result cannot be reused.
2064 */
2065 arg1_match = true;
2066 if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2067 {
2068 arg1_match = false;
2069 memcpy(sss->buf1, a1p, len1);
2070 sss->buf1[len1] = '\0';
2071 sss->last_len1 = len1;
2072 }
2073
2074 /*
2075 * If we're comparing the same two strings as last time, we can return the
2076 * same answer without calling strcoll() again. This is more likely than
2077 * it seems (at least with moderate to low cardinality sets), because
2078 * quicksort compares the same pivot against many values.
2079 */
2080 if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2081 {
2082 memcpy(sss->buf2, a2p, len2);
2083 sss->buf2[len2] = '\0';
2084 sss->last_len2 = len2;
2085 }
2086 else if (arg1_match && !sss->cache_blob)
2087 {
2088 /* Use result cached following last actual strcoll() call */
2089 result = sss->last_returned;
2090 goto done;
2091 }
2092
2093 #ifdef HAVE_LOCALE_T
2094 if (sss->locale)
2095 result = strcoll_l(sss->buf1, sss->buf2, sss->locale);
2096 else
2097 #endif
2098 result = strcoll(sss->buf1, sss->buf2);
2099
2100 /*
2101 * In some locales strcoll() can claim that nonidentical strings are
2102 * equal. Believing that would be bad news for a number of reasons, so we
2103 * follow Perl's lead and sort "equal" strings according to strcmp().
2104 */
2105 if (result == 0)
2106 result = strcmp(sss->buf1, sss->buf2);
2107
2108 /* Cache result, perhaps saving an expensive strcoll() call next time */
2109 sss->cache_blob = false;
2110 sss->last_returned = result;
2111 done:
2112 /* We can't afford to leak memory here. */
2113 if (PointerGetDatum(arg1) != x)
2114 pfree(arg1);
2115 if (PointerGetDatum(arg2) != y)
2116 pfree(arg2);
2117
2118 return result;
2119 }
2120
2121 /*
2122 * Abbreviated key comparison func
2123 */
2124 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2125 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2126 {
2127 /*
2128 * When 0 is returned, the core system will call varstrfastcmp_c()
2129 * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale(). Even a
2130 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2131 * authoritatively, for the same reason that there is a strcoll()
2132 * tie-breaker call to strcmp() in varstr_cmp().
2133 */
2134 if (x > y)
2135 return 1;
2136 else if (x == y)
2137 return 0;
2138 else
2139 return -1;
2140 }
2141
2142 /*
2143 * Conversion routine for sortsupport. Converts original to abbreviated key
2144 * representation. Our encoding strategy is simple -- pack the first 8 bytes
2145 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2146 * stored in reverse order), and treat it as an unsigned integer. When the "C"
2147 * locale is used, or in case of bytea, just memcpy() from original instead.
2148 */
2149 static Datum
varstr_abbrev_convert(Datum original,SortSupport ssup)2150 varstr_abbrev_convert(Datum original, SortSupport ssup)
2151 {
2152 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2153 VarString *authoritative = DatumGetVarStringPP(original);
2154 char *authoritative_data = VARDATA_ANY(authoritative);
2155
2156 /* working state */
2157 Datum res;
2158 char *pres;
2159 int len;
2160 uint32 hash;
2161
2162 pres = (char *) &res;
2163 /* memset(), so any non-overwritten bytes are NUL */
2164 memset(pres, 0, sizeof(Datum));
2165 len = VARSIZE_ANY_EXHDR(authoritative);
2166
2167 /* Get number of bytes, ignoring trailing spaces */
2168 if (sss->bpchar)
2169 len = bpchartruelen(authoritative_data, len);
2170
2171 /*
2172 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2173 * abbreviate keys. The full comparator for the C locale is always
2174 * memcmp(). It would be incorrect to allow bytea callers (callers that
2175 * always force the C collation -- bytea isn't a collatable type, but this
2176 * approach is convenient) to use strxfrm(). This is because bytea
2177 * strings may contain NUL bytes. Besides, this should be faster, too.
2178 *
2179 * More generally, it's okay that bytea callers can have NUL bytes in
2180 * strings because varstrcmp_abbrev() need not make a distinction between
2181 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2182 * authoritative representation. Hopefully a comparison at or past one
2183 * abbreviated key's terminating NUL byte will resolve the comparison
2184 * without consulting the authoritative representation; specifically, some
2185 * later non-NUL byte in the longer string can resolve the comparison
2186 * against a subsequent terminating NUL in the shorter string. There will
2187 * usually be what is effectively a "length-wise" resolution there and
2188 * then.
2189 *
2190 * If that doesn't work out -- if all bytes in the longer string
2191 * positioned at or past the offset of the smaller string's (first)
2192 * terminating NUL are actually representative of NUL bytes in the
2193 * authoritative binary string (perhaps with some *terminating* NUL bytes
2194 * towards the end of the longer string iff it happens to still be small)
2195 * -- then an authoritative tie-breaker will happen, and do the right
2196 * thing: explicitly consider string length.
2197 */
2198 if (sss->collate_c)
2199 memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2200 else
2201 {
2202 Size bsize;
2203
2204 /*
2205 * We're not using the C collation, so fall back on strxfrm.
2206 */
2207
2208 /* By convention, we use buffer 1 to store and NUL-terminate */
2209 if (len >= sss->buflen1)
2210 {
2211 pfree(sss->buf1);
2212 sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2213 sss->buf1 = palloc(sss->buflen1);
2214 }
2215
2216 /* Might be able to reuse strxfrm() blob from last call */
2217 if (sss->last_len1 == len && sss->cache_blob &&
2218 memcmp(sss->buf1, authoritative_data, len) == 0)
2219 {
2220 memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2221 /* No change affecting cardinality, so no hashing required */
2222 goto done;
2223 }
2224
2225 /* Just like strcoll(), strxfrm() expects a NUL-terminated string */
2226 memcpy(sss->buf1, authoritative_data, len);
2227 sss->buf1[len] = '\0';
2228 sss->last_len1 = len;
2229
2230 for (;;)
2231 {
2232 #ifdef HAVE_LOCALE_T
2233 if (sss->locale)
2234 bsize = strxfrm_l(sss->buf2, sss->buf1,
2235 sss->buflen2, sss->locale);
2236 else
2237 #endif
2238 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2239
2240 sss->last_len2 = bsize;
2241 if (bsize < sss->buflen2)
2242 break;
2243
2244 /*
2245 * The C standard states that the contents of the buffer is now
2246 * unspecified. Grow buffer, and retry.
2247 */
2248 pfree(sss->buf2);
2249 sss->buflen2 = Max(bsize + 1,
2250 Min(sss->buflen2 * 2, MaxAllocSize));
2251 sss->buf2 = palloc(sss->buflen2);
2252 }
2253
2254 /*
2255 * Every Datum byte is always compared. This is safe because the
2256 * strxfrm() blob is itself NUL terminated, leaving no danger of
2257 * misinterpreting any NUL bytes not intended to be interpreted as
2258 * logically representing termination.
2259 *
2260 * (Actually, even if there were NUL bytes in the blob it would be
2261 * okay. See remarks on bytea case above.)
2262 */
2263 memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2264 }
2265
2266 /*
2267 * Maintain approximate cardinality of both abbreviated keys and original,
2268 * authoritative keys using HyperLogLog. Used as cheap insurance against
2269 * the worst case, where we do many string transformations for no saving
2270 * in full strcoll()-based comparisons. These statistics are used by
2271 * varstr_abbrev_abort().
2272 *
2273 * First, Hash key proper, or a significant fraction of it. Mix in length
2274 * in order to compensate for cases where differences are past
2275 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2276 */
2277 hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2278 Min(len, PG_CACHE_LINE_SIZE)));
2279
2280 if (len > PG_CACHE_LINE_SIZE)
2281 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2282
2283 addHyperLogLog(&sss->full_card, hash);
2284
2285 /* Hash abbreviated key */
2286 #if SIZEOF_DATUM == 8
2287 {
2288 uint32 lohalf,
2289 hihalf;
2290
2291 lohalf = (uint32) res;
2292 hihalf = (uint32) (res >> 32);
2293 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2294 }
2295 #else /* SIZEOF_DATUM != 8 */
2296 hash = DatumGetUInt32(hash_uint32((uint32) res));
2297 #endif
2298
2299 addHyperLogLog(&sss->abbr_card, hash);
2300
2301 /* Cache result, perhaps saving an expensive strxfrm() call next time */
2302 sss->cache_blob = true;
2303 done:
2304
2305 /*
2306 * Byteswap on little-endian machines.
2307 *
2308 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2309 * comparator) works correctly on all platforms. If we didn't do this,
2310 * the comparator would have to call memcmp() with a pair of pointers to
2311 * the first byte of each abbreviated key, which is slower.
2312 */
2313 res = DatumBigEndianToNative(res);
2314
2315 /* Don't leak memory here */
2316 if (PointerGetDatum(authoritative) != original)
2317 pfree(authoritative);
2318
2319 return res;
2320 }
2321
2322 /*
2323 * Callback for estimating effectiveness of abbreviated key optimization, using
2324 * heuristic rules. Returns value indicating if the abbreviation optimization
2325 * should be aborted, based on its projected effectiveness.
2326 */
2327 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2328 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2329 {
2330 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2331 double abbrev_distinct,
2332 key_distinct;
2333
2334 Assert(ssup->abbreviate);
2335
2336 /* Have a little patience */
2337 if (memtupcount < 100)
2338 return false;
2339
2340 abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2341 key_distinct = estimateHyperLogLog(&sss->full_card);
2342
2343 /*
2344 * Clamp cardinality estimates to at least one distinct value. While
2345 * NULLs are generally disregarded, if only NULL values were seen so far,
2346 * that might misrepresent costs if we failed to clamp.
2347 */
2348 if (abbrev_distinct <= 1.0)
2349 abbrev_distinct = 1.0;
2350
2351 if (key_distinct <= 1.0)
2352 key_distinct = 1.0;
2353
2354 /*
2355 * In the worst case all abbreviated keys are identical, while at the same
2356 * time there are differences within full key strings not captured in
2357 * abbreviations.
2358 */
2359 #ifdef TRACE_SORT
2360 if (trace_sort)
2361 {
2362 double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2363
2364 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2365 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2366 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2367 sss->prop_card);
2368 }
2369 #endif
2370
2371 /*
2372 * If the number of distinct abbreviated keys approximately matches the
2373 * number of distinct authoritative original keys, that's reason enough to
2374 * proceed. We can win even with a very low cardinality set if most
2375 * tie-breakers only memcmp(). This is by far the most important
2376 * consideration.
2377 *
2378 * While comparisons that are resolved at the abbreviated key level are
2379 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2380 * those two outcomes are so much cheaper than a full strcoll() once
2381 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2382 * cardinality against the overall size of the set in order to more
2383 * accurately model costs. Assume that an abbreviated comparison, and an
2384 * abbreviated comparison with a cheap memcmp()-based authoritative
2385 * resolution are equivalent.
2386 */
2387 if (abbrev_distinct > key_distinct * sss->prop_card)
2388 {
2389 /*
2390 * When we have exceeded 10,000 tuples, decay required cardinality
2391 * aggressively for next call.
2392 *
2393 * This is useful because the number of comparisons required on
2394 * average increases at a linearithmic rate, and at roughly 10,000
2395 * tuples that factor will start to dominate over the linear costs of
2396 * string transformation (this is a conservative estimate). The decay
2397 * rate is chosen to be a little less aggressive than halving -- which
2398 * (since we're called at points at which memtupcount has doubled)
2399 * would never see the cost model actually abort past the first call
2400 * following a decay. This decay rate is mostly a precaution against
2401 * a sudden, violent swing in how well abbreviated cardinality tracks
2402 * full key cardinality. The decay also serves to prevent a marginal
2403 * case from being aborted too late, when too much has already been
2404 * invested in string transformation.
2405 *
2406 * It's possible for sets of several million distinct strings with
2407 * mere tens of thousands of distinct abbreviated keys to still
2408 * benefit very significantly. This will generally occur provided
2409 * each abbreviated key is a proxy for a roughly uniform number of the
2410 * set's full keys. If it isn't so, we hope to catch that early and
2411 * abort. If it isn't caught early, by the time the problem is
2412 * apparent it's probably not worth aborting.
2413 */
2414 if (memtupcount > 10000)
2415 sss->prop_card *= 0.65;
2416
2417 return false;
2418 }
2419
2420 /*
2421 * Abort abbreviation strategy.
2422 *
2423 * The worst case, where all abbreviated keys are identical while all
2424 * original strings differ will typically only see a regression of about
2425 * 10% in execution time for small to medium sized lists of strings.
2426 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2427 * often expect very large improvements, particularly with sets of strings
2428 * of moderately high to high abbreviated cardinality. There is little to
2429 * lose but much to gain, which our strategy reflects.
2430 */
2431 #ifdef TRACE_SORT
2432 if (trace_sort)
2433 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2434 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2435 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2436 #endif
2437
2438 return true;
2439 }
2440
2441 Datum
text_larger(PG_FUNCTION_ARGS)2442 text_larger(PG_FUNCTION_ARGS)
2443 {
2444 text *arg1 = PG_GETARG_TEXT_PP(0);
2445 text *arg2 = PG_GETARG_TEXT_PP(1);
2446 text *result;
2447
2448 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2449
2450 PG_RETURN_TEXT_P(result);
2451 }
2452
2453 Datum
text_smaller(PG_FUNCTION_ARGS)2454 text_smaller(PG_FUNCTION_ARGS)
2455 {
2456 text *arg1 = PG_GETARG_TEXT_PP(0);
2457 text *arg2 = PG_GETARG_TEXT_PP(1);
2458 text *result;
2459
2460 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2461
2462 PG_RETURN_TEXT_P(result);
2463 }
2464
2465
2466 /*
2467 * The following operators support character-by-character comparison
2468 * of text datums, to allow building indexes suitable for LIKE clauses.
2469 * Note that the regular texteq/textne comparison operators, and regular
2470 * support functions 1 and 2 with "C" collation are assumed to be
2471 * compatible with these!
2472 */
2473
2474 static int
internal_text_pattern_compare(text * arg1,text * arg2)2475 internal_text_pattern_compare(text *arg1, text *arg2)
2476 {
2477 int result;
2478 int len1,
2479 len2;
2480
2481 len1 = VARSIZE_ANY_EXHDR(arg1);
2482 len2 = VARSIZE_ANY_EXHDR(arg2);
2483
2484 result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2485 if (result != 0)
2486 return result;
2487 else if (len1 < len2)
2488 return -1;
2489 else if (len1 > len2)
2490 return 1;
2491 else
2492 return 0;
2493 }
2494
2495
2496 Datum
text_pattern_lt(PG_FUNCTION_ARGS)2497 text_pattern_lt(PG_FUNCTION_ARGS)
2498 {
2499 text *arg1 = PG_GETARG_TEXT_PP(0);
2500 text *arg2 = PG_GETARG_TEXT_PP(1);
2501 int result;
2502
2503 result = internal_text_pattern_compare(arg1, arg2);
2504
2505 PG_FREE_IF_COPY(arg1, 0);
2506 PG_FREE_IF_COPY(arg2, 1);
2507
2508 PG_RETURN_BOOL(result < 0);
2509 }
2510
2511
2512 Datum
text_pattern_le(PG_FUNCTION_ARGS)2513 text_pattern_le(PG_FUNCTION_ARGS)
2514 {
2515 text *arg1 = PG_GETARG_TEXT_PP(0);
2516 text *arg2 = PG_GETARG_TEXT_PP(1);
2517 int result;
2518
2519 result = internal_text_pattern_compare(arg1, arg2);
2520
2521 PG_FREE_IF_COPY(arg1, 0);
2522 PG_FREE_IF_COPY(arg2, 1);
2523
2524 PG_RETURN_BOOL(result <= 0);
2525 }
2526
2527
2528 Datum
text_pattern_ge(PG_FUNCTION_ARGS)2529 text_pattern_ge(PG_FUNCTION_ARGS)
2530 {
2531 text *arg1 = PG_GETARG_TEXT_PP(0);
2532 text *arg2 = PG_GETARG_TEXT_PP(1);
2533 int result;
2534
2535 result = internal_text_pattern_compare(arg1, arg2);
2536
2537 PG_FREE_IF_COPY(arg1, 0);
2538 PG_FREE_IF_COPY(arg2, 1);
2539
2540 PG_RETURN_BOOL(result >= 0);
2541 }
2542
2543
2544 Datum
text_pattern_gt(PG_FUNCTION_ARGS)2545 text_pattern_gt(PG_FUNCTION_ARGS)
2546 {
2547 text *arg1 = PG_GETARG_TEXT_PP(0);
2548 text *arg2 = PG_GETARG_TEXT_PP(1);
2549 int result;
2550
2551 result = internal_text_pattern_compare(arg1, arg2);
2552
2553 PG_FREE_IF_COPY(arg1, 0);
2554 PG_FREE_IF_COPY(arg2, 1);
2555
2556 PG_RETURN_BOOL(result > 0);
2557 }
2558
2559
2560 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)2561 bttext_pattern_cmp(PG_FUNCTION_ARGS)
2562 {
2563 text *arg1 = PG_GETARG_TEXT_PP(0);
2564 text *arg2 = PG_GETARG_TEXT_PP(1);
2565 int result;
2566
2567 result = internal_text_pattern_compare(arg1, arg2);
2568
2569 PG_FREE_IF_COPY(arg1, 0);
2570 PG_FREE_IF_COPY(arg2, 1);
2571
2572 PG_RETURN_INT32(result);
2573 }
2574
2575
2576 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)2577 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2578 {
2579 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2580 MemoryContext oldcontext;
2581
2582 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2583
2584 /* Use generic string SortSupport, forcing "C" collation */
2585 varstr_sortsupport(ssup, C_COLLATION_OID, false);
2586
2587 MemoryContextSwitchTo(oldcontext);
2588
2589 PG_RETURN_VOID();
2590 }
2591
2592
2593 /*-------------------------------------------------------------
2594 * byteaoctetlen
2595 *
2596 * get the number of bytes contained in an instance of type 'bytea'
2597 *-------------------------------------------------------------
2598 */
2599 Datum
byteaoctetlen(PG_FUNCTION_ARGS)2600 byteaoctetlen(PG_FUNCTION_ARGS)
2601 {
2602 Datum str = PG_GETARG_DATUM(0);
2603
2604 /* We need not detoast the input at all */
2605 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2606 }
2607
2608 /*
2609 * byteacat -
2610 * takes two bytea* and returns a bytea* that is the concatenation of
2611 * the two.
2612 *
2613 * Cloned from textcat and modified as required.
2614 */
2615 Datum
byteacat(PG_FUNCTION_ARGS)2616 byteacat(PG_FUNCTION_ARGS)
2617 {
2618 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2619 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2620
2621 PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2622 }
2623
2624 /*
2625 * bytea_catenate
2626 * Guts of byteacat(), broken out so it can be used by other functions
2627 *
2628 * Arguments can be in short-header form, but not compressed or out-of-line
2629 */
2630 static bytea *
bytea_catenate(bytea * t1,bytea * t2)2631 bytea_catenate(bytea *t1, bytea *t2)
2632 {
2633 bytea *result;
2634 int len1,
2635 len2,
2636 len;
2637 char *ptr;
2638
2639 len1 = VARSIZE_ANY_EXHDR(t1);
2640 len2 = VARSIZE_ANY_EXHDR(t2);
2641
2642 /* paranoia ... probably should throw error instead? */
2643 if (len1 < 0)
2644 len1 = 0;
2645 if (len2 < 0)
2646 len2 = 0;
2647
2648 len = len1 + len2 + VARHDRSZ;
2649 result = (bytea *) palloc(len);
2650
2651 /* Set size of result string... */
2652 SET_VARSIZE(result, len);
2653
2654 /* Fill data field of result string... */
2655 ptr = VARDATA(result);
2656 if (len1 > 0)
2657 memcpy(ptr, VARDATA_ANY(t1), len1);
2658 if (len2 > 0)
2659 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2660
2661 return result;
2662 }
2663
/*
 * Build a bytea value from a C string by passing it through byteain().
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2666
2667 /*
2668 * bytea_substr()
2669 * Return a substring starting at the specified position.
2670 * Cloned from text_substr and modified as required.
2671 *
2672 * Input:
2673 * - string
2674 * - starting position (is one-based)
2675 * - string length (optional)
2676 *
2677 * If the starting position is zero or less, then return from the start of the string
2678 * adjusting the length to be consistent with the "negative start" per SQL.
2679 * If the length is less than zero, an ERROR is thrown. If no third argument
2680 * (length) is provided, the length to the end of the string is assumed.
2681 */
2682 Datum
bytea_substr(PG_FUNCTION_ARGS)2683 bytea_substr(PG_FUNCTION_ARGS)
2684 {
2685 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2686 PG_GETARG_INT32(1),
2687 PG_GETARG_INT32(2),
2688 false));
2689 }
2690
2691 /*
2692 * bytea_substr_no_len -
2693 * Wrapper to avoid opr_sanity failure due to
2694 * one function accepting a different number of args.
2695 */
2696 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)2697 bytea_substr_no_len(PG_FUNCTION_ARGS)
2698 {
2699 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2700 PG_GETARG_INT32(1),
2701 -1,
2702 true));
2703 }
2704
2705 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)2706 bytea_substring(Datum str,
2707 int S,
2708 int L,
2709 bool length_not_specified)
2710 {
2711 int S1; /* adjusted start position */
2712 int L1; /* adjusted substring length */
2713
2714 S1 = Max(S, 1);
2715
2716 if (length_not_specified)
2717 {
2718 /*
2719 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2720 * end of the string if we pass it a negative value for length.
2721 */
2722 L1 = -1;
2723 }
2724 else
2725 {
2726 /* end position */
2727 int E = S + L;
2728
2729 /*
2730 * A negative value for L is the only way for the end position to be
2731 * before the start. SQL99 says to throw an error.
2732 */
2733 if (E < S)
2734 ereport(ERROR,
2735 (errcode(ERRCODE_SUBSTRING_ERROR),
2736 errmsg("negative substring length not allowed")));
2737
2738 /*
2739 * A zero or negative value for the end position can happen if the
2740 * start was negative or one. SQL99 says to return a zero-length
2741 * string.
2742 */
2743 if (E < 1)
2744 return PG_STR_GET_BYTEA("");
2745
2746 L1 = E - S1;
2747 }
2748
2749 /*
2750 * If the start position is past the end of the string, SQL99 says to
2751 * return a zero-length string -- DatumGetByteaPSlice() will do that for
2752 * us. Convert to zero-based starting position
2753 */
2754 return DatumGetByteaPSlice(str, S1 - 1, L1);
2755 }
2756
2757 /*
2758 * byteaoverlay
2759 * Replace specified substring of first string with second
2760 *
2761 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2762 * This code is a direct implementation of what the standard says.
2763 */
2764 Datum
byteaoverlay(PG_FUNCTION_ARGS)2765 byteaoverlay(PG_FUNCTION_ARGS)
2766 {
2767 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2768 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2769 int sp = PG_GETARG_INT32(2); /* substring start position */
2770 int sl = PG_GETARG_INT32(3); /* substring length */
2771
2772 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2773 }
2774
2775 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)2776 byteaoverlay_no_len(PG_FUNCTION_ARGS)
2777 {
2778 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2779 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2780 int sp = PG_GETARG_INT32(2); /* substring start position */
2781 int sl;
2782
2783 sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2784 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2785 }
2786
2787 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)2788 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2789 {
2790 bytea *result;
2791 bytea *s1;
2792 bytea *s2;
2793 int sp_pl_sl;
2794
2795 /*
2796 * Check for possible integer-overflow cases. For negative sp, throw a
2797 * "substring length" error because that's what should be expected
2798 * according to the spec's definition of OVERLAY().
2799 */
2800 if (sp <= 0)
2801 ereport(ERROR,
2802 (errcode(ERRCODE_SUBSTRING_ERROR),
2803 errmsg("negative substring length not allowed")));
2804 sp_pl_sl = sp + sl;
2805 if (sp_pl_sl <= sl)
2806 ereport(ERROR,
2807 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2808 errmsg("integer out of range")));
2809
2810 s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2811 s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2812 result = bytea_catenate(s1, t2);
2813 result = bytea_catenate(result, s2);
2814
2815 return result;
2816 }
2817
2818 /*
2819 * byteapos -
2820 * Return the position of the specified substring.
2821 * Implements the SQL POSITION() function.
2822 * Cloned from textpos and modified as required.
2823 */
2824 Datum
byteapos(PG_FUNCTION_ARGS)2825 byteapos(PG_FUNCTION_ARGS)
2826 {
2827 bytea *t1 = PG_GETARG_BYTEA_PP(0);
2828 bytea *t2 = PG_GETARG_BYTEA_PP(1);
2829 int pos;
2830 int px,
2831 p;
2832 int len1,
2833 len2;
2834 char *p1,
2835 *p2;
2836
2837 len1 = VARSIZE_ANY_EXHDR(t1);
2838 len2 = VARSIZE_ANY_EXHDR(t2);
2839
2840 if (len2 <= 0)
2841 PG_RETURN_INT32(1); /* result for empty pattern */
2842
2843 p1 = VARDATA_ANY(t1);
2844 p2 = VARDATA_ANY(t2);
2845
2846 pos = 0;
2847 px = (len1 - len2);
2848 for (p = 0; p <= px; p++)
2849 {
2850 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2851 {
2852 pos = p + 1;
2853 break;
2854 };
2855 p1++;
2856 };
2857
2858 PG_RETURN_INT32(pos);
2859 }
2860
2861 /*-------------------------------------------------------------
2862 * byteaGetByte
2863 *
2864 * this routine treats "bytea" as an array of bytes.
2865 * It returns the Nth byte (a number between 0 and 255).
2866 *-------------------------------------------------------------
2867 */
2868 Datum
byteaGetByte(PG_FUNCTION_ARGS)2869 byteaGetByte(PG_FUNCTION_ARGS)
2870 {
2871 bytea *v = PG_GETARG_BYTEA_PP(0);
2872 int32 n = PG_GETARG_INT32(1);
2873 int len;
2874 int byte;
2875
2876 len = VARSIZE_ANY_EXHDR(v);
2877
2878 if (n < 0 || n >= len)
2879 ereport(ERROR,
2880 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2881 errmsg("index %d out of valid range, 0..%d",
2882 n, len - 1)));
2883
2884 byte = ((unsigned char *) VARDATA_ANY(v))[n];
2885
2886 PG_RETURN_INT32(byte);
2887 }
2888
2889 /*-------------------------------------------------------------
2890 * byteaGetBit
2891 *
2892 * This routine treats a "bytea" type like an array of bits.
2893 * It returns the value of the Nth bit (0 or 1).
2894 *
2895 *-------------------------------------------------------------
2896 */
2897 Datum
byteaGetBit(PG_FUNCTION_ARGS)2898 byteaGetBit(PG_FUNCTION_ARGS)
2899 {
2900 bytea *v = PG_GETARG_BYTEA_PP(0);
2901 int32 n = PG_GETARG_INT32(1);
2902 int byteNo,
2903 bitNo;
2904 int len;
2905 int byte;
2906
2907 len = VARSIZE_ANY_EXHDR(v);
2908
2909 /* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
2910 if (n < 0 || n >= (int64) len * 8)
2911 ereport(ERROR,
2912 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2913 errmsg("index %d out of valid range, 0..%d",
2914 n, (int) Min((int64) len * 8 - 1, INT_MAX))));
2915
2916 byteNo = n / 8;
2917 bitNo = n % 8;
2918
2919 byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
2920
2921 if (byte & (1 << bitNo))
2922 PG_RETURN_INT32(1);
2923 else
2924 PG_RETURN_INT32(0);
2925 }
2926
2927 /*-------------------------------------------------------------
2928 * byteaSetByte
2929 *
2930 * Given an instance of type 'bytea' creates a new one with
2931 * the Nth byte set to the given value.
2932 *
2933 *-------------------------------------------------------------
2934 */
2935 Datum
byteaSetByte(PG_FUNCTION_ARGS)2936 byteaSetByte(PG_FUNCTION_ARGS)
2937 {
2938 bytea *v = PG_GETARG_BYTEA_P(0);
2939 int32 n = PG_GETARG_INT32(1);
2940 int32 newByte = PG_GETARG_INT32(2);
2941 int len;
2942 bytea *res;
2943
2944 len = VARSIZE(v) - VARHDRSZ;
2945
2946 if (n < 0 || n >= len)
2947 ereport(ERROR,
2948 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2949 errmsg("index %d out of valid range, 0..%d",
2950 n, len - 1)));
2951
2952 /*
2953 * Make a copy of the original varlena.
2954 */
2955 res = (bytea *) palloc(VARSIZE(v));
2956 memcpy((char *) res, (char *) v, VARSIZE(v));
2957
2958 /*
2959 * Now set the byte.
2960 */
2961 ((unsigned char *) VARDATA(res))[n] = newByte;
2962
2963 PG_RETURN_BYTEA_P(res);
2964 }
2965
2966 /*-------------------------------------------------------------
2967 * byteaSetBit
2968 *
2969 * Given an instance of type 'bytea' creates a new one with
2970 * the Nth bit set to the given value.
2971 *
2972 *-------------------------------------------------------------
2973 */
2974 Datum
byteaSetBit(PG_FUNCTION_ARGS)2975 byteaSetBit(PG_FUNCTION_ARGS)
2976 {
2977 bytea *v = PG_GETARG_BYTEA_P(0);
2978 int32 n = PG_GETARG_INT32(1);
2979 int32 newBit = PG_GETARG_INT32(2);
2980 bytea *res;
2981 int len;
2982 int oldByte,
2983 newByte;
2984 int byteNo,
2985 bitNo;
2986
2987 len = VARSIZE(v) - VARHDRSZ;
2988
2989 /* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
2990 if (n < 0 || n >= (int64) len * 8)
2991 ereport(ERROR,
2992 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2993 errmsg("index %d out of valid range, 0..%d",
2994 n, (int) Min((int64) len * 8 - 1, INT_MAX))));
2995
2996 byteNo = n / 8;
2997 bitNo = n % 8;
2998
2999 /*
3000 * sanity check!
3001 */
3002 if (newBit != 0 && newBit != 1)
3003 ereport(ERROR,
3004 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3005 errmsg("new bit must be 0 or 1")));
3006
3007 /*
3008 * Make a copy of the original varlena.
3009 */
3010 res = (bytea *) palloc(VARSIZE(v));
3011 memcpy((char *) res, (char *) v, VARSIZE(v));
3012
3013 /*
3014 * Update the byte.
3015 */
3016 oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3017
3018 if (newBit == 0)
3019 newByte = oldByte & (~(1 << bitNo));
3020 else
3021 newByte = oldByte | (1 << bitNo);
3022
3023 ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3024
3025 PG_RETURN_BYTEA_P(res);
3026 }
3027
3028
3029 /* text_name()
3030 * Converts a text type to a Name type.
3031 */
3032 Datum
text_name(PG_FUNCTION_ARGS)3033 text_name(PG_FUNCTION_ARGS)
3034 {
3035 text *s = PG_GETARG_TEXT_PP(0);
3036 Name result;
3037 int len;
3038
3039 len = VARSIZE_ANY_EXHDR(s);
3040
3041 /* Truncate oversize input */
3042 if (len >= NAMEDATALEN)
3043 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3044
3045 /* We use palloc0 here to ensure result is zero-padded */
3046 result = (Name) palloc0(NAMEDATALEN);
3047 memcpy(NameStr(*result), VARDATA_ANY(s), len);
3048
3049 PG_RETURN_NAME(result);
3050 }
3051
3052 /* name_text()
3053 * Converts a Name type to a text type.
3054 */
3055 Datum
name_text(PG_FUNCTION_ARGS)3056 name_text(PG_FUNCTION_ARGS)
3057 {
3058 Name s = PG_GETARG_NAME(0);
3059
3060 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3061 }
3062
3063
3064 /*
3065 * textToQualifiedNameList - convert a text object to list of names
3066 *
3067 * This implements the input parsing needed by nextval() and other
3068 * functions that take a text parameter representing a qualified name.
3069 * We split the name at dots, downcase if not double-quoted, and
3070 * truncate names if they're too long.
3071 */
3072 List *
textToQualifiedNameList(text * textval)3073 textToQualifiedNameList(text *textval)
3074 {
3075 char *rawname;
3076 List *result = NIL;
3077 List *namelist;
3078 ListCell *l;
3079
3080 /* Convert to C string (handles possible detoasting). */
3081 /* Note we rely on being able to modify rawname below. */
3082 rawname = text_to_cstring(textval);
3083
3084 if (!SplitIdentifierString(rawname, '.', &namelist))
3085 ereport(ERROR,
3086 (errcode(ERRCODE_INVALID_NAME),
3087 errmsg("invalid name syntax")));
3088
3089 if (namelist == NIL)
3090 ereport(ERROR,
3091 (errcode(ERRCODE_INVALID_NAME),
3092 errmsg("invalid name syntax")));
3093
3094 foreach(l, namelist)
3095 {
3096 char *curname = (char *) lfirst(l);
3097
3098 result = lappend(result, makeString(pstrdup(curname)));
3099 }
3100
3101 pfree(rawname);
3102 list_free(namelist);
3103
3104 return result;
3105 }
3106
3107 /*
3108 * SplitIdentifierString --- parse a string containing identifiers
3109 *
3110 * This is the guts of textToQualifiedNameList, and is exported for use in
3111 * other situations such as parsing GUC variables. In the GUC case, it's
3112 * important to avoid memory leaks, so the API is designed to minimize the
3113 * amount of stuff that needs to be allocated and freed.
3114 *
3115 * Inputs:
3116 * rawstring: the input string; must be overwritable! On return, it's
3117 * been modified to contain the separated identifiers.
3118 * separator: the separator punctuation expected between identifiers
3119 * (typically '.' or ','). Whitespace may also appear around
3120 * identifiers.
3121 * Outputs:
3122 * namelist: filled with a palloc'd list of pointers to identifiers within
3123 * rawstring. Caller should list_free() this even on error return.
3124 *
3125 * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3126 *
3127 * Note that an empty string is considered okay here, though not in
3128 * textToQualifiedNameList.
3129 */
3130 bool
SplitIdentifierString(char * rawstring,char separator,List ** namelist)3131 SplitIdentifierString(char *rawstring, char separator,
3132 List **namelist)
3133 {
3134 char *nextp = rawstring;
3135 bool done = false;
3136
3137 *namelist = NIL;
3138
3139 while (scanner_isspace(*nextp))
3140 nextp++; /* skip leading whitespace */
3141
3142 if (*nextp == '\0')
3143 return true; /* allow empty string */
3144
3145 /* At the top of the loop, we are at start of a new identifier. */
3146 do
3147 {
3148 char *curname;
3149 char *endp;
3150
3151 if (*nextp == '"')
3152 {
3153 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3154 curname = nextp + 1;
3155 for (;;)
3156 {
3157 endp = strchr(nextp + 1, '"');
3158 if (endp == NULL)
3159 return false; /* mismatched quotes */
3160 if (endp[1] != '"')
3161 break; /* found end of quoted name */
3162 /* Collapse adjacent quotes into one quote, and look again */
3163 memmove(endp, endp + 1, strlen(endp));
3164 nextp = endp;
3165 }
3166 /* endp now points at the terminating quote */
3167 nextp = endp + 1;
3168 }
3169 else
3170 {
3171 /* Unquoted name --- extends to separator or whitespace */
3172 char *downname;
3173 int len;
3174
3175 curname = nextp;
3176 while (*nextp && *nextp != separator &&
3177 !scanner_isspace(*nextp))
3178 nextp++;
3179 endp = nextp;
3180 if (curname == nextp)
3181 return false; /* empty unquoted name not allowed */
3182
3183 /*
3184 * Downcase the identifier, using same code as main lexer does.
3185 *
3186 * XXX because we want to overwrite the input in-place, we cannot
3187 * support a downcasing transformation that increases the string
3188 * length. This is not a problem given the current implementation
3189 * of downcase_truncate_identifier, but we'll probably have to do
3190 * something about this someday.
3191 */
3192 len = endp - curname;
3193 downname = downcase_truncate_identifier(curname, len, false);
3194 Assert(strlen(downname) <= len);
3195 strncpy(curname, downname, len); /* strncpy is required here */
3196 pfree(downname);
3197 }
3198
3199 while (scanner_isspace(*nextp))
3200 nextp++; /* skip trailing whitespace */
3201
3202 if (*nextp == separator)
3203 {
3204 nextp++;
3205 while (scanner_isspace(*nextp))
3206 nextp++; /* skip leading whitespace for next */
3207 /* we expect another name, so done remains false */
3208 }
3209 else if (*nextp == '\0')
3210 done = true;
3211 else
3212 return false; /* invalid syntax */
3213
3214 /* Now safe to overwrite separator with a null */
3215 *endp = '\0';
3216
3217 /* Truncate name if it's overlength */
3218 truncate_identifier(curname, strlen(curname), false);
3219
3220 /*
3221 * Finished isolating current name --- add it to list
3222 */
3223 *namelist = lappend(*namelist, curname);
3224
3225 /* Loop back if we didn't reach end of string */
3226 } while (!done);
3227
3228 return true;
3229 }
3230
3231
3232 /*
3233 * SplitDirectoriesString --- parse a string containing directory names
3234 *
3235 * This is similar to SplitIdentifierString, except that the parsing
3236 * rules are meant to handle pathnames instead of identifiers: there is
3237 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3238 * and we apply canonicalize_path() to each extracted string. Because of the
3239 * last, the returned strings are separately palloc'd rather than being
3240 * pointers into rawstring --- but we still scribble on rawstring.
3241 *
3242 * Inputs:
3243 * rawstring: the input string; must be modifiable!
3244 * separator: the separator punctuation expected between directories
3245 * (typically ',' or ';'). Whitespace may also appear around
3246 * directories.
3247 * Outputs:
3248 * namelist: filled with a palloc'd list of directory names.
3249 * Caller should list_free_deep() this even on error return.
3250 *
3251 * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3252 *
3253 * Note that an empty string is considered okay here.
3254 */
3255 bool
SplitDirectoriesString(char * rawstring,char separator,List ** namelist)3256 SplitDirectoriesString(char *rawstring, char separator,
3257 List **namelist)
3258 {
3259 char *nextp = rawstring;
3260 bool done = false;
3261
3262 *namelist = NIL;
3263
3264 while (scanner_isspace(*nextp))
3265 nextp++; /* skip leading whitespace */
3266
3267 if (*nextp == '\0')
3268 return true; /* allow empty string */
3269
3270 /* At the top of the loop, we are at start of a new directory. */
3271 do
3272 {
3273 char *curname;
3274 char *endp;
3275
3276 if (*nextp == '"')
3277 {
3278 /* Quoted name --- collapse quote-quote pairs */
3279 curname = nextp + 1;
3280 for (;;)
3281 {
3282 endp = strchr(nextp + 1, '"');
3283 if (endp == NULL)
3284 return false; /* mismatched quotes */
3285 if (endp[1] != '"')
3286 break; /* found end of quoted name */
3287 /* Collapse adjacent quotes into one quote, and look again */
3288 memmove(endp, endp + 1, strlen(endp));
3289 nextp = endp;
3290 }
3291 /* endp now points at the terminating quote */
3292 nextp = endp + 1;
3293 }
3294 else
3295 {
3296 /* Unquoted name --- extends to separator or end of string */
3297 curname = endp = nextp;
3298 while (*nextp && *nextp != separator)
3299 {
3300 /* trailing whitespace should not be included in name */
3301 if (!scanner_isspace(*nextp))
3302 endp = nextp + 1;
3303 nextp++;
3304 }
3305 if (curname == endp)
3306 return false; /* empty unquoted name not allowed */
3307 }
3308
3309 while (scanner_isspace(*nextp))
3310 nextp++; /* skip trailing whitespace */
3311
3312 if (*nextp == separator)
3313 {
3314 nextp++;
3315 while (scanner_isspace(*nextp))
3316 nextp++; /* skip leading whitespace for next */
3317 /* we expect another name, so done remains false */
3318 }
3319 else if (*nextp == '\0')
3320 done = true;
3321 else
3322 return false; /* invalid syntax */
3323
3324 /* Now safe to overwrite separator with a null */
3325 *endp = '\0';
3326
3327 /* Truncate path if it's overlength */
3328 if (strlen(curname) >= MAXPGPATH)
3329 curname[MAXPGPATH - 1] = '\0';
3330
3331 /*
3332 * Finished isolating current name --- add it to list
3333 */
3334 curname = pstrdup(curname);
3335 canonicalize_path(curname);
3336 *namelist = lappend(*namelist, curname);
3337
3338 /* Loop back if we didn't reach end of string */
3339 } while (!done);
3340
3341 return true;
3342 }
3343
3344
3345 /*
3346 * SplitGUCList --- parse a string containing identifiers or file names
3347 *
3348 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3349 * presuming whether the elements will be taken as identifiers or file names.
3350 * We assume the input has already been through flatten_set_variable_args(),
3351 * so that we need never downcase (if appropriate, that was done already).
3352 * Nor do we ever truncate, since we don't know the correct max length.
3353 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3354 * because any embedded whitespace should have led to double-quoting).
3355 * Otherwise the API is identical to SplitIdentifierString.
3356 *
3357 * XXX it's annoying to have so many copies of this string-splitting logic.
3358 * However, it's not clear that having one function with a bunch of option
3359 * flags would be much better.
3360 *
3361 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3362 * Be sure to update that if you have to change this.
3363 *
3364 * Inputs:
3365 * rawstring: the input string; must be overwritable! On return, it's
3366 * been modified to contain the separated identifiers.
3367 * separator: the separator punctuation expected between identifiers
3368 * (typically '.' or ','). Whitespace may also appear around
3369 * identifiers.
3370 * Outputs:
3371 * namelist: filled with a palloc'd list of pointers to identifiers within
3372 * rawstring. Caller should list_free() this even on error return.
3373 *
3374 * Returns true if okay, false if there is a syntax error in the string.
3375 */
3376 bool
SplitGUCList(char * rawstring,char separator,List ** namelist)3377 SplitGUCList(char *rawstring, char separator,
3378 List **namelist)
3379 {
3380 char *nextp = rawstring;
3381 bool done = false;
3382
3383 *namelist = NIL;
3384
3385 while (scanner_isspace(*nextp))
3386 nextp++; /* skip leading whitespace */
3387
3388 if (*nextp == '\0')
3389 return true; /* allow empty string */
3390
3391 /* At the top of the loop, we are at start of a new identifier. */
3392 do
3393 {
3394 char *curname;
3395 char *endp;
3396
3397 if (*nextp == '"')
3398 {
3399 /* Quoted name --- collapse quote-quote pairs */
3400 curname = nextp + 1;
3401 for (;;)
3402 {
3403 endp = strchr(nextp + 1, '"');
3404 if (endp == NULL)
3405 return false; /* mismatched quotes */
3406 if (endp[1] != '"')
3407 break; /* found end of quoted name */
3408 /* Collapse adjacent quotes into one quote, and look again */
3409 memmove(endp, endp + 1, strlen(endp));
3410 nextp = endp;
3411 }
3412 /* endp now points at the terminating quote */
3413 nextp = endp + 1;
3414 }
3415 else
3416 {
3417 /* Unquoted name --- extends to separator or whitespace */
3418 curname = nextp;
3419 while (*nextp && *nextp != separator &&
3420 !scanner_isspace(*nextp))
3421 nextp++;
3422 endp = nextp;
3423 if (curname == nextp)
3424 return false; /* empty unquoted name not allowed */
3425 }
3426
3427 while (scanner_isspace(*nextp))
3428 nextp++; /* skip trailing whitespace */
3429
3430 if (*nextp == separator)
3431 {
3432 nextp++;
3433 while (scanner_isspace(*nextp))
3434 nextp++; /* skip leading whitespace for next */
3435 /* we expect another name, so done remains false */
3436 }
3437 else if (*nextp == '\0')
3438 done = true;
3439 else
3440 return false; /* invalid syntax */
3441
3442 /* Now safe to overwrite separator with a null */
3443 *endp = '\0';
3444
3445 /*
3446 * Finished isolating current name --- add it to list
3447 */
3448 *namelist = lappend(*namelist, curname);
3449
3450 /* Loop back if we didn't reach end of string */
3451 } while (!done);
3452
3453 return true;
3454 }
3455
3456
3457 /*****************************************************************************
3458 * Comparison Functions used for bytea
3459 *
3460 * Note: btree indexes need these routines not to leak memory; therefore,
3461 * be careful to free working copies of toasted datums. Most places don't
3462 * need to be so careful.
3463 *****************************************************************************/
3464
3465 Datum
byteaeq(PG_FUNCTION_ARGS)3466 byteaeq(PG_FUNCTION_ARGS)
3467 {
3468 Datum arg1 = PG_GETARG_DATUM(0);
3469 Datum arg2 = PG_GETARG_DATUM(1);
3470 bool result;
3471 Size len1,
3472 len2;
3473
3474 /*
3475 * We can use a fast path for unequal lengths, which might save us from
3476 * having to detoast one or both values.
3477 */
3478 len1 = toast_raw_datum_size(arg1);
3479 len2 = toast_raw_datum_size(arg2);
3480 if (len1 != len2)
3481 result = false;
3482 else
3483 {
3484 bytea *barg1 = DatumGetByteaPP(arg1);
3485 bytea *barg2 = DatumGetByteaPP(arg2);
3486
3487 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3488 len1 - VARHDRSZ) == 0);
3489
3490 PG_FREE_IF_COPY(barg1, 0);
3491 PG_FREE_IF_COPY(barg2, 1);
3492 }
3493
3494 PG_RETURN_BOOL(result);
3495 }
3496
3497 Datum
byteane(PG_FUNCTION_ARGS)3498 byteane(PG_FUNCTION_ARGS)
3499 {
3500 Datum arg1 = PG_GETARG_DATUM(0);
3501 Datum arg2 = PG_GETARG_DATUM(1);
3502 bool result;
3503 Size len1,
3504 len2;
3505
3506 /*
3507 * We can use a fast path for unequal lengths, which might save us from
3508 * having to detoast one or both values.
3509 */
3510 len1 = toast_raw_datum_size(arg1);
3511 len2 = toast_raw_datum_size(arg2);
3512 if (len1 != len2)
3513 result = true;
3514 else
3515 {
3516 bytea *barg1 = DatumGetByteaPP(arg1);
3517 bytea *barg2 = DatumGetByteaPP(arg2);
3518
3519 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3520 len1 - VARHDRSZ) != 0);
3521
3522 PG_FREE_IF_COPY(barg1, 0);
3523 PG_FREE_IF_COPY(barg2, 1);
3524 }
3525
3526 PG_RETURN_BOOL(result);
3527 }
3528
3529 Datum
bytealt(PG_FUNCTION_ARGS)3530 bytealt(PG_FUNCTION_ARGS)
3531 {
3532 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3533 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3534 int len1,
3535 len2;
3536 int cmp;
3537
3538 len1 = VARSIZE_ANY_EXHDR(arg1);
3539 len2 = VARSIZE_ANY_EXHDR(arg2);
3540
3541 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3542
3543 PG_FREE_IF_COPY(arg1, 0);
3544 PG_FREE_IF_COPY(arg2, 1);
3545
3546 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3547 }
3548
3549 Datum
byteale(PG_FUNCTION_ARGS)3550 byteale(PG_FUNCTION_ARGS)
3551 {
3552 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3553 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3554 int len1,
3555 len2;
3556 int cmp;
3557
3558 len1 = VARSIZE_ANY_EXHDR(arg1);
3559 len2 = VARSIZE_ANY_EXHDR(arg2);
3560
3561 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3562
3563 PG_FREE_IF_COPY(arg1, 0);
3564 PG_FREE_IF_COPY(arg2, 1);
3565
3566 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3567 }
3568
3569 Datum
byteagt(PG_FUNCTION_ARGS)3570 byteagt(PG_FUNCTION_ARGS)
3571 {
3572 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3573 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3574 int len1,
3575 len2;
3576 int cmp;
3577
3578 len1 = VARSIZE_ANY_EXHDR(arg1);
3579 len2 = VARSIZE_ANY_EXHDR(arg2);
3580
3581 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3582
3583 PG_FREE_IF_COPY(arg1, 0);
3584 PG_FREE_IF_COPY(arg2, 1);
3585
3586 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3587 }
3588
3589 Datum
byteage(PG_FUNCTION_ARGS)3590 byteage(PG_FUNCTION_ARGS)
3591 {
3592 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3593 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3594 int len1,
3595 len2;
3596 int cmp;
3597
3598 len1 = VARSIZE_ANY_EXHDR(arg1);
3599 len2 = VARSIZE_ANY_EXHDR(arg2);
3600
3601 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3602
3603 PG_FREE_IF_COPY(arg1, 0);
3604 PG_FREE_IF_COPY(arg2, 1);
3605
3606 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3607 }
3608
3609 Datum
byteacmp(PG_FUNCTION_ARGS)3610 byteacmp(PG_FUNCTION_ARGS)
3611 {
3612 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3613 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3614 int len1,
3615 len2;
3616 int cmp;
3617
3618 len1 = VARSIZE_ANY_EXHDR(arg1);
3619 len2 = VARSIZE_ANY_EXHDR(arg2);
3620
3621 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3622 if ((cmp == 0) && (len1 != len2))
3623 cmp = (len1 < len2) ? -1 : 1;
3624
3625 PG_FREE_IF_COPY(arg1, 0);
3626 PG_FREE_IF_COPY(arg2, 1);
3627
3628 PG_RETURN_INT32(cmp);
3629 }
3630
3631 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)3632 bytea_sortsupport(PG_FUNCTION_ARGS)
3633 {
3634 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3635 MemoryContext oldcontext;
3636
3637 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3638
3639 /* Use generic string SortSupport, forcing "C" collation */
3640 varstr_sortsupport(ssup, C_COLLATION_OID, false);
3641
3642 MemoryContextSwitchTo(oldcontext);
3643
3644 PG_RETURN_VOID();
3645 }
3646
3647 /*
3648 * appendStringInfoText
3649 *
3650 * Append a text to str.
3651 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3652 */
3653 static void
appendStringInfoText(StringInfo str,const text * t)3654 appendStringInfoText(StringInfo str, const text *t)
3655 {
3656 appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3657 }
3658
3659 /*
3660 * replace_text
3661 * replace all occurrences of 'old_sub_str' in 'orig_str'
3662 * with 'new_sub_str' to form 'new_str'
3663 *
3664 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3665 * otherwise returns 'new_str'
3666 */
3667 Datum
replace_text(PG_FUNCTION_ARGS)3668 replace_text(PG_FUNCTION_ARGS)
3669 {
3670 text *src_text = PG_GETARG_TEXT_PP(0);
3671 text *from_sub_text = PG_GETARG_TEXT_PP(1);
3672 text *to_sub_text = PG_GETARG_TEXT_PP(2);
3673 int src_text_len;
3674 int from_sub_text_len;
3675 TextPositionState state;
3676 text *ret_text;
3677 int start_posn;
3678 int curr_posn;
3679 int chunk_len;
3680 char *start_ptr;
3681 StringInfoData str;
3682
3683 text_position_setup(src_text, from_sub_text, &state);
3684
3685 /*
3686 * Note: we check the converted string length, not the original, because
3687 * they could be different if the input contained invalid encoding.
3688 */
3689 src_text_len = state.len1;
3690 from_sub_text_len = state.len2;
3691
3692 /* Return unmodified source string if empty source or pattern */
3693 if (src_text_len < 1 || from_sub_text_len < 1)
3694 {
3695 text_position_cleanup(&state);
3696 PG_RETURN_TEXT_P(src_text);
3697 }
3698
3699 start_posn = 1;
3700 curr_posn = text_position_next(1, &state);
3701
3702 /* When the from_sub_text is not found, there is nothing to do. */
3703 if (curr_posn == 0)
3704 {
3705 text_position_cleanup(&state);
3706 PG_RETURN_TEXT_P(src_text);
3707 }
3708
3709 /* start_ptr points to the start_posn'th character of src_text */
3710 start_ptr = VARDATA_ANY(src_text);
3711
3712 initStringInfo(&str);
3713
3714 do
3715 {
3716 CHECK_FOR_INTERRUPTS();
3717
3718 /* copy the data skipped over by last text_position_next() */
3719 chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3720 appendBinaryStringInfo(&str, start_ptr, chunk_len);
3721
3722 appendStringInfoText(&str, to_sub_text);
3723
3724 start_posn = curr_posn;
3725 start_ptr += chunk_len;
3726 start_posn += from_sub_text_len;
3727 start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3728
3729 curr_posn = text_position_next(start_posn, &state);
3730 }
3731 while (curr_posn > 0);
3732
3733 /* copy trailing data */
3734 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3735 appendBinaryStringInfo(&str, start_ptr, chunk_len);
3736
3737 text_position_cleanup(&state);
3738
3739 ret_text = cstring_to_text_with_len(str.data, str.len);
3740 pfree(str.data);
3741
3742 PG_RETURN_TEXT_P(ret_text);
3743 }
3744
3745 /*
3746 * check_replace_text_has_escape_char
3747 *
3748 * check whether replace_text contains escape char.
3749 */
3750 static bool
check_replace_text_has_escape_char(const text * replace_text)3751 check_replace_text_has_escape_char(const text *replace_text)
3752 {
3753 const char *p = VARDATA_ANY(replace_text);
3754 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3755
3756 if (pg_database_encoding_max_length() == 1)
3757 {
3758 for (; p < p_end; p++)
3759 {
3760 if (*p == '\\')
3761 return true;
3762 }
3763 }
3764 else
3765 {
3766 for (; p < p_end; p += pg_mblen(p))
3767 {
3768 if (*p == '\\')
3769 return true;
3770 }
3771 }
3772
3773 return false;
3774 }
3775
3776 /*
3777 * appendStringInfoRegexpSubstr
3778 *
3779 * Append replace_text to str, substituting regexp back references for
3780 * \n escapes. start_ptr is the start of the match in the source string,
3781 * at logical character position data_pos.
3782 */
3783 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)3784 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
3785 regmatch_t *pmatch,
3786 char *start_ptr, int data_pos)
3787 {
3788 const char *p = VARDATA_ANY(replace_text);
3789 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3790 int eml = pg_database_encoding_max_length();
3791
3792 for (;;)
3793 {
3794 const char *chunk_start = p;
3795 int so;
3796 int eo;
3797
3798 /* Find next escape char. */
3799 if (eml == 1)
3800 {
3801 for (; p < p_end && *p != '\\'; p++)
3802 /* nothing */ ;
3803 }
3804 else
3805 {
3806 for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3807 /* nothing */ ;
3808 }
3809
3810 /* Copy the text we just scanned over, if any. */
3811 if (p > chunk_start)
3812 appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3813
3814 /* Done if at end of string, else advance over escape char. */
3815 if (p >= p_end)
3816 break;
3817 p++;
3818
3819 if (p >= p_end)
3820 {
3821 /* Escape at very end of input. Treat same as unexpected char */
3822 appendStringInfoChar(str, '\\');
3823 break;
3824 }
3825
3826 if (*p >= '1' && *p <= '9')
3827 {
3828 /* Use the back reference of regexp. */
3829 int idx = *p - '0';
3830
3831 so = pmatch[idx].rm_so;
3832 eo = pmatch[idx].rm_eo;
3833 p++;
3834 }
3835 else if (*p == '&')
3836 {
3837 /* Use the entire matched string. */
3838 so = pmatch[0].rm_so;
3839 eo = pmatch[0].rm_eo;
3840 p++;
3841 }
3842 else if (*p == '\\')
3843 {
3844 /* \\ means transfer one \ to output. */
3845 appendStringInfoChar(str, '\\');
3846 p++;
3847 continue;
3848 }
3849 else
3850 {
3851 /*
3852 * If escape char is not followed by any expected char, just treat
3853 * it as ordinary data to copy. (XXX would it be better to throw
3854 * an error?)
3855 */
3856 appendStringInfoChar(str, '\\');
3857 continue;
3858 }
3859
3860 if (so != -1 && eo != -1)
3861 {
3862 /*
3863 * Copy the text that is back reference of regexp. Note so and eo
3864 * are counted in characters not bytes.
3865 */
3866 char *chunk_start;
3867 int chunk_len;
3868
3869 Assert(so >= data_pos);
3870 chunk_start = start_ptr;
3871 chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3872 chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3873 appendBinaryStringInfo(str, chunk_start, chunk_len);
3874 }
3875 }
3876 }
3877
3878 #define REGEXP_REPLACE_BACKREF_CNT 10
3879
3880 /*
3881 * replace_text_regexp
3882 *
3883 * replace text that matches to regexp in src_text to replace_text.
3884 *
3885 * Note: to avoid having to include regex.h in builtins.h, we declare
3886 * the regexp argument as void *, but really it's regex_t *.
3887 */
3888 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)3889 replace_text_regexp(text *src_text, void *regexp,
3890 text *replace_text, bool glob)
3891 {
3892 text *ret_text;
3893 regex_t *re = (regex_t *) regexp;
3894 int src_text_len = VARSIZE_ANY_EXHDR(src_text);
3895 StringInfoData buf;
3896 regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
3897 pg_wchar *data;
3898 size_t data_len;
3899 int search_start;
3900 int data_pos;
3901 char *start_ptr;
3902 bool have_escape;
3903
3904 initStringInfo(&buf);
3905
3906 /* Convert data string to wide characters. */
3907 data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3908 data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3909
3910 /* Check whether replace_text has escape char. */
3911 have_escape = check_replace_text_has_escape_char(replace_text);
3912
3913 /* start_ptr points to the data_pos'th character of src_text */
3914 start_ptr = (char *) VARDATA_ANY(src_text);
3915 data_pos = 0;
3916
3917 search_start = 0;
3918 while (search_start <= data_len)
3919 {
3920 int regexec_result;
3921
3922 CHECK_FOR_INTERRUPTS();
3923
3924 regexec_result = pg_regexec(re,
3925 data,
3926 data_len,
3927 search_start,
3928 NULL, /* no details */
3929 REGEXP_REPLACE_BACKREF_CNT,
3930 pmatch,
3931 0);
3932
3933 if (regexec_result == REG_NOMATCH)
3934 break;
3935
3936 if (regexec_result != REG_OKAY)
3937 {
3938 char errMsg[100];
3939
3940 CHECK_FOR_INTERRUPTS();
3941 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3942 ereport(ERROR,
3943 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3944 errmsg("regular expression failed: %s", errMsg)));
3945 }
3946
3947 /*
3948 * Copy the text to the left of the match position. Note we are given
3949 * character not byte indexes.
3950 */
3951 if (pmatch[0].rm_so - data_pos > 0)
3952 {
3953 int chunk_len;
3954
3955 chunk_len = charlen_to_bytelen(start_ptr,
3956 pmatch[0].rm_so - data_pos);
3957 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3958
3959 /*
3960 * Advance start_ptr over that text, to avoid multiple rescans of
3961 * it if the replace_text contains multiple back-references.
3962 */
3963 start_ptr += chunk_len;
3964 data_pos = pmatch[0].rm_so;
3965 }
3966
3967 /*
3968 * Copy the replace_text. Process back references when the
3969 * replace_text has escape characters.
3970 */
3971 if (have_escape)
3972 appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3973 start_ptr, data_pos);
3974 else
3975 appendStringInfoText(&buf, replace_text);
3976
3977 /* Advance start_ptr and data_pos over the matched text. */
3978 start_ptr += charlen_to_bytelen(start_ptr,
3979 pmatch[0].rm_eo - data_pos);
3980 data_pos = pmatch[0].rm_eo;
3981
3982 /*
3983 * When global option is off, replace the first instance only.
3984 */
3985 if (!glob)
3986 break;
3987
3988 /*
3989 * Advance search position. Normally we start the next search at the
3990 * end of the previous match; but if the match was of zero length, we
3991 * have to advance by one character, or we'd just find the same match
3992 * again.
3993 */
3994 search_start = data_pos;
3995 if (pmatch[0].rm_so == pmatch[0].rm_eo)
3996 search_start++;
3997 }
3998
3999 /*
4000 * Copy the text to the right of the last match.
4001 */
4002 if (data_pos < data_len)
4003 {
4004 int chunk_len;
4005
4006 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4007 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4008 }
4009
4010 ret_text = cstring_to_text_with_len(buf.data, buf.len);
4011 pfree(buf.data);
4012 pfree(data);
4013
4014 return ret_text;
4015 }
4016
4017 /*
4018 * split_text
4019 * parse input string
4020 * return ord item (1 based)
4021 * based on provided field separator
4022 */
4023 Datum
split_text(PG_FUNCTION_ARGS)4024 split_text(PG_FUNCTION_ARGS)
4025 {
4026 text *inputstring = PG_GETARG_TEXT_PP(0);
4027 text *fldsep = PG_GETARG_TEXT_PP(1);
4028 int fldnum = PG_GETARG_INT32(2);
4029 int inputstring_len;
4030 int fldsep_len;
4031 TextPositionState state;
4032 int start_posn;
4033 int end_posn;
4034 text *result_text;
4035
4036 /* field number is 1 based */
4037 if (fldnum < 1)
4038 ereport(ERROR,
4039 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4040 errmsg("field position must be greater than zero")));
4041
4042 text_position_setup(inputstring, fldsep, &state);
4043
4044 /*
4045 * Note: we check the converted string length, not the original, because
4046 * they could be different if the input contained invalid encoding.
4047 */
4048 inputstring_len = state.len1;
4049 fldsep_len = state.len2;
4050
4051 /* return empty string for empty input string */
4052 if (inputstring_len < 1)
4053 {
4054 text_position_cleanup(&state);
4055 PG_RETURN_TEXT_P(cstring_to_text(""));
4056 }
4057
4058 /* empty field separator */
4059 if (fldsep_len < 1)
4060 {
4061 text_position_cleanup(&state);
4062 /* if first field, return input string, else empty string */
4063 if (fldnum == 1)
4064 PG_RETURN_TEXT_P(inputstring);
4065 else
4066 PG_RETURN_TEXT_P(cstring_to_text(""));
4067 }
4068
4069 /* identify bounds of first field */
4070 start_posn = 1;
4071 end_posn = text_position_next(1, &state);
4072
4073 /* special case if fldsep not found at all */
4074 if (end_posn == 0)
4075 {
4076 text_position_cleanup(&state);
4077 /* if field 1 requested, return input string, else empty string */
4078 if (fldnum == 1)
4079 PG_RETURN_TEXT_P(inputstring);
4080 else
4081 PG_RETURN_TEXT_P(cstring_to_text(""));
4082 }
4083
4084 while (end_posn > 0 && --fldnum > 0)
4085 {
4086 /* identify bounds of next field */
4087 start_posn = end_posn + fldsep_len;
4088 end_posn = text_position_next(start_posn, &state);
4089 }
4090
4091 text_position_cleanup(&state);
4092
4093 if (fldnum > 0)
4094 {
4095 /* N'th field separator not found */
4096 /* if last field requested, return it, else empty string */
4097 if (fldnum == 1)
4098 result_text = text_substring(PointerGetDatum(inputstring),
4099 start_posn,
4100 -1,
4101 true);
4102 else
4103 result_text = cstring_to_text("");
4104 }
4105 else
4106 {
4107 /* non-last field requested */
4108 result_text = text_substring(PointerGetDatum(inputstring),
4109 start_posn,
4110 end_posn - start_posn,
4111 false);
4112 }
4113
4114 PG_RETURN_TEXT_P(result_text);
4115 }
4116
4117 /*
4118 * Convenience function to return true when two text params are equal.
4119 */
4120 static bool
text_isequal(text * txt1,text * txt2)4121 text_isequal(text *txt1, text *txt2)
4122 {
4123 return DatumGetBool(DirectFunctionCall2(texteq,
4124 PointerGetDatum(txt1),
4125 PointerGetDatum(txt2)));
4126 }
4127
4128 /*
4129 * text_to_array
4130 * parse input string and return text array of elements,
4131 * based on provided field separator
4132 */
4133 Datum
text_to_array(PG_FUNCTION_ARGS)4134 text_to_array(PG_FUNCTION_ARGS)
4135 {
4136 return text_to_array_internal(fcinfo);
4137 }
4138
4139 /*
4140 * text_to_array_null
4141 * parse input string and return text array of elements,
4142 * based on provided field separator and null string
4143 *
4144 * This is a separate entry point only to prevent the regression tests from
4145 * complaining about different argument sets for the same internal function.
4146 */
4147 Datum
text_to_array_null(PG_FUNCTION_ARGS)4148 text_to_array_null(PG_FUNCTION_ARGS)
4149 {
4150 return text_to_array_internal(fcinfo);
4151 }
4152
4153 /*
4154 * common code for text_to_array and text_to_array_null functions
4155 *
4156 * These are not strict so we have to test for null inputs explicitly.
4157 */
4158 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4159 text_to_array_internal(PG_FUNCTION_ARGS)
4160 {
4161 text *inputstring;
4162 text *fldsep;
4163 text *null_string;
4164 int inputstring_len;
4165 int fldsep_len;
4166 char *start_ptr;
4167 text *result_text;
4168 bool is_null;
4169 ArrayBuildState *astate = NULL;
4170
4171 /* when input string is NULL, then result is NULL too */
4172 if (PG_ARGISNULL(0))
4173 PG_RETURN_NULL();
4174
4175 inputstring = PG_GETARG_TEXT_PP(0);
4176
4177 /* fldsep can be NULL */
4178 if (!PG_ARGISNULL(1))
4179 fldsep = PG_GETARG_TEXT_PP(1);
4180 else
4181 fldsep = NULL;
4182
4183 /* null_string can be NULL or omitted */
4184 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4185 null_string = PG_GETARG_TEXT_PP(2);
4186 else
4187 null_string = NULL;
4188
4189 if (fldsep != NULL)
4190 {
4191 /*
4192 * Normal case with non-null fldsep. Use the text_position machinery
4193 * to search for occurrences of fldsep.
4194 */
4195 TextPositionState state;
4196 int fldnum;
4197 int start_posn;
4198 int end_posn;
4199 int chunk_len;
4200
4201 text_position_setup(inputstring, fldsep, &state);
4202
4203 /*
4204 * Note: we check the converted string length, not the original,
4205 * because they could be different if the input contained invalid
4206 * encoding.
4207 */
4208 inputstring_len = state.len1;
4209 fldsep_len = state.len2;
4210
4211 /* return empty array for empty input string */
4212 if (inputstring_len < 1)
4213 {
4214 text_position_cleanup(&state);
4215 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4216 }
4217
4218 /*
4219 * empty field separator: return the input string as a one-element
4220 * array
4221 */
4222 if (fldsep_len < 1)
4223 {
4224 text_position_cleanup(&state);
4225 /* single element can be a NULL too */
4226 is_null = null_string ? text_isequal(inputstring, null_string) : false;
4227 PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID,
4228 PointerGetDatum(inputstring),
4229 is_null, 1));
4230 }
4231
4232 start_posn = 1;
4233 /* start_ptr points to the start_posn'th character of inputstring */
4234 start_ptr = VARDATA_ANY(inputstring);
4235
4236 for (fldnum = 1;; fldnum++) /* field number is 1 based */
4237 {
4238 CHECK_FOR_INTERRUPTS();
4239
4240 end_posn = text_position_next(start_posn, &state);
4241
4242 if (end_posn == 0)
4243 {
4244 /* fetch last field */
4245 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4246 }
4247 else
4248 {
4249 /* fetch non-last field */
4250 chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
4251 }
4252
4253 /* must build a temp text datum to pass to accumArrayResult */
4254 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4255 is_null = null_string ? text_isequal(result_text, null_string) : false;
4256
4257 /* stash away this field */
4258 astate = accumArrayResult(astate,
4259 PointerGetDatum(result_text),
4260 is_null,
4261 TEXTOID,
4262 CurrentMemoryContext);
4263
4264 pfree(result_text);
4265
4266 if (end_posn == 0)
4267 break;
4268
4269 start_posn = end_posn;
4270 start_ptr += chunk_len;
4271 start_posn += fldsep_len;
4272 start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
4273 }
4274
4275 text_position_cleanup(&state);
4276 }
4277 else
4278 {
4279 /*
4280 * When fldsep is NULL, each character in the inputstring becomes an
4281 * element in the result array. The separator is effectively the
4282 * space between characters.
4283 */
4284 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4285
4286 /* return empty array for empty input string */
4287 if (inputstring_len < 1)
4288 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4289
4290 start_ptr = VARDATA_ANY(inputstring);
4291
4292 while (inputstring_len > 0)
4293 {
4294 int chunk_len = pg_mblen(start_ptr);
4295
4296 CHECK_FOR_INTERRUPTS();
4297
4298 /* must build a temp text datum to pass to accumArrayResult */
4299 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4300 is_null = null_string ? text_isequal(result_text, null_string) : false;
4301
4302 /* stash away this field */
4303 astate = accumArrayResult(astate,
4304 PointerGetDatum(result_text),
4305 is_null,
4306 TEXTOID,
4307 CurrentMemoryContext);
4308
4309 pfree(result_text);
4310
4311 start_ptr += chunk_len;
4312 inputstring_len -= chunk_len;
4313 }
4314 }
4315
4316 PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4317 CurrentMemoryContext));
4318 }
4319
4320 /*
4321 * array_to_text
4322 * concatenate Cstring representation of input array elements
4323 * using provided field separator
4324 */
4325 Datum
array_to_text(PG_FUNCTION_ARGS)4326 array_to_text(PG_FUNCTION_ARGS)
4327 {
4328 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4329 char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4330
4331 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4332 }
4333
4334 /*
4335 * array_to_text_null
4336 * concatenate Cstring representation of input array elements
4337 * using provided field separator and null string
4338 *
4339 * This version is not strict so we have to test for null inputs explicitly.
4340 */
4341 Datum
array_to_text_null(PG_FUNCTION_ARGS)4342 array_to_text_null(PG_FUNCTION_ARGS)
4343 {
4344 ArrayType *v;
4345 char *fldsep;
4346 char *null_string;
4347
4348 /* returns NULL when first or second parameter is NULL */
4349 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4350 PG_RETURN_NULL();
4351
4352 v = PG_GETARG_ARRAYTYPE_P(0);
4353 fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4354
4355 /* NULL null string is passed through as a null pointer */
4356 if (!PG_ARGISNULL(2))
4357 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4358 else
4359 null_string = NULL;
4360
4361 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4362 }
4363
4364 /*
4365 * common code for array_to_text and array_to_text_null functions
4366 */
4367 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4368 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4369 const char *fldsep, const char *null_string)
4370 {
4371 text *result;
4372 int nitems,
4373 *dims,
4374 ndims;
4375 Oid element_type;
4376 int typlen;
4377 bool typbyval;
4378 char typalign;
4379 StringInfoData buf;
4380 bool printed = false;
4381 char *p;
4382 bits8 *bitmap;
4383 int bitmask;
4384 int i;
4385 ArrayMetaState *my_extra;
4386
4387 ndims = ARR_NDIM(v);
4388 dims = ARR_DIMS(v);
4389 nitems = ArrayGetNItems(ndims, dims);
4390
4391 /* if there are no elements, return an empty string */
4392 if (nitems == 0)
4393 return cstring_to_text_with_len("", 0);
4394
4395 element_type = ARR_ELEMTYPE(v);
4396 initStringInfo(&buf);
4397
4398 /*
4399 * We arrange to look up info about element type, including its output
4400 * conversion proc, only once per series of calls, assuming the element
4401 * type doesn't change underneath us.
4402 */
4403 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4404 if (my_extra == NULL)
4405 {
4406 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4407 sizeof(ArrayMetaState));
4408 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4409 my_extra->element_type = ~element_type;
4410 }
4411
4412 if (my_extra->element_type != element_type)
4413 {
4414 /*
4415 * Get info about element type, including its output conversion proc
4416 */
4417 get_type_io_data(element_type, IOFunc_output,
4418 &my_extra->typlen, &my_extra->typbyval,
4419 &my_extra->typalign, &my_extra->typdelim,
4420 &my_extra->typioparam, &my_extra->typiofunc);
4421 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4422 fcinfo->flinfo->fn_mcxt);
4423 my_extra->element_type = element_type;
4424 }
4425 typlen = my_extra->typlen;
4426 typbyval = my_extra->typbyval;
4427 typalign = my_extra->typalign;
4428
4429 p = ARR_DATA_PTR(v);
4430 bitmap = ARR_NULLBITMAP(v);
4431 bitmask = 1;
4432
4433 for (i = 0; i < nitems; i++)
4434 {
4435 Datum itemvalue;
4436 char *value;
4437
4438 /* Get source element, checking for NULL */
4439 if (bitmap && (*bitmap & bitmask) == 0)
4440 {
4441 /* if null_string is NULL, we just ignore null elements */
4442 if (null_string != NULL)
4443 {
4444 if (printed)
4445 appendStringInfo(&buf, "%s%s", fldsep, null_string);
4446 else
4447 appendStringInfoString(&buf, null_string);
4448 printed = true;
4449 }
4450 }
4451 else
4452 {
4453 itemvalue = fetch_att(p, typbyval, typlen);
4454
4455 value = OutputFunctionCall(&my_extra->proc, itemvalue);
4456
4457 if (printed)
4458 appendStringInfo(&buf, "%s%s", fldsep, value);
4459 else
4460 appendStringInfoString(&buf, value);
4461 printed = true;
4462
4463 p = att_addlength_pointer(p, typlen, p);
4464 p = (char *) att_align_nominal(p, typalign);
4465 }
4466
4467 /* advance bitmap pointer if any */
4468 if (bitmap)
4469 {
4470 bitmask <<= 1;
4471 if (bitmask == 0x100)
4472 {
4473 bitmap++;
4474 bitmask = 1;
4475 }
4476 }
4477 }
4478
4479 result = cstring_to_text_with_len(buf.data, buf.len);
4480 pfree(buf.data);
4481
4482 return result;
4483 }
4484
4485 #define HEXBASE 16
4486 /*
4487 * Convert an int32 to a string containing a base 16 (hex) representation of
4488 * the number.
4489 */
4490 Datum
to_hex32(PG_FUNCTION_ARGS)4491 to_hex32(PG_FUNCTION_ARGS)
4492 {
4493 uint32 value = (uint32) PG_GETARG_INT32(0);
4494 char *ptr;
4495 const char *digits = "0123456789abcdef";
4496 char buf[32]; /* bigger than needed, but reasonable */
4497
4498 ptr = buf + sizeof(buf) - 1;
4499 *ptr = '\0';
4500
4501 do
4502 {
4503 *--ptr = digits[value % HEXBASE];
4504 value /= HEXBASE;
4505 } while (ptr > buf && value);
4506
4507 PG_RETURN_TEXT_P(cstring_to_text(ptr));
4508 }
4509
4510 /*
4511 * Convert an int64 to a string containing a base 16 (hex) representation of
4512 * the number.
4513 */
4514 Datum
to_hex64(PG_FUNCTION_ARGS)4515 to_hex64(PG_FUNCTION_ARGS)
4516 {
4517 uint64 value = (uint64) PG_GETARG_INT64(0);
4518 char *ptr;
4519 const char *digits = "0123456789abcdef";
4520 char buf[32]; /* bigger than needed, but reasonable */
4521
4522 ptr = buf + sizeof(buf) - 1;
4523 *ptr = '\0';
4524
4525 do
4526 {
4527 *--ptr = digits[value % HEXBASE];
4528 value /= HEXBASE;
4529 } while (ptr > buf && value);
4530
4531 PG_RETURN_TEXT_P(cstring_to_text(ptr));
4532 }
4533
4534 /*
4535 * Create an md5 hash of a text string and return it as hex
4536 *
4537 * md5 produces a 16 byte (128 bit) hash; double it for hex
4538 */
4539 #define MD5_HASH_LEN 32
4540
4541 Datum
md5_text(PG_FUNCTION_ARGS)4542 md5_text(PG_FUNCTION_ARGS)
4543 {
4544 text *in_text = PG_GETARG_TEXT_PP(0);
4545 size_t len;
4546 char hexsum[MD5_HASH_LEN + 1];
4547
4548 /* Calculate the length of the buffer using varlena metadata */
4549 len = VARSIZE_ANY_EXHDR(in_text);
4550
4551 /* get the hash result */
4552 if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4553 ereport(ERROR,
4554 (errcode(ERRCODE_OUT_OF_MEMORY),
4555 errmsg("out of memory")));
4556
4557 /* convert to text and return it */
4558 PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4559 }
4560
4561 /*
4562 * Create an md5 hash of a bytea field and return it as a hex string:
4563 * 16-byte md5 digest is represented in 32 hex characters.
4564 */
4565 Datum
md5_bytea(PG_FUNCTION_ARGS)4566 md5_bytea(PG_FUNCTION_ARGS)
4567 {
4568 bytea *in = PG_GETARG_BYTEA_PP(0);
4569 size_t len;
4570 char hexsum[MD5_HASH_LEN + 1];
4571
4572 len = VARSIZE_ANY_EXHDR(in);
4573 if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4574 ereport(ERROR,
4575 (errcode(ERRCODE_OUT_OF_MEMORY),
4576 errmsg("out of memory")));
4577
4578 PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4579 }
4580
4581 /*
4582 * Return the size of a datum, possibly compressed
4583 *
4584 * Works on any data type
4585 */
4586 Datum
pg_column_size(PG_FUNCTION_ARGS)4587 pg_column_size(PG_FUNCTION_ARGS)
4588 {
4589 Datum value = PG_GETARG_DATUM(0);
4590 int32 result;
4591 int typlen;
4592
4593 /* On first call, get the input type's typlen, and save at *fn_extra */
4594 if (fcinfo->flinfo->fn_extra == NULL)
4595 {
4596 /* Lookup the datatype of the supplied argument */
4597 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4598
4599 typlen = get_typlen(argtypeid);
4600 if (typlen == 0) /* should not happen */
4601 elog(ERROR, "cache lookup failed for type %u", argtypeid);
4602
4603 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4604 sizeof(int));
4605 *((int *) fcinfo->flinfo->fn_extra) = typlen;
4606 }
4607 else
4608 typlen = *((int *) fcinfo->flinfo->fn_extra);
4609
4610 if (typlen == -1)
4611 {
4612 /* varlena type, possibly toasted */
4613 result = toast_datum_size(value);
4614 }
4615 else if (typlen == -2)
4616 {
4617 /* cstring */
4618 result = strlen(DatumGetCString(value)) + 1;
4619 }
4620 else
4621 {
4622 /* ordinary fixed-width type */
4623 result = typlen;
4624 }
4625
4626 PG_RETURN_INT32(result);
4627 }
4628
4629 /*
4630 * string_agg - Concatenates values and returns string.
4631 *
4632 * Syntax: string_agg(value text, delimiter text) RETURNS text
4633 *
4634 * Note: Any NULL values are ignored. The first-call delimiter isn't
4635 * actually used at all, and on subsequent calls the delimiter precedes
4636 * the associated value.
4637 */
4638
4639 /* subroutine to initialize state */
4640 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)4641 makeStringAggState(FunctionCallInfo fcinfo)
4642 {
4643 StringInfo state;
4644 MemoryContext aggcontext;
4645 MemoryContext oldcontext;
4646
4647 if (!AggCheckCallContext(fcinfo, &aggcontext))
4648 {
4649 /* cannot be called directly because of internal-type argument */
4650 elog(ERROR, "string_agg_transfn called in non-aggregate context");
4651 }
4652
4653 /*
4654 * Create state in aggregate context. It'll stay there across subsequent
4655 * calls.
4656 */
4657 oldcontext = MemoryContextSwitchTo(aggcontext);
4658 state = makeStringInfo();
4659 MemoryContextSwitchTo(oldcontext);
4660
4661 return state;
4662 }
4663
4664 Datum
string_agg_transfn(PG_FUNCTION_ARGS)4665 string_agg_transfn(PG_FUNCTION_ARGS)
4666 {
4667 StringInfo state;
4668
4669 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4670
4671 /* Append the value unless null. */
4672 if (!PG_ARGISNULL(1))
4673 {
4674 /* On the first time through, we ignore the delimiter. */
4675 if (state == NULL)
4676 state = makeStringAggState(fcinfo);
4677 else if (!PG_ARGISNULL(2))
4678 appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
4679
4680 appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
4681 }
4682
4683 /*
4684 * The transition type for string_agg() is declared to be "internal",
4685 * which is a pass-by-value type the same size as a pointer.
4686 */
4687 PG_RETURN_POINTER(state);
4688 }
4689
4690 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)4691 string_agg_finalfn(PG_FUNCTION_ARGS)
4692 {
4693 StringInfo state;
4694
4695 /* cannot be called directly because of internal-type argument */
4696 Assert(AggCheckCallContext(fcinfo, NULL));
4697
4698 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4699
4700 if (state != NULL)
4701 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
4702 else
4703 PG_RETURN_NULL();
4704 }
4705
4706 /*
4707 * Implementation of both concat() and concat_ws().
4708 *
4709 * sepstr is the separator string to place between values.
4710 * argidx identifies the first argument to concatenate (counting from zero).
4711 * Returns NULL if result should be NULL, else text value.
4712 */
4713 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)4714 concat_internal(const char *sepstr, int argidx,
4715 FunctionCallInfo fcinfo)
4716 {
4717 text *result;
4718 StringInfoData str;
4719 bool first_arg = true;
4720 int i;
4721
4722 /*
4723 * concat(VARIADIC some-array) is essentially equivalent to
4724 * array_to_text(), ie concat the array elements with the given separator.
4725 * So we just pass the case off to that code.
4726 */
4727 if (get_fn_expr_variadic(fcinfo->flinfo))
4728 {
4729 ArrayType *arr;
4730
4731 /* Should have just the one argument */
4732 Assert(argidx == PG_NARGS() - 1);
4733
4734 /* concat(VARIADIC NULL) is defined as NULL */
4735 if (PG_ARGISNULL(argidx))
4736 return NULL;
4737
4738 /*
4739 * Non-null argument had better be an array. We assume that any call
4740 * context that could let get_fn_expr_variadic return true will have
4741 * checked that a VARIADIC-labeled parameter actually is an array. So
4742 * it should be okay to just Assert that it's an array rather than
4743 * doing a full-fledged error check.
4744 */
4745 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
4746
4747 /* OK, safe to fetch the array value */
4748 arr = PG_GETARG_ARRAYTYPE_P(argidx);
4749
4750 /*
4751 * And serialize the array. We tell array_to_text to ignore null
4752 * elements, which matches the behavior of the loop below.
4753 */
4754 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4755 }
4756
4757 /* Normal case without explicit VARIADIC marker */
4758 initStringInfo(&str);
4759
4760 for (i = argidx; i < PG_NARGS(); i++)
4761 {
4762 if (!PG_ARGISNULL(i))
4763 {
4764 Datum value = PG_GETARG_DATUM(i);
4765 Oid valtype;
4766 Oid typOutput;
4767 bool typIsVarlena;
4768
4769 /* add separator if appropriate */
4770 if (first_arg)
4771 first_arg = false;
4772 else
4773 appendStringInfoString(&str, sepstr);
4774
4775 /* call the appropriate type output function, append the result */
4776 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4777 if (!OidIsValid(valtype))
4778 elog(ERROR, "could not determine data type of concat() input");
4779 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4780 appendStringInfoString(&str,
4781 OidOutputFunctionCall(typOutput, value));
4782 }
4783 }
4784
4785 result = cstring_to_text_with_len(str.data, str.len);
4786 pfree(str.data);
4787
4788 return result;
4789 }
4790
4791 /*
4792 * Concatenate all arguments. NULL arguments are ignored.
4793 */
4794 Datum
text_concat(PG_FUNCTION_ARGS)4795 text_concat(PG_FUNCTION_ARGS)
4796 {
4797 text *result;
4798
4799 result = concat_internal("", 0, fcinfo);
4800 if (result == NULL)
4801 PG_RETURN_NULL();
4802 PG_RETURN_TEXT_P(result);
4803 }
4804
4805 /*
4806 * Concatenate all but first argument value with separators. The first
4807 * parameter is used as the separator. NULL arguments are ignored.
4808 */
4809 Datum
text_concat_ws(PG_FUNCTION_ARGS)4810 text_concat_ws(PG_FUNCTION_ARGS)
4811 {
4812 char *sep;
4813 text *result;
4814
4815 /* return NULL when separator is NULL */
4816 if (PG_ARGISNULL(0))
4817 PG_RETURN_NULL();
4818 sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
4819
4820 result = concat_internal(sep, 1, fcinfo);
4821 if (result == NULL)
4822 PG_RETURN_NULL();
4823 PG_RETURN_TEXT_P(result);
4824 }
4825
4826 /*
4827 * Return first n characters in the string. When n is negative,
4828 * return all but last |n| characters.
4829 */
4830 Datum
text_left(PG_FUNCTION_ARGS)4831 text_left(PG_FUNCTION_ARGS)
4832 {
4833 text *str = PG_GETARG_TEXT_PP(0);
4834 const char *p = VARDATA_ANY(str);
4835 int len = VARSIZE_ANY_EXHDR(str);
4836 int n = PG_GETARG_INT32(1);
4837 int rlen;
4838
4839 if (n < 0)
4840 n = pg_mbstrlen_with_len(p, len) + n;
4841 rlen = pg_mbcharcliplen(p, len, n);
4842
4843 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
4844 }
4845
4846 /*
4847 * Return last n characters in the string. When n is negative,
4848 * return all but first |n| characters.
4849 */
4850 Datum
text_right(PG_FUNCTION_ARGS)4851 text_right(PG_FUNCTION_ARGS)
4852 {
4853 text *str = PG_GETARG_TEXT_PP(0);
4854 const char *p = VARDATA_ANY(str);
4855 int len = VARSIZE_ANY_EXHDR(str);
4856 int n = PG_GETARG_INT32(1);
4857 int off;
4858
4859 if (n < 0)
4860 n = -n;
4861 else
4862 n = pg_mbstrlen_with_len(p, len) - n;
4863 off = pg_mbcharcliplen(p, len, n);
4864
4865 PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4866 }
4867
4868 /*
4869 * Return reversed string
4870 */
4871 Datum
text_reverse(PG_FUNCTION_ARGS)4872 text_reverse(PG_FUNCTION_ARGS)
4873 {
4874 text *str = PG_GETARG_TEXT_PP(0);
4875 const char *p = VARDATA_ANY(str);
4876 int len = VARSIZE_ANY_EXHDR(str);
4877 const char *endp = p + len;
4878 text *result;
4879 char *dst;
4880
4881 result = palloc(len + VARHDRSZ);
4882 dst = (char *) VARDATA(result) + len;
4883 SET_VARSIZE(result, len + VARHDRSZ);
4884
4885 if (pg_database_encoding_max_length() > 1)
4886 {
4887 /* multibyte version */
4888 while (p < endp)
4889 {
4890 int sz;
4891
4892 sz = pg_mblen(p);
4893 dst -= sz;
4894 memcpy(dst, p, sz);
4895 p += sz;
4896 }
4897 }
4898 else
4899 {
4900 /* single byte version */
4901 while (p < endp)
4902 *(--dst) = *p++;
4903 }
4904
4905 PG_RETURN_TEXT_P(result);
4906 }
4907
4908
/*
 * Support macros for text_format()
 */

/* Bit set in a specifier's flags word when a '-' flag was parsed. */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Advance ptr by one position, and raise an "unterminated format() type
 * specifier" error if that brings it to or past end_ptr.  Code that gets
 * control back therefore always has at least one more character available
 * to examine.
 *
 * ptr is modified in place, so it must be an lvalue.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
4922
4923 /*
4924 * Returns a formatted string
4925 */
4926 Datum
text_format(PG_FUNCTION_ARGS)4927 text_format(PG_FUNCTION_ARGS)
4928 {
4929 text *fmt;
4930 StringInfoData str;
4931 const char *cp;
4932 const char *start_ptr;
4933 const char *end_ptr;
4934 text *result;
4935 int arg;
4936 bool funcvariadic;
4937 int nargs;
4938 Datum *elements = NULL;
4939 bool *nulls = NULL;
4940 Oid element_type = InvalidOid;
4941 Oid prev_type = InvalidOid;
4942 Oid prev_width_type = InvalidOid;
4943 FmgrInfo typoutputfinfo;
4944 FmgrInfo typoutputinfo_width;
4945
4946 /* When format string is null, immediately return null */
4947 if (PG_ARGISNULL(0))
4948 PG_RETURN_NULL();
4949
4950 /* If argument is marked VARIADIC, expand array into elements */
4951 if (get_fn_expr_variadic(fcinfo->flinfo))
4952 {
4953 ArrayType *arr;
4954 int16 elmlen;
4955 bool elmbyval;
4956 char elmalign;
4957 int nitems;
4958
4959 /* Should have just the one argument */
4960 Assert(PG_NARGS() == 2);
4961
4962 /* If argument is NULL, we treat it as zero-length array */
4963 if (PG_ARGISNULL(1))
4964 nitems = 0;
4965 else
4966 {
4967 /*
4968 * Non-null argument had better be an array. We assume that any
4969 * call context that could let get_fn_expr_variadic return true
4970 * will have checked that a VARIADIC-labeled parameter actually is
4971 * an array. So it should be okay to just Assert that it's an
4972 * array rather than doing a full-fledged error check.
4973 */
4974 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
4975
4976 /* OK, safe to fetch the array value */
4977 arr = PG_GETARG_ARRAYTYPE_P(1);
4978
4979 /* Get info about array element type */
4980 element_type = ARR_ELEMTYPE(arr);
4981 get_typlenbyvalalign(element_type,
4982 &elmlen, &elmbyval, &elmalign);
4983
4984 /* Extract all array elements */
4985 deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
4986 &elements, &nulls, &nitems);
4987 }
4988
4989 nargs = nitems + 1;
4990 funcvariadic = true;
4991 }
4992 else
4993 {
4994 /* Non-variadic case, we'll process the arguments individually */
4995 nargs = PG_NARGS();
4996 funcvariadic = false;
4997 }
4998
4999 /* Setup for main loop. */
5000 fmt = PG_GETARG_TEXT_PP(0);
5001 start_ptr = VARDATA_ANY(fmt);
5002 end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5003 initStringInfo(&str);
5004 arg = 1; /* next argument position to print */
5005
5006 /* Scan format string, looking for conversion specifiers. */
5007 for (cp = start_ptr; cp < end_ptr; cp++)
5008 {
5009 int argpos;
5010 int widthpos;
5011 int flags;
5012 int width;
5013 Datum value;
5014 bool isNull;
5015 Oid typid;
5016
5017 /*
5018 * If it's not the start of a conversion specifier, just copy it to
5019 * the output buffer.
5020 */
5021 if (*cp != '%')
5022 {
5023 appendStringInfoCharMacro(&str, *cp);
5024 continue;
5025 }
5026
5027 ADVANCE_PARSE_POINTER(cp, end_ptr);
5028
5029 /* Easy case: %% outputs a single % */
5030 if (*cp == '%')
5031 {
5032 appendStringInfoCharMacro(&str, *cp);
5033 continue;
5034 }
5035
5036 /* Parse the optional portions of the format specifier */
5037 cp = text_format_parse_format(cp, end_ptr,
5038 &argpos, &widthpos,
5039 &flags, &width);
5040
5041 /*
5042 * Next we should see the main conversion specifier. Whether or not
5043 * an argument position was present, it's known that at least one
5044 * character remains in the string at this point. Experience suggests
5045 * that it's worth checking that that character is one of the expected
5046 * ones before we try to fetch arguments, so as to produce the least
5047 * confusing response to a mis-formatted specifier.
5048 */
5049 if (strchr("sIL", *cp) == NULL)
5050 ereport(ERROR,
5051 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5052 errmsg("unrecognized format() type specifier \"%c\"",
5053 *cp),
5054 errhint("For a single \"%%\" use \"%%%%\".")));
5055
5056 /* If indirect width was specified, get its value */
5057 if (widthpos >= 0)
5058 {
5059 /* Collect the specified or next argument position */
5060 if (widthpos > 0)
5061 arg = widthpos;
5062 if (arg >= nargs)
5063 ereport(ERROR,
5064 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5065 errmsg("too few arguments for format()")));
5066
5067 /* Get the value and type of the selected argument */
5068 if (!funcvariadic)
5069 {
5070 value = PG_GETARG_DATUM(arg);
5071 isNull = PG_ARGISNULL(arg);
5072 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5073 }
5074 else
5075 {
5076 value = elements[arg - 1];
5077 isNull = nulls[arg - 1];
5078 typid = element_type;
5079 }
5080 if (!OidIsValid(typid))
5081 elog(ERROR, "could not determine data type of format() input");
5082
5083 arg++;
5084
5085 /* We can treat NULL width the same as zero */
5086 if (isNull)
5087 width = 0;
5088 else if (typid == INT4OID)
5089 width = DatumGetInt32(value);
5090 else if (typid == INT2OID)
5091 width = DatumGetInt16(value);
5092 else
5093 {
5094 /* For less-usual datatypes, convert to text then to int */
5095 char *str;
5096
5097 if (typid != prev_width_type)
5098 {
5099 Oid typoutputfunc;
5100 bool typIsVarlena;
5101
5102 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5103 fmgr_info(typoutputfunc, &typoutputinfo_width);
5104 prev_width_type = typid;
5105 }
5106
5107 str = OutputFunctionCall(&typoutputinfo_width, value);
5108
5109 /* pg_atoi will complain about bad data or overflow */
5110 width = pg_atoi(str, sizeof(int), '\0');
5111
5112 pfree(str);
5113 }
5114 }
5115
5116 /* Collect the specified or next argument position */
5117 if (argpos > 0)
5118 arg = argpos;
5119 if (arg >= nargs)
5120 ereport(ERROR,
5121 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5122 errmsg("too few arguments for format()")));
5123
5124 /* Get the value and type of the selected argument */
5125 if (!funcvariadic)
5126 {
5127 value = PG_GETARG_DATUM(arg);
5128 isNull = PG_ARGISNULL(arg);
5129 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5130 }
5131 else
5132 {
5133 value = elements[arg - 1];
5134 isNull = nulls[arg - 1];
5135 typid = element_type;
5136 }
5137 if (!OidIsValid(typid))
5138 elog(ERROR, "could not determine data type of format() input");
5139
5140 arg++;
5141
5142 /*
5143 * Get the appropriate typOutput function, reusing previous one if
5144 * same type as previous argument. That's particularly useful in the
5145 * variadic-array case, but often saves work even for ordinary calls.
5146 */
5147 if (typid != prev_type)
5148 {
5149 Oid typoutputfunc;
5150 bool typIsVarlena;
5151
5152 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5153 fmgr_info(typoutputfunc, &typoutputfinfo);
5154 prev_type = typid;
5155 }
5156
5157 /*
5158 * And now we can format the value.
5159 */
5160 switch (*cp)
5161 {
5162 case 's':
5163 case 'I':
5164 case 'L':
5165 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5166 value, isNull,
5167 flags, width);
5168 break;
5169 default:
5170 /* should not get here, because of previous check */
5171 ereport(ERROR,
5172 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5173 errmsg("unrecognized format() type specifier \"%c\"",
5174 *cp),
5175 errhint("For a single \"%%\" use \"%%%%\".")));
5176 break;
5177 }
5178 }
5179
5180 /* Don't need deconstruct_array results anymore. */
5181 if (elements != NULL)
5182 pfree(elements);
5183 if (nulls != NULL)
5184 pfree(nulls);
5185
5186 /* Generate results. */
5187 result = cstring_to_text_with_len(str.data, str.len);
5188 pfree(str.data);
5189
5190 PG_RETURN_TEXT_P(result);
5191 }
5192
5193 /*
5194 * Parse contiguous digits as a decimal number.
5195 *
5196 * Returns true if some digits could be parsed.
5197 * The value is returned into *value, and *ptr is advanced to the next
5198 * character to be parsed.
5199 *
5200 * Note parsing invariant: at least one character is known available before
5201 * string end (end_ptr) at entry, and this is still true at exit.
5202 */
5203 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5204 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5205 {
5206 bool found = false;
5207 const char *cp = *ptr;
5208 int val = 0;
5209
5210 while (*cp >= '0' && *cp <= '9')
5211 {
5212 int newval = val * 10 + (*cp - '0');
5213
5214 if (newval / 10 != val) /* overflow? */
5215 ereport(ERROR,
5216 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5217 errmsg("number is out of range")));
5218 val = newval;
5219 ADVANCE_PARSE_POINTER(cp, end_ptr);
5220 found = true;
5221 }
5222
5223 *ptr = cp;
5224 *value = val;
5225
5226 return found;
5227 }
5228
5229 /*
5230 * Parse a format specifier (generally following the SUS printf spec).
5231 *
5232 * We have already advanced over the initial '%', and we are looking for
5233 * [argpos][flags][width]type (but the type character is not consumed here).
5234 *
5235 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5236 * Output parameters:
5237 * argpos: argument position for value to be printed. -1 means unspecified.
5238 * widthpos: argument position for width. Zero means the argument position
5239 * was unspecified (ie, take the next arg) and -1 means no width
5240 * argument (width was omitted or specified as a constant).
5241 * flags: bitmask of flags.
5242 * width: directly-specified width value. Zero means the width was omitted
5243 * (note it's not necessary to distinguish this case from an explicit
5244 * zero width value).
5245 *
5246 * The function result is the next character position to be parsed, ie, the
5247 * location where the type character is/should be.
5248 *
5249 * Note parsing invariant: at least one character is known available before
5250 * string end (end_ptr) at entry, and this is still true at exit.
5251 */
5252 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5253 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5254 int *argpos, int *widthpos,
5255 int *flags, int *width)
5256 {
5257 const char *cp = start_ptr;
5258 int n;
5259
5260 /* set defaults for output parameters */
5261 *argpos = -1;
5262 *widthpos = -1;
5263 *flags = 0;
5264 *width = 0;
5265
5266 /* try to identify first number */
5267 if (text_format_parse_digits(&cp, end_ptr, &n))
5268 {
5269 if (*cp != '$')
5270 {
5271 /* Must be just a width and a type, so we're done */
5272 *width = n;
5273 return cp;
5274 }
5275 /* The number was argument position */
5276 *argpos = n;
5277 /* Explicit 0 for argument index is immediately refused */
5278 if (n == 0)
5279 ereport(ERROR,
5280 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5281 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5282 ADVANCE_PARSE_POINTER(cp, end_ptr);
5283 }
5284
5285 /* Handle flags (only minus is supported now) */
5286 while (*cp == '-')
5287 {
5288 *flags |= TEXT_FORMAT_FLAG_MINUS;
5289 ADVANCE_PARSE_POINTER(cp, end_ptr);
5290 }
5291
5292 if (*cp == '*')
5293 {
5294 /* Handle indirect width */
5295 ADVANCE_PARSE_POINTER(cp, end_ptr);
5296 if (text_format_parse_digits(&cp, end_ptr, &n))
5297 {
5298 /* number in this position must be closed by $ */
5299 if (*cp != '$')
5300 ereport(ERROR,
5301 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5302 errmsg("width argument position must be ended by \"$\"")));
5303 /* The number was width argument position */
5304 *widthpos = n;
5305 /* Explicit 0 for argument index is immediately refused */
5306 if (n == 0)
5307 ereport(ERROR,
5308 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5309 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5310 ADVANCE_PARSE_POINTER(cp, end_ptr);
5311 }
5312 else
5313 *widthpos = 0; /* width's argument position is unspecified */
5314 }
5315 else
5316 {
5317 /* Check for direct width specification */
5318 if (text_format_parse_digits(&cp, end_ptr, &n))
5319 *width = n;
5320 }
5321
5322 /* cp should now be pointing at type character */
5323 return cp;
5324 }
5325
5326 /*
5327 * Format a %s, %I, or %L conversion
5328 */
5329 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5330 text_format_string_conversion(StringInfo buf, char conversion,
5331 FmgrInfo *typOutputInfo,
5332 Datum value, bool isNull,
5333 int flags, int width)
5334 {
5335 char *str;
5336
5337 /* Handle NULL arguments before trying to stringify the value. */
5338 if (isNull)
5339 {
5340 if (conversion == 's')
5341 text_format_append_string(buf, "", flags, width);
5342 else if (conversion == 'L')
5343 text_format_append_string(buf, "NULL", flags, width);
5344 else if (conversion == 'I')
5345 ereport(ERROR,
5346 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5347 errmsg("null values cannot be formatted as an SQL identifier")));
5348 return;
5349 }
5350
5351 /* Stringify. */
5352 str = OutputFunctionCall(typOutputInfo, value);
5353
5354 /* Escape. */
5355 if (conversion == 'I')
5356 {
5357 /* quote_identifier may or may not allocate a new string. */
5358 text_format_append_string(buf, quote_identifier(str), flags, width);
5359 }
5360 else if (conversion == 'L')
5361 {
5362 char *qstr = quote_literal_cstr(str);
5363
5364 text_format_append_string(buf, qstr, flags, width);
5365 /* quote_literal_cstr() always allocates a new string */
5366 pfree(qstr);
5367 }
5368 else
5369 text_format_append_string(buf, str, flags, width);
5370
5371 /* Cleanup. */
5372 pfree(str);
5373 }
5374
5375 /*
5376 * Append str to buf, padding as directed by flags/width
5377 */
5378 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5379 text_format_append_string(StringInfo buf, const char *str,
5380 int flags, int width)
5381 {
5382 bool align_to_left = false;
5383 int len;
5384
5385 /* fast path for typical easy case */
5386 if (width == 0)
5387 {
5388 appendStringInfoString(buf, str);
5389 return;
5390 }
5391
5392 if (width < 0)
5393 {
5394 /* Negative width: implicit '-' flag, then take absolute value */
5395 align_to_left = true;
5396 /* -INT_MIN is undefined */
5397 if (width <= INT_MIN)
5398 ereport(ERROR,
5399 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5400 errmsg("number is out of range")));
5401 width = -width;
5402 }
5403 else if (flags & TEXT_FORMAT_FLAG_MINUS)
5404 align_to_left = true;
5405
5406 len = pg_mbstrlen(str);
5407 if (align_to_left)
5408 {
5409 /* left justify */
5410 appendStringInfoString(buf, str);
5411 if (len < width)
5412 appendStringInfoSpaces(buf, width - len);
5413 }
5414 else
5415 {
5416 /* right justify */
5417 if (len < width)
5418 appendStringInfoSpaces(buf, width - len);
5419 appendStringInfoString(buf, str);
5420 }
5421 }
5422
5423 /*
5424 * text_format_nv - nonvariadic wrapper for text_format function.
5425 *
5426 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5427 * which checks that all built-in functions that share the implementing C
5428 * function take the same number of arguments.
5429 */
5430 Datum
text_format_nv(PG_FUNCTION_ARGS)5431 text_format_nv(PG_FUNCTION_ARGS)
5432 {
5433 return text_format(fcinfo);
5434 }
5435
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first len bytes of s1 and s2 are identical, comparing from the last byte
 * back toward the first.  Per the original author, this is faster than
 * memcmp() for this use case.
 *
 * A len of zero or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			i;

	for (i = len - 1; i >= 0; i--)
	{
		if (s1[i] != s2[i])
			return false;
	}

	return true;
}
5451
5452 /* Expand each Levenshtein distance variant */
5453 #include "levenshtein.c"
5454 #define LEVENSHTEIN_LESS_EQUAL
5455 #include "levenshtein.c"
5456