1 /*-------------------------------------------------------------------------
2 *
3 * varlena.c
4 * Functions for the variable-length built-in types.
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/varlena.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <limits.h>
19
20 #include "access/tuptoaster.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/int.h"
24 #include "lib/hyperloglog.h"
25 #include "libpq/pqformat.h"
26 #include "miscadmin.h"
27 #include "parser/scansup.h"
28 #include "port/pg_bswap.h"
29 #include "regex/regex.h"
30 #include "utils/builtins.h"
31 #include "utils/bytea.h"
32 #include "utils/hashutils.h"
33 #include "utils/lsyscache.h"
34 #include "utils/memutils.h"
35 #include "utils/pg_locale.h"
36 #include "utils/sortsupport.h"
37 #include "utils/varlena.h"
38
39
/*
 * GUC variable: selects the textual format produced by byteaout().
 * byteaout() recognizes BYTEA_OUTPUT_HEX (the initial value here) and
 * BYTEA_OUTPUT_ESCAPE, and raises an error for anything else.
 */
int			bytea_output = BYTEA_OUTPUT_HEX;

/*
 * File-local aliases for struct varlena.  They exist so that the
 * DatumGetUnknownP*() and DatumGetVarStringP*() macros defined later in this
 * file have distinct pointer type names to cast to.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
45
/*
 * State for text_position_* functions.
 *
 * text_position() keeps one of these in a local variable and hands its
 * address to text_position_setup(), text_position_next(),
 * text_position_get_match_pos() and text_position_cleanup().
 */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */
	bool		is_multibyte_char_in_char;	/* NOTE(review): meaning not
											 * visible in this part of the
											 * file; presumably set when a
											 * byte-level match may not fall
											 * on a character boundary --
											 * confirm in
											 * text_position_setup() */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
74
/*
 * Working state for the string sort-support routines.
 *
 * NOTE(review): the code that allocates and uses this struct is outside the
 * visible part of the file; presumably it is the state behind the
 * varstrfastcmp_*, bpcharfastcmp_c, namefastcmp_* and varstr_abbrev_*
 * functions declared below -- confirm.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* presumably allocated size of buf1 -- TODO
								 * confirm */
	int			buflen2;		/* presumably allocated size of buf2 -- TODO
								 * confirm */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* presumably true when sorting under the C
								 * collation -- TODO confirm */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* presumably the locale used for non-C
								 * comparisons -- TODO confirm */
} VarStringSortSupport;
93
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Datum access macros for the file-local "unknown" alias of struct varlena.
 * They follow the usual DatumGetXxxP / PG_GETARG_XXX_P / PG_RETURN_XXX_P
 * naming pattern: the plain forms detoast via PG_DETOAST_DATUM, the *Copy
 * forms via PG_DETOAST_DATUM_COPY.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Same idea for the VarString alias; the PP form detoasts via
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
108
/*
 * Forward declarations of file-local functions.
 */

/* routines that take a SortSupport argument */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* text length / concatenation / substring / overlay workers */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
							int32 start,
							int32 length,
							bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);

/* substring search; all but text_position() operate on a TextPositionState */
static int	text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int	text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);

/* collation check and comparison */
static void check_collation_set(Oid collid);
static int	text_cmp(text *arg1, text *arg2, Oid collid);

/* bytea counterparts of the text workers */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
							  int S,
							  int L,
							  bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* StringInfo, array and aggregate helpers */
static void appendStringInfoText(StringInfo str, const text *t);
static Datum text_to_array_internal(PG_FUNCTION_ARGS);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
									const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);

/* text_format_* helpers */
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
									 int *value);
static const char *text_format_parse_format(const char *start_ptr,
											const char *end_ptr,
											int *argpos, int *widthpos,
											int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
										  FmgrInfo *typOutputInfo,
										  Datum value, bool isNull,
										  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
									  int flags, int width);
157
158
159 /*****************************************************************************
160 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
161 *****************************************************************************/
162
163 /*
164 * cstring_to_text
165 *
166 * Create a text value from a null-terminated C string.
167 *
168 * The new text value is freshly palloc'd with a full-size VARHDR.
169 */
170 text *
cstring_to_text(const char * s)171 cstring_to_text(const char *s)
172 {
173 return cstring_to_text_with_len(s, strlen(s));
174 }
175
176 /*
177 * cstring_to_text_with_len
178 *
179 * Same as cstring_to_text except the caller specifies the string length;
180 * the string need not be null_terminated.
181 */
182 text *
cstring_to_text_with_len(const char * s,int len)183 cstring_to_text_with_len(const char *s, int len)
184 {
185 text *result = (text *) palloc(len + VARHDRSZ);
186
187 SET_VARSIZE(result, len + VARHDRSZ);
188 memcpy(VARDATA(result), s, len);
189
190 return result;
191 }
192
193 /*
194 * text_to_cstring
195 *
196 * Create a palloc'd, null-terminated C string from a text value.
197 *
198 * We support being passed a compressed or toasted text value.
199 * This is a bit bogus since such values shouldn't really be referred to as
200 * "text *", but it seems useful for robustness. If we didn't handle that
201 * case here, we'd need another routine that did, anyway.
202 */
203 char *
text_to_cstring(const text * t)204 text_to_cstring(const text *t)
205 {
206 /* must cast away the const, unfortunately */
207 text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
208 int len = VARSIZE_ANY_EXHDR(tunpacked);
209 char *result;
210
211 result = (char *) palloc(len + 1);
212 memcpy(result, VARDATA_ANY(tunpacked), len);
213 result[len] = '\0';
214
215 if (tunpacked != t)
216 pfree(tunpacked);
217
218 return result;
219 }
220
221 /*
222 * text_to_cstring_buffer
223 *
224 * Copy a text value into a caller-supplied buffer of size dst_len.
225 *
226 * The text string is truncated if necessary to fit. The result is
227 * guaranteed null-terminated (unless dst_len == 0).
228 *
229 * We support being passed a compressed or toasted text value.
230 * This is a bit bogus since such values shouldn't really be referred to as
231 * "text *", but it seems useful for robustness. If we didn't handle that
232 * case here, we'd need another routine that did, anyway.
233 */
234 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)235 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
236 {
237 /* must cast away the const, unfortunately */
238 text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
239 size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
240
241 if (dst_len > 0)
242 {
243 dst_len--;
244 if (dst_len >= src_len)
245 dst_len = src_len;
246 else /* ensure truncation is encoding-safe */
247 dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
248 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
249 dst[dst_len] = '\0';
250 }
251
252 if (srcunpacked != src)
253 pfree(srcunpacked);
254 }
255
256
257 /*****************************************************************************
258 * USER I/O ROUTINES *
259 *****************************************************************************/
260
261
/* VAL: numeric value of digit character CH (its offset from '0') */
#define VAL(CH)			((CH) - '0')
/* DIG: digit character for small integer VAL (inverse of VAL() above) */
#define DIG(VAL)		((VAL) + '0')
264
265 /*
266 * byteain - converts from printable representation of byte array
267 *
268 * Non-printable characters must be passed as '\nnn' (octal) and are
269 * converted to internal form. '\' must be passed as '\\'.
270 * ereport(ERROR, ...) if bad form.
271 *
272 * BUGS:
273 * The input is scanned twice.
274 * The error checking of input is minimal.
275 */
276 Datum
byteain(PG_FUNCTION_ARGS)277 byteain(PG_FUNCTION_ARGS)
278 {
279 char *inputText = PG_GETARG_CSTRING(0);
280 char *tp;
281 char *rp;
282 int bc;
283 bytea *result;
284
285 /* Recognize hex input */
286 if (inputText[0] == '\\' && inputText[1] == 'x')
287 {
288 size_t len = strlen(inputText);
289
290 bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
291 result = palloc(bc);
292 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
293 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
294
295 PG_RETURN_BYTEA_P(result);
296 }
297
298 /* Else, it's the traditional escaped style */
299 for (bc = 0, tp = inputText; *tp != '\0'; bc++)
300 {
301 if (tp[0] != '\\')
302 tp++;
303 else if ((tp[0] == '\\') &&
304 (tp[1] >= '0' && tp[1] <= '3') &&
305 (tp[2] >= '0' && tp[2] <= '7') &&
306 (tp[3] >= '0' && tp[3] <= '7'))
307 tp += 4;
308 else if ((tp[0] == '\\') &&
309 (tp[1] == '\\'))
310 tp += 2;
311 else
312 {
313 /*
314 * one backslash, not followed by another or ### valid octal
315 */
316 ereport(ERROR,
317 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
318 errmsg("invalid input syntax for type %s", "bytea")));
319 }
320 }
321
322 bc += VARHDRSZ;
323
324 result = (bytea *) palloc(bc);
325 SET_VARSIZE(result, bc);
326
327 tp = inputText;
328 rp = VARDATA(result);
329 while (*tp != '\0')
330 {
331 if (tp[0] != '\\')
332 *rp++ = *tp++;
333 else if ((tp[0] == '\\') &&
334 (tp[1] >= '0' && tp[1] <= '3') &&
335 (tp[2] >= '0' && tp[2] <= '7') &&
336 (tp[3] >= '0' && tp[3] <= '7'))
337 {
338 bc = VAL(tp[1]);
339 bc <<= 3;
340 bc += VAL(tp[2]);
341 bc <<= 3;
342 *rp++ = bc + VAL(tp[3]);
343
344 tp += 4;
345 }
346 else if ((tp[0] == '\\') &&
347 (tp[1] == '\\'))
348 {
349 *rp++ = '\\';
350 tp += 2;
351 }
352 else
353 {
354 /*
355 * We should never get here. The first pass should not allow it.
356 */
357 ereport(ERROR,
358 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
359 errmsg("invalid input syntax for type %s", "bytea")));
360 }
361 }
362
363 PG_RETURN_BYTEA_P(result);
364 }
365
366 /*
367 * byteaout - converts to printable representation of byte array
368 *
369 * In the traditional escaped format, non-printable characters are
370 * printed as '\nnn' (octal) and '\' as '\\'.
371 */
372 Datum
byteaout(PG_FUNCTION_ARGS)373 byteaout(PG_FUNCTION_ARGS)
374 {
375 bytea *vlena = PG_GETARG_BYTEA_PP(0);
376 char *result;
377 char *rp;
378
379 if (bytea_output == BYTEA_OUTPUT_HEX)
380 {
381 /* Print hex format */
382 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
383 *rp++ = '\\';
384 *rp++ = 'x';
385 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
386 }
387 else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
388 {
389 /* Print traditional escaped format */
390 char *vp;
391 int len;
392 int i;
393
394 len = 1; /* empty string has 1 char */
395 vp = VARDATA_ANY(vlena);
396 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
397 {
398 if (*vp == '\\')
399 len += 2;
400 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
401 len += 4;
402 else
403 len++;
404 }
405 rp = result = (char *) palloc(len);
406 vp = VARDATA_ANY(vlena);
407 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
408 {
409 if (*vp == '\\')
410 {
411 *rp++ = '\\';
412 *rp++ = '\\';
413 }
414 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
415 {
416 int val; /* holds unprintable chars */
417
418 val = *vp;
419 rp[0] = '\\';
420 rp[3] = DIG(val & 07);
421 val >>= 3;
422 rp[2] = DIG(val & 07);
423 val >>= 3;
424 rp[1] = DIG(val & 03);
425 rp += 4;
426 }
427 else
428 *rp++ = *vp;
429 }
430 }
431 else
432 {
433 elog(ERROR, "unrecognized bytea_output setting: %d",
434 bytea_output);
435 rp = result = NULL; /* keep compiler quiet */
436 }
437 *rp = '\0';
438 PG_RETURN_CSTRING(result);
439 }
440
441 /*
442 * bytearecv - converts external binary format to bytea
443 */
444 Datum
bytearecv(PG_FUNCTION_ARGS)445 bytearecv(PG_FUNCTION_ARGS)
446 {
447 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
448 bytea *result;
449 int nbytes;
450
451 nbytes = buf->len - buf->cursor;
452 result = (bytea *) palloc(nbytes + VARHDRSZ);
453 SET_VARSIZE(result, nbytes + VARHDRSZ);
454 pq_copymsgbytes(buf, VARDATA(result), nbytes);
455 PG_RETURN_BYTEA_P(result);
456 }
457
458 /*
459 * byteasend - converts bytea to binary format
460 *
461 * This is a special case: just copy the input...
462 */
463 Datum
byteasend(PG_FUNCTION_ARGS)464 byteasend(PG_FUNCTION_ARGS)
465 {
466 bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
467
468 PG_RETURN_BYTEA_P(vlena);
469 }
470
471 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)472 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
473 {
474 StringInfo state;
475
476 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
477
478 /* Append the value unless null. */
479 if (!PG_ARGISNULL(1))
480 {
481 bytea *value = PG_GETARG_BYTEA_PP(1);
482
483 /* On the first time through, we ignore the delimiter. */
484 if (state == NULL)
485 state = makeStringAggState(fcinfo);
486 else if (!PG_ARGISNULL(2))
487 {
488 bytea *delim = PG_GETARG_BYTEA_PP(2);
489
490 appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
491 }
492
493 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
494 }
495
496 /*
497 * The transition type for string_agg() is declared to be "internal",
498 * which is a pass-by-value type the same size as a pointer.
499 */
500 PG_RETURN_POINTER(state);
501 }
502
503 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)504 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
505 {
506 StringInfo state;
507
508 /* cannot be called directly because of internal-type argument */
509 Assert(AggCheckCallContext(fcinfo, NULL));
510
511 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
512
513 if (state != NULL)
514 {
515 bytea *result;
516
517 result = (bytea *) palloc(state->len + VARHDRSZ);
518 SET_VARSIZE(result, state->len + VARHDRSZ);
519 memcpy(VARDATA(result), state->data, state->len);
520 PG_RETURN_BYTEA_P(result);
521 }
522 else
523 PG_RETURN_NULL();
524 }
525
526 /*
527 * textin - converts "..." to internal representation
528 */
529 Datum
textin(PG_FUNCTION_ARGS)530 textin(PG_FUNCTION_ARGS)
531 {
532 char *inputText = PG_GETARG_CSTRING(0);
533
534 PG_RETURN_TEXT_P(cstring_to_text(inputText));
535 }
536
537 /*
538 * textout - converts internal representation to "..."
539 */
540 Datum
textout(PG_FUNCTION_ARGS)541 textout(PG_FUNCTION_ARGS)
542 {
543 Datum txt = PG_GETARG_DATUM(0);
544
545 PG_RETURN_CSTRING(TextDatumGetCString(txt));
546 }
547
548 /*
549 * textrecv - converts external binary format to text
550 */
551 Datum
textrecv(PG_FUNCTION_ARGS)552 textrecv(PG_FUNCTION_ARGS)
553 {
554 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
555 text *result;
556 char *str;
557 int nbytes;
558
559 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
560
561 result = cstring_to_text_with_len(str, nbytes);
562 pfree(str);
563 PG_RETURN_TEXT_P(result);
564 }
565
566 /*
567 * textsend - converts text to binary format
568 */
569 Datum
textsend(PG_FUNCTION_ARGS)570 textsend(PG_FUNCTION_ARGS)
571 {
572 text *t = PG_GETARG_TEXT_PP(0);
573 StringInfoData buf;
574
575 pq_begintypsend(&buf);
576 pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
577 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
578 }
579
580
581 /*
582 * unknownin - converts "..." to internal representation
583 */
584 Datum
unknownin(PG_FUNCTION_ARGS)585 unknownin(PG_FUNCTION_ARGS)
586 {
587 char *str = PG_GETARG_CSTRING(0);
588
589 /* representation is same as cstring */
590 PG_RETURN_CSTRING(pstrdup(str));
591 }
592
593 /*
594 * unknownout - converts internal representation to "..."
595 */
596 Datum
unknownout(PG_FUNCTION_ARGS)597 unknownout(PG_FUNCTION_ARGS)
598 {
599 /* representation is same as cstring */
600 char *str = PG_GETARG_CSTRING(0);
601
602 PG_RETURN_CSTRING(pstrdup(str));
603 }
604
605 /*
606 * unknownrecv - converts external binary format to unknown
607 */
608 Datum
unknownrecv(PG_FUNCTION_ARGS)609 unknownrecv(PG_FUNCTION_ARGS)
610 {
611 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
612 char *str;
613 int nbytes;
614
615 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
616 /* representation is same as cstring */
617 PG_RETURN_CSTRING(str);
618 }
619
620 /*
621 * unknownsend - converts unknown to binary format
622 */
623 Datum
unknownsend(PG_FUNCTION_ARGS)624 unknownsend(PG_FUNCTION_ARGS)
625 {
626 /* representation is same as cstring */
627 char *str = PG_GETARG_CSTRING(0);
628 StringInfoData buf;
629
630 pq_begintypsend(&buf);
631 pq_sendtext(&buf, str, strlen(str));
632 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
633 }
634
635
636 /* ========== PUBLIC ROUTINES ========== */
637
638 /*
639 * textlen -
640 * returns the logical length of a text*
641 * (which is less than the VARSIZE of the text*)
642 */
643 Datum
textlen(PG_FUNCTION_ARGS)644 textlen(PG_FUNCTION_ARGS)
645 {
646 Datum str = PG_GETARG_DATUM(0);
647
648 /* try to avoid decompressing argument */
649 PG_RETURN_INT32(text_length(str));
650 }
651
652 /*
653 * text_length -
654 * Does the real work for textlen()
655 *
656 * This is broken out so it can be called directly by other string processing
657 * functions. Note that the argument is passed as a Datum, to indicate that
658 * it may still be in compressed form. We can avoid decompressing it at all
659 * in some cases.
660 */
661 static int32
text_length(Datum str)662 text_length(Datum str)
663 {
664 /* fastpath when max encoding length is one */
665 if (pg_database_encoding_max_length() == 1)
666 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
667 else
668 {
669 text *t = DatumGetTextPP(str);
670
671 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
672 VARSIZE_ANY_EXHDR(t)));
673 }
674 }
675
676 /*
677 * textoctetlen -
678 * returns the physical length of a text*
679 * (which is less than the VARSIZE of the text*)
680 */
681 Datum
textoctetlen(PG_FUNCTION_ARGS)682 textoctetlen(PG_FUNCTION_ARGS)
683 {
684 Datum str = PG_GETARG_DATUM(0);
685
686 /* We need not detoast the input at all */
687 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
688 }
689
690 /*
691 * textcat -
692 * takes two text* and returns a text* that is the concatenation of
693 * the two.
694 *
695 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
696 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
697 * Allocate space for output in all cases.
698 * XXX - thomas 1997-07-10
699 */
700 Datum
textcat(PG_FUNCTION_ARGS)701 textcat(PG_FUNCTION_ARGS)
702 {
703 text *t1 = PG_GETARG_TEXT_PP(0);
704 text *t2 = PG_GETARG_TEXT_PP(1);
705
706 PG_RETURN_TEXT_P(text_catenate(t1, t2));
707 }
708
709 /*
710 * text_catenate
711 * Guts of textcat(), broken out so it can be used by other functions
712 *
713 * Arguments can be in short-header form, but not compressed or out-of-line
714 */
715 static text *
text_catenate(text * t1,text * t2)716 text_catenate(text *t1, text *t2)
717 {
718 text *result;
719 int len1,
720 len2,
721 len;
722 char *ptr;
723
724 len1 = VARSIZE_ANY_EXHDR(t1);
725 len2 = VARSIZE_ANY_EXHDR(t2);
726
727 /* paranoia ... probably should throw error instead? */
728 if (len1 < 0)
729 len1 = 0;
730 if (len2 < 0)
731 len2 = 0;
732
733 len = len1 + len2 + VARHDRSZ;
734 result = (text *) palloc(len);
735
736 /* Set size of result string... */
737 SET_VARSIZE(result, len);
738
739 /* Fill data field of result string... */
740 ptr = VARDATA(result);
741 if (len1 > 0)
742 memcpy(ptr, VARDATA_ANY(t1), len1);
743 if (len2 > 0)
744 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
745
746 return result;
747 }
748
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must guarantee that n characters really are present there;
 * no terminating null is needed.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Optimization for single-byte encodings */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
774
775 /*
776 * text_substr()
777 * Return a substring starting at the specified position.
778 * - thomas 1997-12-31
779 *
780 * Input:
781 * - string
782 * - starting position (is one-based)
783 * - string length
784 *
785 * If the starting position is zero or less, then return from the start of the string
786 * adjusting the length to be consistent with the "negative start" per SQL.
787 * If the length is less than zero, return the remaining string.
788 *
789 * Added multibyte support.
790 * - Tatsuo Ishii 1998-4-21
791 * Changed behavior if starting position is less than one to conform to SQL behavior.
792 * Formerly returned the entire string; now returns a portion.
793 * - Thomas Lockhart 1998-12-10
794 * Now uses faster TOAST-slicing interface
795 * - John Gray 2002-02-22
796 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
797 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
798 * error; if E < 1, return '', not entire string). Fixed MB related bug when
799 * S > LC and < LC + 4 sometimes garbage characters are returned.
800 * - Joe Conway 2002-08-10
801 */
802 Datum
text_substr(PG_FUNCTION_ARGS)803 text_substr(PG_FUNCTION_ARGS)
804 {
805 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
806 PG_GETARG_INT32(1),
807 PG_GETARG_INT32(2),
808 false));
809 }
810
811 /*
812 * text_substr_no_len -
813 * Wrapper to avoid opr_sanity failure due to
814 * one function accepting a different number of args.
815 */
816 Datum
text_substr_no_len(PG_FUNCTION_ARGS)817 text_substr_no_len(PG_FUNCTION_ARGS)
818 {
819 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
820 PG_GETARG_INT32(1),
821 -1, true));
822 }
823
/*
 * text_substring -
 *	Does the real work for text_substr() and text_substr_no_len()
 *
 *	This is broken out so it can be called directly by other string processing
 *	functions.  Note that the argument is passed as a Datum, to indicate that
 *	it may still be in compressed/toasted form.  We can avoid detoasting all
 *	of it in some cases.
 *
 *	Arguments:
 *		str - the source text, as a Datum
 *		start - one-based start position, in characters; may be zero or
 *			negative
 *		length - requested length in characters; ignored when
 *			length_not_specified is true
 *		length_not_specified - if true, the substring runs to the end of
 *			the string
 *
 *	Raises ERRCODE_SUBSTRING_ERROR when a length is specified and negative.
 *	Returns an empty string when the computed end position is less than one
 *	or when the start position is past the end of the string.
 *
 *	The result is always a freshly palloc'd datum.
 */
static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
{
	int32		eml = pg_database_encoding_max_length();
	int32		S = start;		/* start position */
	int32		S1;				/* adjusted start position */
	int32		L1;				/* adjusted substring length */
	int32		E;				/* end position */

	/*
	 * SQL99 says S can be zero or negative, but we still must fetch from the
	 * start of the string.
	 */
	S1 = Max(S, 1);

	/* life is easy if the encoding max length is 1 */
	if (eml == 1)
	{
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			L1 = -1;			/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/* length measured from the adjusted start, not the raw one */
			L1 = E - S1;
		}

		/*
		 * If the start position is past the end of the string, SQL99 says to
		 * return a zero-length string -- DatumGetTextPSlice() will do that
		 * for us.  We need only convert S1 to zero-based starting position.
		 */
		return DatumGetTextPSlice(str, S1 - 1, L1);
	}
	else if (eml > 1)
	{
		/*
		 * When encoding max length is > 1, we can't get LC without
		 * detoasting, so we'll grab a conservatively large slice now and go
		 * back later to do the right thing
		 */
		int32		slice_start;
		int32		slice_size;
		int32		slice_strlen;
		text	   *slice;
		int32		E1;
		int32		i;
		char	   *p;
		char	   *s;
		text	   *ret;

		/*
		 * We need to start at position zero because there is no way to know
		 * in advance which byte offset corresponds to the supplied start
		 * position.
		 */
		slice_start = 0;

		if (length_not_specified)	/* special case - get length to end of
									 * string */
			slice_size = L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			slice_size = L1 = -1;	/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			slice_size = L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/*
			 * if E is past the end of the string, the tuple toaster will
			 * truncate the length for us
			 */
			L1 = E - S1;

			/*
			 * Total slice size in bytes can't be any longer than the start
			 * position plus substring length times the encoding max length.
			 * If that overflows, we can just use -1.
			 */
			if (pg_mul_s32_overflow(E, eml, &slice_size))
				slice_size = -1;
		}

		/*
		 * If we're working with an untoasted source, no need to do an extra
		 * copying step.
		 */
		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
			slice = DatumGetTextPSlice(str, slice_start, slice_size);
		else
			slice = (text *) DatumGetPointer(str);

		/*
		 * see if we got back an empty string; free the slice only if it is
		 * a separate copy rather than the caller's datum
		 */
		if (VARSIZE_ANY_EXHDR(slice) == 0)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/* Now we can get the actual length of the slice in MB characters */
		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
											VARSIZE_ANY_EXHDR(slice));

		/*
		 * Check that the start position wasn't > slice_strlen. If so, SQL99
		 * says to return a zero-length string.
		 */
		if (S1 > slice_strlen)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/*
		 * Adjust L1 and E1 now that we know the slice string length. Again
		 * remember that S1 is one based, and slice_start is zero based.
		 */
		if (L1 > -1)
			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
		else
			E1 = slice_start + 1 + slice_strlen;

		/*
		 * Find the start position in the slice; remember S1 is not zero based
		 */
		p = VARDATA_ANY(slice);
		for (i = 0; i < S1 - 1; i++)
			p += pg_mblen(p);

		/* hang onto a pointer to our start position */
		s = p;

		/*
		 * Count the actual bytes used by the substring of the requested
		 * length.
		 */
		for (i = S1; i < E1; i++)
			p += pg_mblen(p);

		/* copy the bytes between s and p into a new text value */
		ret = (text *) palloc(VARHDRSZ + (p - s));
		SET_VARSIZE(ret, VARHDRSZ + (p - s));
		memcpy(VARDATA(ret), s, (p - s));

		if (slice != (text *) DatumGetPointer(str))
			pfree(slice);

		return ret;
	}
	else
		elog(ERROR, "invalid backend encoding: encoding max length < 1");

	/* not reached: suppress compiler warning */
	return NULL;
}
1034
1035 /*
1036 * textoverlay
1037 * Replace specified substring of first string with second
1038 *
1039 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1040 * This code is a direct implementation of what the standard says.
1041 */
1042 Datum
textoverlay(PG_FUNCTION_ARGS)1043 textoverlay(PG_FUNCTION_ARGS)
1044 {
1045 text *t1 = PG_GETARG_TEXT_PP(0);
1046 text *t2 = PG_GETARG_TEXT_PP(1);
1047 int sp = PG_GETARG_INT32(2); /* substring start position */
1048 int sl = PG_GETARG_INT32(3); /* substring length */
1049
1050 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1051 }
1052
1053 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1054 textoverlay_no_len(PG_FUNCTION_ARGS)
1055 {
1056 text *t1 = PG_GETARG_TEXT_PP(0);
1057 text *t2 = PG_GETARG_TEXT_PP(1);
1058 int sp = PG_GETARG_INT32(2); /* substring start position */
1059 int sl;
1060
1061 sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1062 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1063 }
1064
1065 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1066 text_overlay(text *t1, text *t2, int sp, int sl)
1067 {
1068 text *result;
1069 text *s1;
1070 text *s2;
1071 int sp_pl_sl;
1072
1073 /*
1074 * Check for possible integer-overflow cases. For negative sp, throw a
1075 * "substring length" error because that's what should be expected
1076 * according to the spec's definition of OVERLAY().
1077 */
1078 if (sp <= 0)
1079 ereport(ERROR,
1080 (errcode(ERRCODE_SUBSTRING_ERROR),
1081 errmsg("negative substring length not allowed")));
1082 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1083 ereport(ERROR,
1084 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1085 errmsg("integer out of range")));
1086
1087 s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1088 s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1089 result = text_catenate(s1, t2);
1090 result = text_catenate(result, s2);
1091
1092 return result;
1093 }
1094
1095 /*
1096 * textpos -
1097 * Return the position of the specified substring.
1098 * Implements the SQL POSITION() function.
1099 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1100 * - thomas 1997-07-27
1101 */
1102 Datum
textpos(PG_FUNCTION_ARGS)1103 textpos(PG_FUNCTION_ARGS)
1104 {
1105 text *str = PG_GETARG_TEXT_PP(0);
1106 text *search_str = PG_GETARG_TEXT_PP(1);
1107
1108 PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1109 }
1110
1111 /*
1112 * text_position -
1113 * Does the real work for textpos()
1114 *
1115 * Inputs:
1116 * t1 - string to be searched
1117 * t2 - pattern to match within t1
1118 * Result:
1119 * Character index of the first matched char, starting from 1,
1120 * or 0 if no match.
1121 *
1122 * This is broken out so it can be called directly by other string processing
1123 * functions.
1124 */
1125 static int
text_position(text * t1,text * t2,Oid collid)1126 text_position(text *t1, text *t2, Oid collid)
1127 {
1128 TextPositionState state;
1129 int result;
1130
1131 /* Empty needle always matches at position 1 */
1132 if (VARSIZE_ANY_EXHDR(t2) < 1)
1133 return 1;
1134
1135 /* Otherwise, can't match if haystack is shorter than needle */
1136 if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1137 return 0;
1138
1139 text_position_setup(t1, t2, collid, &state);
1140 if (!text_position_next(&state))
1141 result = 0;
1142 else
1143 result = text_position_get_match_pos(&state);
1144 text_position_cleanup(&state);
1145 return result;
1146 }
1147
1148
1149 /*
1150 * text_position_setup, text_position_next, text_position_cleanup -
1151 * Component steps of text_position()
1152 *
1153 * These are broken out so that a string can be efficiently searched for
1154 * multiple occurrences of the same pattern. text_position_next may be
1155 * called multiple times, and it advances to the next match on each call.
1156 * text_position_get_match_ptr() and text_position_get_match_pos() return
1157 * a pointer or 1-based character position of the last match, respectively.
1158 *
1159 * The "state" variable is normally just a local variable in the caller.
1160 *
1161 * NOTE: text_position_next skips over the matched portion. For example,
1162 * searching for "xx" in "xxx" returns only one match, not two.
1163 */
1164
1165 static void
text_position_setup(text * t1,text * t2,Oid collid,TextPositionState * state)1166 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1167 {
1168 int len1 = VARSIZE_ANY_EXHDR(t1);
1169 int len2 = VARSIZE_ANY_EXHDR(t2);
1170 pg_locale_t mylocale = 0;
1171
1172 check_collation_set(collid);
1173
1174 if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1175 mylocale = pg_newlocale_from_collation(collid);
1176
1177 if (mylocale && !mylocale->deterministic)
1178 ereport(ERROR,
1179 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1180 errmsg("nondeterministic collations are not supported for substring searches")));
1181
1182 Assert(len1 > 0);
1183 Assert(len2 > 0);
1184
1185 /*
1186 * Even with a multi-byte encoding, we perform the search using the raw
1187 * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1188 * because in UTF-8 the byte sequence of one character cannot contain
1189 * another character. For other multi-byte encodings, we do the search
1190 * initially as a simple byte search, ignoring multibyte issues, but
1191 * verify afterwards that the match we found is at a character boundary,
1192 * and continue the search if it was a false match.
1193 */
1194 if (pg_database_encoding_max_length() == 1)
1195 {
1196 state->is_multibyte = false;
1197 state->is_multibyte_char_in_char = false;
1198 }
1199 else if (GetDatabaseEncoding() == PG_UTF8)
1200 {
1201 state->is_multibyte = true;
1202 state->is_multibyte_char_in_char = false;
1203 }
1204 else
1205 {
1206 state->is_multibyte = true;
1207 state->is_multibyte_char_in_char = true;
1208 }
1209
1210 state->str1 = VARDATA_ANY(t1);
1211 state->str2 = VARDATA_ANY(t2);
1212 state->len1 = len1;
1213 state->len2 = len2;
1214 state->last_match = NULL;
1215 state->refpoint = state->str1;
1216 state->refpos = 0;
1217
1218 /*
1219 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1220 * notes we use the terminology that the "haystack" is the string to be
1221 * searched (t1) and the "needle" is the pattern being sought (t2).
1222 *
1223 * If the needle is empty or bigger than the haystack then there is no
1224 * point in wasting cycles initializing the table. We also choose not to
1225 * use B-M-H for needles of length 1, since the skip table can't possibly
1226 * save anything in that case.
1227 */
1228 if (len1 >= len2 && len2 > 1)
1229 {
1230 int searchlength = len1 - len2;
1231 int skiptablemask;
1232 int last;
1233 int i;
1234 const char *str2 = state->str2;
1235
1236 /*
1237 * First we must determine how much of the skip table to use. The
1238 * declaration of TextPositionState allows up to 256 elements, but for
1239 * short search problems we don't really want to have to initialize so
1240 * many elements --- it would take too long in comparison to the
1241 * actual search time. So we choose a useful skip table size based on
1242 * the haystack length minus the needle length. The closer the needle
1243 * length is to the haystack length the less useful skipping becomes.
1244 *
1245 * Note: since we use bit-masking to select table elements, the skip
1246 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1247 */
1248 if (searchlength < 16)
1249 skiptablemask = 3;
1250 else if (searchlength < 64)
1251 skiptablemask = 7;
1252 else if (searchlength < 128)
1253 skiptablemask = 15;
1254 else if (searchlength < 512)
1255 skiptablemask = 31;
1256 else if (searchlength < 2048)
1257 skiptablemask = 63;
1258 else if (searchlength < 4096)
1259 skiptablemask = 127;
1260 else
1261 skiptablemask = 255;
1262 state->skiptablemask = skiptablemask;
1263
1264 /*
1265 * Initialize the skip table. We set all elements to the needle
1266 * length, since this is the correct skip distance for any character
1267 * not found in the needle.
1268 */
1269 for (i = 0; i <= skiptablemask; i++)
1270 state->skiptable[i] = len2;
1271
1272 /*
1273 * Now examine the needle. For each character except the last one,
1274 * set the corresponding table element to the appropriate skip
1275 * distance. Note that when two characters share the same skip table
1276 * entry, the one later in the needle must determine the skip
1277 * distance.
1278 */
1279 last = len2 - 1;
1280
1281 for (i = 0; i < last; i++)
1282 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1283 }
1284 }
1285
1286 /*
1287 * Advance to the next match, starting from the end of the previous match
1288 * (or the beginning of the string, on first call). Returns true if a match
1289 * is found.
1290 *
1291 * Note that this refuses to match an empty-string needle. Most callers
1292 * will have handled that case specially and we'll never see it here.
1293 */
1294 static bool
text_position_next(TextPositionState * state)1295 text_position_next(TextPositionState *state)
1296 {
1297 int needle_len = state->len2;
1298 char *start_ptr;
1299 char *matchptr;
1300
1301 if (needle_len <= 0)
1302 return false; /* result for empty pattern */
1303
1304 /* Start from the point right after the previous match. */
1305 if (state->last_match)
1306 start_ptr = state->last_match + needle_len;
1307 else
1308 start_ptr = state->str1;
1309
1310 retry:
1311 matchptr = text_position_next_internal(start_ptr, state);
1312
1313 if (!matchptr)
1314 return false;
1315
1316 /*
1317 * Found a match for the byte sequence. If this is a multibyte encoding,
1318 * where one character's byte sequence can appear inside a longer
1319 * multi-byte character, we need to verify that the match was at a
1320 * character boundary, not in the middle of a multi-byte character.
1321 */
1322 if (state->is_multibyte_char_in_char)
1323 {
1324 /* Walk one character at a time, until we reach the match. */
1325
1326 /* the search should never move backwards. */
1327 Assert(state->refpoint <= matchptr);
1328
1329 while (state->refpoint < matchptr)
1330 {
1331 /* step to next character. */
1332 state->refpoint += pg_mblen(state->refpoint);
1333 state->refpos++;
1334
1335 /*
1336 * If we stepped over the match's start position, then it was a
1337 * false positive, where the byte sequence appeared in the middle
1338 * of a multi-byte character. Skip it, and continue the search at
1339 * the next character boundary.
1340 */
1341 if (state->refpoint > matchptr)
1342 {
1343 start_ptr = state->refpoint;
1344 goto retry;
1345 }
1346 }
1347 }
1348
1349 state->last_match = matchptr;
1350 return true;
1351 }
1352
1353 /*
1354 * Subroutine of text_position_next(). This searches for the raw byte
1355 * sequence, ignoring any multi-byte encoding issues. Returns the first
1356 * match starting at 'start_ptr', or NULL if no match is found.
1357 */
1358 static char *
text_position_next_internal(char * start_ptr,TextPositionState * state)1359 text_position_next_internal(char *start_ptr, TextPositionState *state)
1360 {
1361 int haystack_len = state->len1;
1362 int needle_len = state->len2;
1363 int skiptablemask = state->skiptablemask;
1364 const char *haystack = state->str1;
1365 const char *needle = state->str2;
1366 const char *haystack_end = &haystack[haystack_len];
1367 const char *hptr;
1368
1369 Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1370
1371 if (needle_len == 1)
1372 {
1373 /* No point in using B-M-H for a one-character needle */
1374 char nchar = *needle;
1375
1376 hptr = start_ptr;
1377 while (hptr < haystack_end)
1378 {
1379 if (*hptr == nchar)
1380 return (char *) hptr;
1381 hptr++;
1382 }
1383 }
1384 else
1385 {
1386 const char *needle_last = &needle[needle_len - 1];
1387
1388 /* Start at startpos plus the length of the needle */
1389 hptr = start_ptr + needle_len - 1;
1390 while (hptr < haystack_end)
1391 {
1392 /* Match the needle scanning *backward* */
1393 const char *nptr;
1394 const char *p;
1395
1396 nptr = needle_last;
1397 p = hptr;
1398 while (*nptr == *p)
1399 {
1400 /* Matched it all? If so, return 1-based position */
1401 if (nptr == needle)
1402 return (char *) p;
1403 nptr--, p--;
1404 }
1405
1406 /*
1407 * No match, so use the haystack char at hptr to decide how far to
1408 * advance. If the needle had any occurrence of that character
1409 * (or more precisely, one sharing the same skiptable entry)
1410 * before its last character, then we advance far enough to align
1411 * the last such needle character with that haystack position.
1412 * Otherwise we can advance by the whole needle length.
1413 */
1414 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1415 }
1416 }
1417
1418 return 0; /* not found */
1419 }
1420
1421 /*
1422 * Return a pointer to the current match.
1423 *
1424 * The returned pointer points into correct position in the original
1425 * the haystack string.
1426 */
1427 static char *
text_position_get_match_ptr(TextPositionState * state)1428 text_position_get_match_ptr(TextPositionState *state)
1429 {
1430 return state->last_match;
1431 }
1432
1433 /*
1434 * Return the offset of the current match.
1435 *
1436 * The offset is in characters, 1-based.
1437 */
1438 static int
text_position_get_match_pos(TextPositionState * state)1439 text_position_get_match_pos(TextPositionState *state)
1440 {
1441 if (!state->is_multibyte)
1442 return state->last_match - state->str1 + 1;
1443 else
1444 {
1445 /* Convert the byte position to char position. */
1446 while (state->refpoint < state->last_match)
1447 {
1448 state->refpoint += pg_mblen(state->refpoint);
1449 state->refpos++;
1450 }
1451 Assert(state->refpoint == state->last_match);
1452 return state->refpos + 1;
1453 }
1454 }
1455
1456 static void
text_position_cleanup(TextPositionState * state)1457 text_position_cleanup(TextPositionState *state)
1458 {
1459 /* no cleanup needed */
1460 }
1461
1462 static void
check_collation_set(Oid collid)1463 check_collation_set(Oid collid)
1464 {
1465 if (!OidIsValid(collid))
1466 {
1467 /*
1468 * This typically means that the parser could not resolve a conflict
1469 * of implicit collations, so report it that way.
1470 */
1471 ereport(ERROR,
1472 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1473 errmsg("could not determine which collation to use for string comparison"),
1474 errhint("Use the COLLATE clause to set the collation explicitly.")));
1475 }
1476 }
1477
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 * to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * arg1/len1 and arg2/len2 are the two strings and their lengths in bytes;
 * the inputs need not be null-terminated.  collid selects the collation and
 * must be valid (an invalid OID is rejected by check_collation_set()).
 *
 * Note: many functions that depend on this are marked leakproof; therefore,
 * avoid reporting the actual contents of the input when throwing errors.
 * All errors herein should be things that can't happen except on corrupt
 * data, anyway; otherwise we will have trouble with indexing strings that
 * would cause them.
 */
int
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
{
	int			result;

	check_collation_set(collid);

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		/* Bytewise comparison; on a common prefix the shorter string sorts first */
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		/* Stack buffers, used instead of palloc when the inputs are short */
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;
		pg_locale_t mylocale = 0;	/* stays 0 for the default collation */

		if (collid != DEFAULT_COLLATION_OID)
			mylocale = pg_newlocale_from_collation(collid);

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;	/* size of the a1p buffer, in bytes */
			int			a2len;	/* size of the a2p buffer, in bytes */
			int			r;

			/*
			 * Size each buffer at two bytes per input byte plus two more
			 * for the terminator; use the stack buffer when that fits.
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* MultiByteToWideChar() does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* terminate the wide string at index r */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* clear errno before comparing; the failure report below uses %m */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/*
			 * Break tie if necessary: under a deterministic collation, fall
			 * back to a bytewise comparison of the original inputs.
			 */
			if (result == 0 &&
				(!mylocale || mylocale->deterministic))
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			/* Free only what was palloc'd, not the stack buffers */
			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/*
		 * Make null-terminated copies of both inputs, on the stack when they
		 * fit (leaving room for the terminator) and in palloc'd memory
		 * otherwise.
		 */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				/* In UTF-8, ICU can compare the original inputs directly */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					/* Otherwise convert both inputs to UChar strings first */
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/*
		 * Break tie if necessary: under a deterministic collation, strings
		 * that collate as equal are ordered by strcmp() on the copies.
		 */
		if (result == 0 &&
			(!mylocale || mylocale->deterministic))
			result = strcmp(a1p, a2p);

		/* Free only what was palloc'd, not the stack buffers */
		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1702
1703 /* text_cmp()
1704 * Internal comparison function for text strings.
1705 * Returns -1, 0 or 1
1706 */
1707 static int
text_cmp(text * arg1,text * arg2,Oid collid)1708 text_cmp(text *arg1, text *arg2, Oid collid)
1709 {
1710 char *a1p,
1711 *a2p;
1712 int len1,
1713 len2;
1714
1715 a1p = VARDATA_ANY(arg1);
1716 a2p = VARDATA_ANY(arg2);
1717
1718 len1 = VARSIZE_ANY_EXHDR(arg1);
1719 len2 = VARSIZE_ANY_EXHDR(arg2);
1720
1721 return varstr_cmp(a1p, len1, a2p, len2, collid);
1722 }
1723
1724 /*
1725 * Comparison functions for text strings.
1726 *
1727 * Note: btree indexes need these routines not to leak memory; therefore,
1728 * be careful to free working copies of toasted datums. Most places don't
1729 * need to be so careful.
1730 */
1731
1732 Datum
texteq(PG_FUNCTION_ARGS)1733 texteq(PG_FUNCTION_ARGS)
1734 {
1735 Oid collid = PG_GET_COLLATION();
1736 bool result;
1737
1738 check_collation_set(collid);
1739
1740 if (lc_collate_is_c(collid) ||
1741 collid == DEFAULT_COLLATION_OID ||
1742 pg_newlocale_from_collation(collid)->deterministic)
1743 {
1744 Datum arg1 = PG_GETARG_DATUM(0);
1745 Datum arg2 = PG_GETARG_DATUM(1);
1746 Size len1,
1747 len2;
1748
1749 /*
1750 * Since we only care about equality or not-equality, we can avoid all
1751 * the expense of strcoll() here, and just do bitwise comparison. In
1752 * fact, we don't even have to do a bitwise comparison if we can show
1753 * the lengths of the strings are unequal; which might save us from
1754 * having to detoast one or both values.
1755 */
1756 len1 = toast_raw_datum_size(arg1);
1757 len2 = toast_raw_datum_size(arg2);
1758 if (len1 != len2)
1759 result = false;
1760 else
1761 {
1762 text *targ1 = DatumGetTextPP(arg1);
1763 text *targ2 = DatumGetTextPP(arg2);
1764
1765 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1766 len1 - VARHDRSZ) == 0);
1767
1768 PG_FREE_IF_COPY(targ1, 0);
1769 PG_FREE_IF_COPY(targ2, 1);
1770 }
1771 }
1772 else
1773 {
1774 text *arg1 = PG_GETARG_TEXT_PP(0);
1775 text *arg2 = PG_GETARG_TEXT_PP(1);
1776
1777 result = (text_cmp(arg1, arg2, collid) == 0);
1778
1779 PG_FREE_IF_COPY(arg1, 0);
1780 PG_FREE_IF_COPY(arg2, 1);
1781 }
1782
1783 PG_RETURN_BOOL(result);
1784 }
1785
1786 Datum
textne(PG_FUNCTION_ARGS)1787 textne(PG_FUNCTION_ARGS)
1788 {
1789 Oid collid = PG_GET_COLLATION();
1790 bool result;
1791
1792 check_collation_set(collid);
1793
1794 if (lc_collate_is_c(collid) ||
1795 collid == DEFAULT_COLLATION_OID ||
1796 pg_newlocale_from_collation(collid)->deterministic)
1797 {
1798 Datum arg1 = PG_GETARG_DATUM(0);
1799 Datum arg2 = PG_GETARG_DATUM(1);
1800 Size len1,
1801 len2;
1802
1803 /* See comment in texteq() */
1804 len1 = toast_raw_datum_size(arg1);
1805 len2 = toast_raw_datum_size(arg2);
1806 if (len1 != len2)
1807 result = true;
1808 else
1809 {
1810 text *targ1 = DatumGetTextPP(arg1);
1811 text *targ2 = DatumGetTextPP(arg2);
1812
1813 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1814 len1 - VARHDRSZ) != 0);
1815
1816 PG_FREE_IF_COPY(targ1, 0);
1817 PG_FREE_IF_COPY(targ2, 1);
1818 }
1819 }
1820 else
1821 {
1822 text *arg1 = PG_GETARG_TEXT_PP(0);
1823 text *arg2 = PG_GETARG_TEXT_PP(1);
1824
1825 result = (text_cmp(arg1, arg2, collid) != 0);
1826
1827 PG_FREE_IF_COPY(arg1, 0);
1828 PG_FREE_IF_COPY(arg2, 1);
1829 }
1830
1831 PG_RETURN_BOOL(result);
1832 }
1833
1834 Datum
text_lt(PG_FUNCTION_ARGS)1835 text_lt(PG_FUNCTION_ARGS)
1836 {
1837 text *arg1 = PG_GETARG_TEXT_PP(0);
1838 text *arg2 = PG_GETARG_TEXT_PP(1);
1839 bool result;
1840
1841 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1842
1843 PG_FREE_IF_COPY(arg1, 0);
1844 PG_FREE_IF_COPY(arg2, 1);
1845
1846 PG_RETURN_BOOL(result);
1847 }
1848
1849 Datum
text_le(PG_FUNCTION_ARGS)1850 text_le(PG_FUNCTION_ARGS)
1851 {
1852 text *arg1 = PG_GETARG_TEXT_PP(0);
1853 text *arg2 = PG_GETARG_TEXT_PP(1);
1854 bool result;
1855
1856 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1857
1858 PG_FREE_IF_COPY(arg1, 0);
1859 PG_FREE_IF_COPY(arg2, 1);
1860
1861 PG_RETURN_BOOL(result);
1862 }
1863
1864 Datum
text_gt(PG_FUNCTION_ARGS)1865 text_gt(PG_FUNCTION_ARGS)
1866 {
1867 text *arg1 = PG_GETARG_TEXT_PP(0);
1868 text *arg2 = PG_GETARG_TEXT_PP(1);
1869 bool result;
1870
1871 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1872
1873 PG_FREE_IF_COPY(arg1, 0);
1874 PG_FREE_IF_COPY(arg2, 1);
1875
1876 PG_RETURN_BOOL(result);
1877 }
1878
1879 Datum
text_ge(PG_FUNCTION_ARGS)1880 text_ge(PG_FUNCTION_ARGS)
1881 {
1882 text *arg1 = PG_GETARG_TEXT_PP(0);
1883 text *arg2 = PG_GETARG_TEXT_PP(1);
1884 bool result;
1885
1886 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1887
1888 PG_FREE_IF_COPY(arg1, 0);
1889 PG_FREE_IF_COPY(arg2, 1);
1890
1891 PG_RETURN_BOOL(result);
1892 }
1893
1894 Datum
text_starts_with(PG_FUNCTION_ARGS)1895 text_starts_with(PG_FUNCTION_ARGS)
1896 {
1897 Datum arg1 = PG_GETARG_DATUM(0);
1898 Datum arg2 = PG_GETARG_DATUM(1);
1899 Oid collid = PG_GET_COLLATION();
1900 pg_locale_t mylocale = 0;
1901 bool result;
1902 Size len1,
1903 len2;
1904
1905 check_collation_set(collid);
1906
1907 if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1908 mylocale = pg_newlocale_from_collation(collid);
1909
1910 if (mylocale && !mylocale->deterministic)
1911 ereport(ERROR,
1912 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1913 errmsg("nondeterministic collations are not supported for substring searches")));
1914
1915 len1 = toast_raw_datum_size(arg1);
1916 len2 = toast_raw_datum_size(arg2);
1917 if (len2 > len1)
1918 result = false;
1919 else
1920 {
1921 text *targ1 = text_substring(arg1, 1, len2, false);
1922 text *targ2 = DatumGetTextPP(arg2);
1923
1924 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1925 VARSIZE_ANY_EXHDR(targ2)) == 0);
1926
1927 PG_FREE_IF_COPY(targ1, 0);
1928 PG_FREE_IF_COPY(targ2, 1);
1929 }
1930
1931 PG_RETURN_BOOL(result);
1932 }
1933
1934 Datum
bttextcmp(PG_FUNCTION_ARGS)1935 bttextcmp(PG_FUNCTION_ARGS)
1936 {
1937 text *arg1 = PG_GETARG_TEXT_PP(0);
1938 text *arg2 = PG_GETARG_TEXT_PP(1);
1939 int32 result;
1940
1941 result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1942
1943 PG_FREE_IF_COPY(arg1, 0);
1944 PG_FREE_IF_COPY(arg2, 1);
1945
1946 PG_RETURN_INT32(result);
1947 }
1948
1949 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1950 bttextsortsupport(PG_FUNCTION_ARGS)
1951 {
1952 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1953 Oid collid = ssup->ssup_collation;
1954 MemoryContext oldcontext;
1955
1956 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1957
1958 /* Use generic string SortSupport */
1959 varstr_sortsupport(ssup, TEXTOID, collid);
1960
1961 MemoryContextSwitchTo(oldcontext);
1962
1963 PG_RETURN_VOID();
1964 }
1965
/*
 * Generic sortsupport interface for character type's operator classes.
 * Includes locale support, and support for BpChar semantics (i.e. removing
 * trailing spaces before comparison).
 *
 * ssup is the SortSupport to fill in, typid the type being sorted (BPCHAROID
 * and NAMEOID get special treatment below), and collid the collation, which
 * must be valid.
 *
 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
 * same representation.  Callers that always use the C collation (e.g.
 * non-collatable type callers like bytea) may have NUL bytes in their strings;
 * this will not work with any other collation, though.
 */
void
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
{
	bool		abbreviate = ssup->abbreviate;
	bool		collate_c = false;
	VarStringSortSupport *sss;
	pg_locale_t locale = 0;

	check_collation_set(collid);

	/*
	 * If possible, set ssup->comparator to a function which can be used to
	 * directly compare two datums.  If we can do this, we'll avoid the
	 * overhead of a trip through the fmgr layer for every comparison, which
	 * can be substantial.
	 *
	 * Most typically, we'll set the comparator to varlenafastcmp_locale,
	 * which uses strcoll() to perform comparisons.  We use that for the
	 * BpChar case too, but type NAME uses namefastcmp_locale.  However, if
	 * LC_COLLATE = C, we can make things quite a bit faster with
	 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
	 * memcmp() rather than strcoll().
	 */
	if (lc_collate_is_c(collid))
	{
		if (typid == BPCHAROID)
			ssup->comparator = bpcharfastcmp_c;
		else if (typid == NAMEOID)
		{
			ssup->comparator = namefastcmp_c;
			/* Not supporting abbreviation with type NAME, for now */
			abbreviate = false;
		}
		else
			ssup->comparator = varstrfastcmp_c;

		collate_c = true;
	}
	else
	{
		/*
		 * We need a collation-sensitive comparison.  To make things faster,
		 * we'll figure out the collation based on the locale id and cache the
		 * result.
		 */
		if (collid != DEFAULT_COLLATION_OID)
			locale = pg_newlocale_from_collation(collid);

		/*
		 * There is a further exception on Windows.  When the database
		 * encoding is UTF-8 and we are not using the C collation, complex
		 * hacks are required.  We don't currently have a comparator that
		 * handles that case, so we fall back on the slow method of having the
		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
		 * trampoline.  ICU locales work just the same on Windows, however.
		 *
		 * Note that this returns without assigning ssup->comparator.
		 */
#ifdef WIN32
		if (GetDatabaseEncoding() == PG_UTF8 &&
			!(locale && locale->provider == COLLPROVIDER_ICU))
			return;
#endif

		/*
		 * We use varlenafastcmp_locale except for type NAME.
		 */
		if (typid == NAMEOID)
		{
			ssup->comparator = namefastcmp_locale;
			/* Not supporting abbreviation with type NAME, for now */
			abbreviate = false;
		}
		else
			ssup->comparator = varlenafastcmp_locale;
	}

	/*
	 * Unfortunately, it seems that abbreviation for non-C collations is
	 * broken on many common platforms; testing of multiple versions of glibc
	 * reveals that, for many locales, strcoll() and strxfrm() do not return
	 * consistent results, which is fatal to this optimization.  While no
	 * libc other than glibc and Cygwin has so far been shown to have a
	 * problem, we take the conservative course of action for right now and
	 * disable this categorically.  (Users who are certain this isn't a
	 * problem on their system can define TRUST_STRXFRM.)
	 *
	 * Even apart from the risk of broken locales, it's possible that there
	 * are platforms where the use of abbreviated keys should be disabled at
	 * compile time.  Having only 4 byte datums could make worst-case
	 * performance drastically more likely, for example.  Moreover, macOS's
	 * strxfrm() implementation is known to not effectively concentrate a
	 * significant amount of entropy from the original string in earlier
	 * transformed blobs.  It's possible that other supported platforms are
	 * similarly encumbered.  So, if we ever get past disabling this
	 * categorically, we may still want or need to disable it for particular
	 * platforms.
	 *
	 * The test below leaves abbreviation enabled for the C collation and for
	 * ICU-provided locales.
	 */
#ifndef TRUST_STRXFRM
	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
		abbreviate = false;
#endif

	/*
	 * If we're using abbreviated keys, or if we're using a locale-aware
	 * comparison, we need to initialize a VarStringSortSupport object.  Both
	 * cases will make use of the temporary buffers we initialize here for
	 * scratch space (and to detect requirement for BpChar semantics from
	 * caller), and the abbreviation case requires additional state.
	 */
	if (abbreviate || !collate_c)
	{
		sss = palloc(sizeof(VarStringSortSupport));
		sss->buf1 = palloc(TEXTBUFLEN);
		sss->buflen1 = TEXTBUFLEN;
		sss->buf2 = palloc(TEXTBUFLEN);
		sss->buflen2 = TEXTBUFLEN;
		/* Start with invalid values */
		sss->last_len1 = -1;
		sss->last_len2 = -1;
		/* Initialize */
		sss->last_returned = 0;
		sss->locale = locale;

		/*
		 * To avoid somehow confusing a strxfrm() blob and an original string,
		 * constantly keep track of the variety of data that buf1 and buf2
		 * currently contain.
		 *
		 * Comparisons may be interleaved with conversion calls.  Frequently,
		 * conversions and comparisons are batched into two distinct phases,
		 * but the correctness of caching cannot hinge upon this.  For
		 * comparison caching, buffer state is only trusted if cache_blob is
		 * found set to false, whereas strxfrm() caching only trusts the state
		 * when cache_blob is found set to true.
		 *
		 * Arbitrarily initialize cache_blob to true.
		 */
		sss->cache_blob = true;
		sss->collate_c = collate_c;
		sss->typid = typid;
		ssup->ssup_extra = sss;

		/*
		 * If possible, plan to use the abbreviated keys optimization.  The
		 * core code may switch back to authoritative comparator should
		 * abbreviation be aborted.
		 *
		 * The comparator chosen above is saved as the full comparator before
		 * ssup->comparator is replaced by the abbreviated-key comparator.
		 */
		if (abbreviate)
		{
			sss->prop_card = 0.20;
			initHyperLogLog(&sss->abbr_card, 10);
			initHyperLogLog(&sss->full_card, 10);
			ssup->abbrev_full_comparator = ssup->comparator;
			ssup->comparator = varstrcmp_abbrev;
			ssup->abbrev_converter = varstr_abbrev_convert;
			ssup->abbrev_abort = varstr_abbrev_abort;
		}
	}
}
2134
2135 /*
2136 * sortsupport comparison func (for C locale case)
2137 */
2138 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)2139 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2140 {
2141 VarString *arg1 = DatumGetVarStringPP(x);
2142 VarString *arg2 = DatumGetVarStringPP(y);
2143 char *a1p,
2144 *a2p;
2145 int len1,
2146 len2,
2147 result;
2148
2149 a1p = VARDATA_ANY(arg1);
2150 a2p = VARDATA_ANY(arg2);
2151
2152 len1 = VARSIZE_ANY_EXHDR(arg1);
2153 len2 = VARSIZE_ANY_EXHDR(arg2);
2154
2155 result = memcmp(a1p, a2p, Min(len1, len2));
2156 if ((result == 0) && (len1 != len2))
2157 result = (len1 < len2) ? -1 : 1;
2158
2159 /* We can't afford to leak memory here. */
2160 if (PointerGetDatum(arg1) != x)
2161 pfree(arg1);
2162 if (PointerGetDatum(arg2) != y)
2163 pfree(arg2);
2164
2165 return result;
2166 }
2167
2168 /*
2169 * sortsupport comparison func (for BpChar C locale case)
2170 *
2171 * BpChar outsources its sortsupport to this module. Specialization for the
2172 * varstr_sortsupport BpChar case, modeled on
2173 * internal_bpchar_pattern_compare().
2174 */
2175 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)2176 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2177 {
2178 BpChar *arg1 = DatumGetBpCharPP(x);
2179 BpChar *arg2 = DatumGetBpCharPP(y);
2180 char *a1p,
2181 *a2p;
2182 int len1,
2183 len2,
2184 result;
2185
2186 a1p = VARDATA_ANY(arg1);
2187 a2p = VARDATA_ANY(arg2);
2188
2189 len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2190 len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2191
2192 result = memcmp(a1p, a2p, Min(len1, len2));
2193 if ((result == 0) && (len1 != len2))
2194 result = (len1 < len2) ? -1 : 1;
2195
2196 /* We can't afford to leak memory here. */
2197 if (PointerGetDatum(arg1) != x)
2198 pfree(arg1);
2199 if (PointerGetDatum(arg2) != y)
2200 pfree(arg2);
2201
2202 return result;
2203 }
2204
2205 /*
2206 * sortsupport comparison func (for NAME C locale case)
2207 */
2208 static int
namefastcmp_c(Datum x,Datum y,SortSupport ssup)2209 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2210 {
2211 Name arg1 = DatumGetName(x);
2212 Name arg2 = DatumGetName(y);
2213
2214 return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2215 }
2216
2217 /*
2218 * sortsupport comparison func (for locale case with all varlena types)
2219 */
2220 static int
varlenafastcmp_locale(Datum x,Datum y,SortSupport ssup)2221 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2222 {
2223 VarString *arg1 = DatumGetVarStringPP(x);
2224 VarString *arg2 = DatumGetVarStringPP(y);
2225 char *a1p,
2226 *a2p;
2227 int len1,
2228 len2,
2229 result;
2230
2231 a1p = VARDATA_ANY(arg1);
2232 a2p = VARDATA_ANY(arg2);
2233
2234 len1 = VARSIZE_ANY_EXHDR(arg1);
2235 len2 = VARSIZE_ANY_EXHDR(arg2);
2236
2237 result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2238
2239 /* We can't afford to leak memory here. */
2240 if (PointerGetDatum(arg1) != x)
2241 pfree(arg1);
2242 if (PointerGetDatum(arg2) != y)
2243 pfree(arg2);
2244
2245 return result;
2246 }
2247
2248 /*
2249 * sortsupport comparison func (for locale case with NAME type)
2250 */
2251 static int
namefastcmp_locale(Datum x,Datum y,SortSupport ssup)2252 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2253 {
2254 Name arg1 = DatumGetName(x);
2255 Name arg2 = DatumGetName(y);
2256
2257 return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2258 NameStr(*arg2), strlen(NameStr(*arg2)),
2259 ssup);
2260 }
2261
2262 /*
2263 * sortsupport comparison func for locale cases
2264 */
2265 static int
varstrfastcmp_locale(char * a1p,int len1,char * a2p,int len2,SortSupport ssup)2266 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2267 {
2268 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2269 int result;
2270 bool arg1_match;
2271
2272 /* Fast pre-check for equality, as discussed in varstr_cmp() */
2273 if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2274 {
2275 /*
2276 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2277 * last_len2. Existing contents of buffers might still be used by
2278 * next call.
2279 *
2280 * It's fine to allow the comparison of BpChar padding bytes here,
2281 * even though that implies that the memcmp() will usually be
2282 * performed for BpChar callers (though multibyte characters could
2283 * still prevent that from occurring). The memcmp() is still very
2284 * cheap, and BpChar's funny semantics have us remove trailing spaces
2285 * (not limited to padding), so we need make no distinction between
2286 * padding space characters and "real" space characters.
2287 */
2288 return 0;
2289 }
2290
2291 if (sss->typid == BPCHAROID)
2292 {
2293 /* Get true number of bytes, ignoring trailing spaces */
2294 len1 = bpchartruelen(a1p, len1);
2295 len2 = bpchartruelen(a2p, len2);
2296 }
2297
2298 if (len1 >= sss->buflen1)
2299 {
2300 pfree(sss->buf1);
2301 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2302 sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2303 }
2304 if (len2 >= sss->buflen2)
2305 {
2306 pfree(sss->buf2);
2307 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2308 sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2309 }
2310
2311 /*
2312 * We're likely to be asked to compare the same strings repeatedly, and
2313 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2314 * comparisons, even though in general there is no reason to think that
2315 * that will work out (every string datum may be unique). Caching does
2316 * not slow things down measurably when it doesn't work out, and can speed
2317 * things up by rather a lot when it does. In part, this is because the
2318 * memcmp() compares data from cachelines that are needed in L1 cache even
2319 * when the last comparison's result cannot be reused.
2320 */
2321 arg1_match = true;
2322 if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2323 {
2324 arg1_match = false;
2325 memcpy(sss->buf1, a1p, len1);
2326 sss->buf1[len1] = '\0';
2327 sss->last_len1 = len1;
2328 }
2329
2330 /*
2331 * If we're comparing the same two strings as last time, we can return the
2332 * same answer without calling strcoll() again. This is more likely than
2333 * it seems (at least with moderate to low cardinality sets), because
2334 * quicksort compares the same pivot against many values.
2335 */
2336 if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2337 {
2338 memcpy(sss->buf2, a2p, len2);
2339 sss->buf2[len2] = '\0';
2340 sss->last_len2 = len2;
2341 }
2342 else if (arg1_match && !sss->cache_blob)
2343 {
2344 /* Use result cached following last actual strcoll() call */
2345 return sss->last_returned;
2346 }
2347
2348 if (sss->locale)
2349 {
2350 if (sss->locale->provider == COLLPROVIDER_ICU)
2351 {
2352 #ifdef USE_ICU
2353 #ifdef HAVE_UCOL_STRCOLLUTF8
2354 if (GetDatabaseEncoding() == PG_UTF8)
2355 {
2356 UErrorCode status;
2357
2358 status = U_ZERO_ERROR;
2359 result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2360 a1p, len1,
2361 a2p, len2,
2362 &status);
2363 if (U_FAILURE(status))
2364 ereport(ERROR,
2365 (errmsg("collation failed: %s", u_errorName(status))));
2366 }
2367 else
2368 #endif
2369 {
2370 int32_t ulen1,
2371 ulen2;
2372 UChar *uchar1,
2373 *uchar2;
2374
2375 ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2376 ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2377
2378 result = ucol_strcoll(sss->locale->info.icu.ucol,
2379 uchar1, ulen1,
2380 uchar2, ulen2);
2381
2382 pfree(uchar1);
2383 pfree(uchar2);
2384 }
2385 #else /* not USE_ICU */
2386 /* shouldn't happen */
2387 elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2388 #endif /* not USE_ICU */
2389 }
2390 else
2391 {
2392 #ifdef HAVE_LOCALE_T
2393 result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2394 #else
2395 /* shouldn't happen */
2396 elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2397 #endif
2398 }
2399 }
2400 else
2401 result = strcoll(sss->buf1, sss->buf2);
2402
2403 /* Break tie if necessary. */
2404 if (result == 0 &&
2405 (!sss->locale || sss->locale->deterministic))
2406 result = strcmp(sss->buf1, sss->buf2);
2407
2408 /* Cache result, perhaps saving an expensive strcoll() call next time */
2409 sss->cache_blob = false;
2410 sss->last_returned = result;
2411 return result;
2412 }
2413
2414 /*
2415 * Abbreviated key comparison func
2416 */
2417 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2418 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2419 {
2420 /*
2421 * When 0 is returned, the core system will call varstrfastcmp_c()
2422 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2423 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2424 * authoritatively, for the same reason that there is a strcoll()
2425 * tie-breaker call to strcmp() in varstr_cmp().
2426 */
2427 if (x > y)
2428 return 1;
2429 else if (x == y)
2430 return 0;
2431 else
2432 return -1;
2433 }
2434
2435 /*
2436 * Conversion routine for sortsupport. Converts original to abbreviated key
2437 * representation. Our encoding strategy is simple -- pack the first 8 bytes
2438 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2439 * stored in reverse order), and treat it as an unsigned integer. When the "C"
2440 * locale is used, or in case of bytea, just memcpy() from original instead.
2441 */
2442 static Datum
varstr_abbrev_convert(Datum original,SortSupport ssup)2443 varstr_abbrev_convert(Datum original, SortSupport ssup)
2444 {
2445 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2446 VarString *authoritative = DatumGetVarStringPP(original);
2447 char *authoritative_data = VARDATA_ANY(authoritative);
2448
2449 /* working state */
2450 Datum res;
2451 char *pres;
2452 int len;
2453 uint32 hash;
2454
2455 pres = (char *) &res;
2456 /* memset(), so any non-overwritten bytes are NUL */
2457 memset(pres, 0, sizeof(Datum));
2458 len = VARSIZE_ANY_EXHDR(authoritative);
2459
2460 /* Get number of bytes, ignoring trailing spaces */
2461 if (sss->typid == BPCHAROID)
2462 len = bpchartruelen(authoritative_data, len);
2463
2464 /*
2465 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2466 * abbreviate keys. The full comparator for the C locale is always
2467 * memcmp(). It would be incorrect to allow bytea callers (callers that
2468 * always force the C collation -- bytea isn't a collatable type, but this
2469 * approach is convenient) to use strxfrm(). This is because bytea
2470 * strings may contain NUL bytes. Besides, this should be faster, too.
2471 *
2472 * More generally, it's okay that bytea callers can have NUL bytes in
2473 * strings because varstrcmp_abbrev() need not make a distinction between
2474 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2475 * authoritative representation. Hopefully a comparison at or past one
2476 * abbreviated key's terminating NUL byte will resolve the comparison
2477 * without consulting the authoritative representation; specifically, some
2478 * later non-NUL byte in the longer string can resolve the comparison
2479 * against a subsequent terminating NUL in the shorter string. There will
2480 * usually be what is effectively a "length-wise" resolution there and
2481 * then.
2482 *
2483 * If that doesn't work out -- if all bytes in the longer string
2484 * positioned at or past the offset of the smaller string's (first)
2485 * terminating NUL are actually representative of NUL bytes in the
2486 * authoritative binary string (perhaps with some *terminating* NUL bytes
2487 * towards the end of the longer string iff it happens to still be small)
2488 * -- then an authoritative tie-breaker will happen, and do the right
2489 * thing: explicitly consider string length.
2490 */
2491 if (sss->collate_c)
2492 memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2493 else
2494 {
2495 Size bsize;
2496 #ifdef USE_ICU
2497 int32_t ulen = -1;
2498 UChar *uchar = NULL;
2499 #endif
2500
2501 /*
2502 * We're not using the C collation, so fall back on strxfrm or ICU
2503 * analogs.
2504 */
2505
2506 /* By convention, we use buffer 1 to store and NUL-terminate */
2507 if (len >= sss->buflen1)
2508 {
2509 pfree(sss->buf1);
2510 sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2511 sss->buf1 = palloc(sss->buflen1);
2512 }
2513
2514 /* Might be able to reuse strxfrm() blob from last call */
2515 if (sss->last_len1 == len && sss->cache_blob &&
2516 memcmp(sss->buf1, authoritative_data, len) == 0)
2517 {
2518 memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2519 /* No change affecting cardinality, so no hashing required */
2520 goto done;
2521 }
2522
2523 memcpy(sss->buf1, authoritative_data, len);
2524
2525 /*
2526 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2527 * necessary for ICU, but doesn't hurt.
2528 */
2529 sss->buf1[len] = '\0';
2530 sss->last_len1 = len;
2531
2532 #ifdef USE_ICU
2533 /* When using ICU and not UTF8, convert string to UChar. */
2534 if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2535 GetDatabaseEncoding() != PG_UTF8)
2536 ulen = icu_to_uchar(&uchar, sss->buf1, len);
2537 #endif
2538
2539 /*
2540 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2541 * and try again. Both of these functions have the result buffer
2542 * content undefined if the result did not fit, so we need to retry
2543 * until everything fits, even though we only need the first few bytes
2544 * in the end. When using ucol_nextSortKeyPart(), however, we only
2545 * ask for as many bytes as we actually need.
2546 */
2547 for (;;)
2548 {
2549 #ifdef USE_ICU
2550 if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2551 {
2552 /*
2553 * When using UTF8, use the iteration interface so we only
2554 * need to produce as many bytes as we actually need.
2555 */
2556 if (GetDatabaseEncoding() == PG_UTF8)
2557 {
2558 UCharIterator iter;
2559 uint32_t state[2];
2560 UErrorCode status;
2561
2562 uiter_setUTF8(&iter, sss->buf1, len);
2563 state[0] = state[1] = 0; /* won't need that again */
2564 status = U_ZERO_ERROR;
2565 bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2566 &iter,
2567 state,
2568 (uint8_t *) sss->buf2,
2569 Min(sizeof(Datum), sss->buflen2),
2570 &status);
2571 if (U_FAILURE(status))
2572 ereport(ERROR,
2573 (errmsg("sort key generation failed: %s",
2574 u_errorName(status))));
2575 }
2576 else
2577 bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2578 uchar, ulen,
2579 (uint8_t *) sss->buf2, sss->buflen2);
2580 }
2581 else
2582 #endif
2583 #ifdef HAVE_LOCALE_T
2584 if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2585 bsize = strxfrm_l(sss->buf2, sss->buf1,
2586 sss->buflen2, sss->locale->info.lt);
2587 else
2588 #endif
2589 bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2590
2591 sss->last_len2 = bsize;
2592 if (bsize < sss->buflen2)
2593 break;
2594
2595 /*
2596 * Grow buffer and retry.
2597 */
2598 pfree(sss->buf2);
2599 sss->buflen2 = Max(bsize + 1,
2600 Min(sss->buflen2 * 2, MaxAllocSize));
2601 sss->buf2 = palloc(sss->buflen2);
2602 }
2603
2604 /*
2605 * Every Datum byte is always compared. This is safe because the
2606 * strxfrm() blob is itself NUL terminated, leaving no danger of
2607 * misinterpreting any NUL bytes not intended to be interpreted as
2608 * logically representing termination.
2609 *
2610 * (Actually, even if there were NUL bytes in the blob it would be
2611 * okay. See remarks on bytea case above.)
2612 */
2613 memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2614
2615 #ifdef USE_ICU
2616 if (uchar)
2617 pfree(uchar);
2618 #endif
2619 }
2620
2621 /*
2622 * Maintain approximate cardinality of both abbreviated keys and original,
2623 * authoritative keys using HyperLogLog. Used as cheap insurance against
2624 * the worst case, where we do many string transformations for no saving
2625 * in full strcoll()-based comparisons. These statistics are used by
2626 * varstr_abbrev_abort().
2627 *
2628 * First, Hash key proper, or a significant fraction of it. Mix in length
2629 * in order to compensate for cases where differences are past
2630 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2631 */
2632 hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2633 Min(len, PG_CACHE_LINE_SIZE)));
2634
2635 if (len > PG_CACHE_LINE_SIZE)
2636 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2637
2638 addHyperLogLog(&sss->full_card, hash);
2639
2640 /* Hash abbreviated key */
2641 #if SIZEOF_DATUM == 8
2642 {
2643 uint32 lohalf,
2644 hihalf;
2645
2646 lohalf = (uint32) res;
2647 hihalf = (uint32) (res >> 32);
2648 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2649 }
2650 #else /* SIZEOF_DATUM != 8 */
2651 hash = DatumGetUInt32(hash_uint32((uint32) res));
2652 #endif
2653
2654 addHyperLogLog(&sss->abbr_card, hash);
2655
2656 /* Cache result, perhaps saving an expensive strxfrm() call next time */
2657 sss->cache_blob = true;
2658 done:
2659
2660 /*
2661 * Byteswap on little-endian machines.
2662 *
2663 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2664 * comparator) works correctly on all platforms. If we didn't do this,
2665 * the comparator would have to call memcmp() with a pair of pointers to
2666 * the first byte of each abbreviated key, which is slower.
2667 */
2668 res = DatumBigEndianToNative(res);
2669
2670 /* Don't leak memory here */
2671 if (PointerGetDatum(authoritative) != original)
2672 pfree(authoritative);
2673
2674 return res;
2675 }
2676
2677 /*
2678 * Callback for estimating effectiveness of abbreviated key optimization, using
2679 * heuristic rules. Returns value indicating if the abbreviation optimization
2680 * should be aborted, based on its projected effectiveness.
2681 */
2682 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2683 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2684 {
2685 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2686 double abbrev_distinct,
2687 key_distinct;
2688
2689 Assert(ssup->abbreviate);
2690
2691 /* Have a little patience */
2692 if (memtupcount < 100)
2693 return false;
2694
2695 abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2696 key_distinct = estimateHyperLogLog(&sss->full_card);
2697
2698 /*
2699 * Clamp cardinality estimates to at least one distinct value. While
2700 * NULLs are generally disregarded, if only NULL values were seen so far,
2701 * that might misrepresent costs if we failed to clamp.
2702 */
2703 if (abbrev_distinct <= 1.0)
2704 abbrev_distinct = 1.0;
2705
2706 if (key_distinct <= 1.0)
2707 key_distinct = 1.0;
2708
2709 /*
2710 * In the worst case all abbreviated keys are identical, while at the same
2711 * time there are differences within full key strings not captured in
2712 * abbreviations.
2713 */
2714 #ifdef TRACE_SORT
2715 if (trace_sort)
2716 {
2717 double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2718
2719 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2720 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2721 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2722 sss->prop_card);
2723 }
2724 #endif
2725
2726 /*
2727 * If the number of distinct abbreviated keys approximately matches the
2728 * number of distinct authoritative original keys, that's reason enough to
2729 * proceed. We can win even with a very low cardinality set if most
2730 * tie-breakers only memcmp(). This is by far the most important
2731 * consideration.
2732 *
2733 * While comparisons that are resolved at the abbreviated key level are
2734 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2735 * those two outcomes are so much cheaper than a full strcoll() once
2736 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2737 * cardinality against the overall size of the set in order to more
2738 * accurately model costs. Assume that an abbreviated comparison, and an
2739 * abbreviated comparison with a cheap memcmp()-based authoritative
2740 * resolution are equivalent.
2741 */
2742 if (abbrev_distinct > key_distinct * sss->prop_card)
2743 {
2744 /*
2745 * When we have exceeded 10,000 tuples, decay required cardinality
2746 * aggressively for next call.
2747 *
2748 * This is useful because the number of comparisons required on
2749 * average increases at a linearithmic rate, and at roughly 10,000
2750 * tuples that factor will start to dominate over the linear costs of
2751 * string transformation (this is a conservative estimate). The decay
2752 * rate is chosen to be a little less aggressive than halving -- which
2753 * (since we're called at points at which memtupcount has doubled)
2754 * would never see the cost model actually abort past the first call
2755 * following a decay. This decay rate is mostly a precaution against
2756 * a sudden, violent swing in how well abbreviated cardinality tracks
2757 * full key cardinality. The decay also serves to prevent a marginal
2758 * case from being aborted too late, when too much has already been
2759 * invested in string transformation.
2760 *
2761 * It's possible for sets of several million distinct strings with
2762 * mere tens of thousands of distinct abbreviated keys to still
2763 * benefit very significantly. This will generally occur provided
2764 * each abbreviated key is a proxy for a roughly uniform number of the
2765 * set's full keys. If it isn't so, we hope to catch that early and
2766 * abort. If it isn't caught early, by the time the problem is
2767 * apparent it's probably not worth aborting.
2768 */
2769 if (memtupcount > 10000)
2770 sss->prop_card *= 0.65;
2771
2772 return false;
2773 }
2774
2775 /*
2776 * Abort abbreviation strategy.
2777 *
2778 * The worst case, where all abbreviated keys are identical while all
2779 * original strings differ will typically only see a regression of about
2780 * 10% in execution time for small to medium sized lists of strings.
2781 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2782 * often expect very large improvements, particularly with sets of strings
2783 * of moderately high to high abbreviated cardinality. There is little to
2784 * lose but much to gain, which our strategy reflects.
2785 */
2786 #ifdef TRACE_SORT
2787 if (trace_sort)
2788 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2789 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2790 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2791 #endif
2792
2793 return true;
2794 }
2795
2796 Datum
text_larger(PG_FUNCTION_ARGS)2797 text_larger(PG_FUNCTION_ARGS)
2798 {
2799 text *arg1 = PG_GETARG_TEXT_PP(0);
2800 text *arg2 = PG_GETARG_TEXT_PP(1);
2801 text *result;
2802
2803 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2804
2805 PG_RETURN_TEXT_P(result);
2806 }
2807
2808 Datum
text_smaller(PG_FUNCTION_ARGS)2809 text_smaller(PG_FUNCTION_ARGS)
2810 {
2811 text *arg1 = PG_GETARG_TEXT_PP(0);
2812 text *arg2 = PG_GETARG_TEXT_PP(1);
2813 text *result;
2814
2815 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2816
2817 PG_RETURN_TEXT_P(result);
2818 }
2819
2820
2821 /*
2822 * Cross-type comparison functions for types text and name.
2823 */
2824
2825 Datum
nameeqtext(PG_FUNCTION_ARGS)2826 nameeqtext(PG_FUNCTION_ARGS)
2827 {
2828 Name arg1 = PG_GETARG_NAME(0);
2829 text *arg2 = PG_GETARG_TEXT_PP(1);
2830 size_t len1 = strlen(NameStr(*arg1));
2831 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2832 Oid collid = PG_GET_COLLATION();
2833 bool result;
2834
2835 check_collation_set(collid);
2836
2837 if (collid == C_COLLATION_OID)
2838 result = (len1 == len2 &&
2839 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2840 else
2841 result = (varstr_cmp(NameStr(*arg1), len1,
2842 VARDATA_ANY(arg2), len2,
2843 collid) == 0);
2844
2845 PG_FREE_IF_COPY(arg2, 1);
2846
2847 PG_RETURN_BOOL(result);
2848 }
2849
2850 Datum
texteqname(PG_FUNCTION_ARGS)2851 texteqname(PG_FUNCTION_ARGS)
2852 {
2853 text *arg1 = PG_GETARG_TEXT_PP(0);
2854 Name arg2 = PG_GETARG_NAME(1);
2855 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2856 size_t len2 = strlen(NameStr(*arg2));
2857 Oid collid = PG_GET_COLLATION();
2858 bool result;
2859
2860 check_collation_set(collid);
2861
2862 if (collid == C_COLLATION_OID)
2863 result = (len1 == len2 &&
2864 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2865 else
2866 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2867 NameStr(*arg2), len2,
2868 collid) == 0);
2869
2870 PG_FREE_IF_COPY(arg1, 0);
2871
2872 PG_RETURN_BOOL(result);
2873 }
2874
2875 Datum
namenetext(PG_FUNCTION_ARGS)2876 namenetext(PG_FUNCTION_ARGS)
2877 {
2878 Name arg1 = PG_GETARG_NAME(0);
2879 text *arg2 = PG_GETARG_TEXT_PP(1);
2880 size_t len1 = strlen(NameStr(*arg1));
2881 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2882 Oid collid = PG_GET_COLLATION();
2883 bool result;
2884
2885 check_collation_set(collid);
2886
2887 if (collid == C_COLLATION_OID)
2888 result = !(len1 == len2 &&
2889 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2890 else
2891 result = !(varstr_cmp(NameStr(*arg1), len1,
2892 VARDATA_ANY(arg2), len2,
2893 collid) == 0);
2894
2895 PG_FREE_IF_COPY(arg2, 1);
2896
2897 PG_RETURN_BOOL(result);
2898 }
2899
2900 Datum
textnename(PG_FUNCTION_ARGS)2901 textnename(PG_FUNCTION_ARGS)
2902 {
2903 text *arg1 = PG_GETARG_TEXT_PP(0);
2904 Name arg2 = PG_GETARG_NAME(1);
2905 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2906 size_t len2 = strlen(NameStr(*arg2));
2907 Oid collid = PG_GET_COLLATION();
2908 bool result;
2909
2910 check_collation_set(collid);
2911
2912 if (collid == C_COLLATION_OID)
2913 result = !(len1 == len2 &&
2914 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2915 else
2916 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2917 NameStr(*arg2), len2,
2918 collid) == 0);
2919
2920 PG_FREE_IF_COPY(arg1, 0);
2921
2922 PG_RETURN_BOOL(result);
2923 }
2924
2925 Datum
btnametextcmp(PG_FUNCTION_ARGS)2926 btnametextcmp(PG_FUNCTION_ARGS)
2927 {
2928 Name arg1 = PG_GETARG_NAME(0);
2929 text *arg2 = PG_GETARG_TEXT_PP(1);
2930 int32 result;
2931
2932 result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2933 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2934 PG_GET_COLLATION());
2935
2936 PG_FREE_IF_COPY(arg2, 1);
2937
2938 PG_RETURN_INT32(result);
2939 }
2940
2941 Datum
bttextnamecmp(PG_FUNCTION_ARGS)2942 bttextnamecmp(PG_FUNCTION_ARGS)
2943 {
2944 text *arg1 = PG_GETARG_TEXT_PP(0);
2945 Name arg2 = PG_GETARG_NAME(1);
2946 int32 result;
2947
2948 result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2949 NameStr(*arg2), strlen(NameStr(*arg2)),
2950 PG_GET_COLLATION());
2951
2952 PG_FREE_IF_COPY(arg1, 0);
2953
2954 PG_RETURN_INT32(result);
2955 }
2956
2957 #define CmpCall(cmpfunc) \
2958 DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2959 PG_GET_COLLATION(), \
2960 PG_GETARG_DATUM(0), \
2961 PG_GETARG_DATUM(1)))
2962
2963 Datum
namelttext(PG_FUNCTION_ARGS)2964 namelttext(PG_FUNCTION_ARGS)
2965 {
2966 PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2967 }
2968
2969 Datum
nameletext(PG_FUNCTION_ARGS)2970 nameletext(PG_FUNCTION_ARGS)
2971 {
2972 PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2973 }
2974
2975 Datum
namegttext(PG_FUNCTION_ARGS)2976 namegttext(PG_FUNCTION_ARGS)
2977 {
2978 PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2979 }
2980
2981 Datum
namegetext(PG_FUNCTION_ARGS)2982 namegetext(PG_FUNCTION_ARGS)
2983 {
2984 PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2985 }
2986
2987 Datum
textltname(PG_FUNCTION_ARGS)2988 textltname(PG_FUNCTION_ARGS)
2989 {
2990 PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2991 }
2992
2993 Datum
textlename(PG_FUNCTION_ARGS)2994 textlename(PG_FUNCTION_ARGS)
2995 {
2996 PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2997 }
2998
2999 Datum
textgtname(PG_FUNCTION_ARGS)3000 textgtname(PG_FUNCTION_ARGS)
3001 {
3002 PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3003 }
3004
3005 Datum
textgename(PG_FUNCTION_ARGS)3006 textgename(PG_FUNCTION_ARGS)
3007 {
3008 PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3009 }
3010
3011 #undef CmpCall
3012
3013
3014 /*
3015 * The following operators support character-by-character comparison
3016 * of text datums, to allow building indexes suitable for LIKE clauses.
3017 * Note that the regular texteq/textne comparison operators, and regular
3018 * support functions 1 and 2 with "C" collation are assumed to be
3019 * compatible with these!
3020 */
3021
3022 static int
internal_text_pattern_compare(text * arg1,text * arg2)3023 internal_text_pattern_compare(text *arg1, text *arg2)
3024 {
3025 int result;
3026 int len1,
3027 len2;
3028
3029 len1 = VARSIZE_ANY_EXHDR(arg1);
3030 len2 = VARSIZE_ANY_EXHDR(arg2);
3031
3032 result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3033 if (result != 0)
3034 return result;
3035 else if (len1 < len2)
3036 return -1;
3037 else if (len1 > len2)
3038 return 1;
3039 else
3040 return 0;
3041 }
3042
3043
3044 Datum
text_pattern_lt(PG_FUNCTION_ARGS)3045 text_pattern_lt(PG_FUNCTION_ARGS)
3046 {
3047 text *arg1 = PG_GETARG_TEXT_PP(0);
3048 text *arg2 = PG_GETARG_TEXT_PP(1);
3049 int result;
3050
3051 result = internal_text_pattern_compare(arg1, arg2);
3052
3053 PG_FREE_IF_COPY(arg1, 0);
3054 PG_FREE_IF_COPY(arg2, 1);
3055
3056 PG_RETURN_BOOL(result < 0);
3057 }
3058
3059
3060 Datum
text_pattern_le(PG_FUNCTION_ARGS)3061 text_pattern_le(PG_FUNCTION_ARGS)
3062 {
3063 text *arg1 = PG_GETARG_TEXT_PP(0);
3064 text *arg2 = PG_GETARG_TEXT_PP(1);
3065 int result;
3066
3067 result = internal_text_pattern_compare(arg1, arg2);
3068
3069 PG_FREE_IF_COPY(arg1, 0);
3070 PG_FREE_IF_COPY(arg2, 1);
3071
3072 PG_RETURN_BOOL(result <= 0);
3073 }
3074
3075
3076 Datum
text_pattern_ge(PG_FUNCTION_ARGS)3077 text_pattern_ge(PG_FUNCTION_ARGS)
3078 {
3079 text *arg1 = PG_GETARG_TEXT_PP(0);
3080 text *arg2 = PG_GETARG_TEXT_PP(1);
3081 int result;
3082
3083 result = internal_text_pattern_compare(arg1, arg2);
3084
3085 PG_FREE_IF_COPY(arg1, 0);
3086 PG_FREE_IF_COPY(arg2, 1);
3087
3088 PG_RETURN_BOOL(result >= 0);
3089 }
3090
3091
3092 Datum
text_pattern_gt(PG_FUNCTION_ARGS)3093 text_pattern_gt(PG_FUNCTION_ARGS)
3094 {
3095 text *arg1 = PG_GETARG_TEXT_PP(0);
3096 text *arg2 = PG_GETARG_TEXT_PP(1);
3097 int result;
3098
3099 result = internal_text_pattern_compare(arg1, arg2);
3100
3101 PG_FREE_IF_COPY(arg1, 0);
3102 PG_FREE_IF_COPY(arg2, 1);
3103
3104 PG_RETURN_BOOL(result > 0);
3105 }
3106
3107
3108 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)3109 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3110 {
3111 text *arg1 = PG_GETARG_TEXT_PP(0);
3112 text *arg2 = PG_GETARG_TEXT_PP(1);
3113 int result;
3114
3115 result = internal_text_pattern_compare(arg1, arg2);
3116
3117 PG_FREE_IF_COPY(arg1, 0);
3118 PG_FREE_IF_COPY(arg2, 1);
3119
3120 PG_RETURN_INT32(result);
3121 }
3122
3123
3124 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)3125 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3126 {
3127 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3128 MemoryContext oldcontext;
3129
3130 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3131
3132 /* Use generic string SortSupport, forcing "C" collation */
3133 varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3134
3135 MemoryContextSwitchTo(oldcontext);
3136
3137 PG_RETURN_VOID();
3138 }
3139
3140
3141 /*-------------------------------------------------------------
3142 * byteaoctetlen
3143 *
3144 * get the number of bytes contained in an instance of type 'bytea'
3145 *-------------------------------------------------------------
3146 */
3147 Datum
byteaoctetlen(PG_FUNCTION_ARGS)3148 byteaoctetlen(PG_FUNCTION_ARGS)
3149 {
3150 Datum str = PG_GETARG_DATUM(0);
3151
3152 /* We need not detoast the input at all */
3153 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3154 }
3155
3156 /*
3157 * byteacat -
3158 * takes two bytea* and returns a bytea* that is the concatenation of
3159 * the two.
3160 *
3161 * Cloned from textcat and modified as required.
3162 */
3163 Datum
byteacat(PG_FUNCTION_ARGS)3164 byteacat(PG_FUNCTION_ARGS)
3165 {
3166 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3167 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3168
3169 PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3170 }
3171
3172 /*
3173 * bytea_catenate
3174 * Guts of byteacat(), broken out so it can be used by other functions
3175 *
3176 * Arguments can be in short-header form, but not compressed or out-of-line
3177 */
3178 static bytea *
bytea_catenate(bytea * t1,bytea * t2)3179 bytea_catenate(bytea *t1, bytea *t2)
3180 {
3181 bytea *result;
3182 int len1,
3183 len2,
3184 len;
3185 char *ptr;
3186
3187 len1 = VARSIZE_ANY_EXHDR(t1);
3188 len2 = VARSIZE_ANY_EXHDR(t2);
3189
3190 /* paranoia ... probably should throw error instead? */
3191 if (len1 < 0)
3192 len1 = 0;
3193 if (len2 < 0)
3194 len2 = 0;
3195
3196 len = len1 + len2 + VARHDRSZ;
3197 result = (bytea *) palloc(len);
3198
3199 /* Set size of result string... */
3200 SET_VARSIZE(result, len);
3201
3202 /* Fill data field of result string... */
3203 ptr = VARDATA(result);
3204 if (len1 > 0)
3205 memcpy(ptr, VARDATA_ANY(t1), len1);
3206 if (len2 > 0)
3207 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3208
3209 return result;
3210 }
3211
/* Convert a C string to a bytea pointer by passing it through byteain(). */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3214
3215 /*
3216 * bytea_substr()
3217 * Return a substring starting at the specified position.
3218 * Cloned from text_substr and modified as required.
3219 *
3220 * Input:
3221 * - string
3222 * - starting position (is one-based)
3223 * - string length (optional)
3224 *
3225 * If the starting position is zero or less, then return from the start of the string
3226 * adjusting the length to be consistent with the "negative start" per SQL.
3227 * If the length is less than zero, an ERROR is thrown. If no third argument
3228 * (length) is provided, the length to the end of the string is assumed.
3229 */
3230 Datum
bytea_substr(PG_FUNCTION_ARGS)3231 bytea_substr(PG_FUNCTION_ARGS)
3232 {
3233 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3234 PG_GETARG_INT32(1),
3235 PG_GETARG_INT32(2),
3236 false));
3237 }
3238
3239 /*
3240 * bytea_substr_no_len -
3241 * Wrapper to avoid opr_sanity failure due to
3242 * one function accepting a different number of args.
3243 */
3244 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)3245 bytea_substr_no_len(PG_FUNCTION_ARGS)
3246 {
3247 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3248 PG_GETARG_INT32(1),
3249 -1,
3250 true));
3251 }
3252
3253 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)3254 bytea_substring(Datum str,
3255 int S,
3256 int L,
3257 bool length_not_specified)
3258 {
3259 int32 S1; /* adjusted start position */
3260 int32 L1; /* adjusted substring length */
3261 int32 E; /* end position */
3262
3263 /*
3264 * The logic here should generally match text_substring().
3265 */
3266 S1 = Max(S, 1);
3267
3268 if (length_not_specified)
3269 {
3270 /*
3271 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3272 * end of the string if we pass it a negative value for length.
3273 */
3274 L1 = -1;
3275 }
3276 else if (L < 0)
3277 {
3278 /* SQL99 says to throw an error for E < S, i.e., negative length */
3279 ereport(ERROR,
3280 (errcode(ERRCODE_SUBSTRING_ERROR),
3281 errmsg("negative substring length not allowed")));
3282 L1 = -1; /* silence stupider compilers */
3283 }
3284 else if (pg_add_s32_overflow(S, L, &E))
3285 {
3286 /*
3287 * L could be large enough for S + L to overflow, in which case the
3288 * substring must run to end of string.
3289 */
3290 L1 = -1;
3291 }
3292 else
3293 {
3294 /*
3295 * A zero or negative value for the end position can happen if the
3296 * start was negative or one. SQL99 says to return a zero-length
3297 * string.
3298 */
3299 if (E < 1)
3300 return PG_STR_GET_BYTEA("");
3301
3302 L1 = E - S1;
3303 }
3304
3305 /*
3306 * If the start position is past the end of the string, SQL99 says to
3307 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3308 * us. We need only convert S1 to zero-based starting position.
3309 */
3310 return DatumGetByteaPSlice(str, S1 - 1, L1);
3311 }
3312
3313 /*
3314 * byteaoverlay
3315 * Replace specified substring of first string with second
3316 *
3317 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3318 * This code is a direct implementation of what the standard says.
3319 */
3320 Datum
byteaoverlay(PG_FUNCTION_ARGS)3321 byteaoverlay(PG_FUNCTION_ARGS)
3322 {
3323 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3324 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3325 int sp = PG_GETARG_INT32(2); /* substring start position */
3326 int sl = PG_GETARG_INT32(3); /* substring length */
3327
3328 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3329 }
3330
3331 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)3332 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3333 {
3334 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3335 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3336 int sp = PG_GETARG_INT32(2); /* substring start position */
3337 int sl;
3338
3339 sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3340 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3341 }
3342
3343 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)3344 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3345 {
3346 bytea *result;
3347 bytea *s1;
3348 bytea *s2;
3349 int sp_pl_sl;
3350
3351 /*
3352 * Check for possible integer-overflow cases. For negative sp, throw a
3353 * "substring length" error because that's what should be expected
3354 * according to the spec's definition of OVERLAY().
3355 */
3356 if (sp <= 0)
3357 ereport(ERROR,
3358 (errcode(ERRCODE_SUBSTRING_ERROR),
3359 errmsg("negative substring length not allowed")));
3360 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3361 ereport(ERROR,
3362 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3363 errmsg("integer out of range")));
3364
3365 s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3366 s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3367 result = bytea_catenate(s1, t2);
3368 result = bytea_catenate(result, s2);
3369
3370 return result;
3371 }
3372
3373 /*
3374 * byteapos -
3375 * Return the position of the specified substring.
3376 * Implements the SQL POSITION() function.
3377 * Cloned from textpos and modified as required.
3378 */
3379 Datum
byteapos(PG_FUNCTION_ARGS)3380 byteapos(PG_FUNCTION_ARGS)
3381 {
3382 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3383 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3384 int pos;
3385 int px,
3386 p;
3387 int len1,
3388 len2;
3389 char *p1,
3390 *p2;
3391
3392 len1 = VARSIZE_ANY_EXHDR(t1);
3393 len2 = VARSIZE_ANY_EXHDR(t2);
3394
3395 if (len2 <= 0)
3396 PG_RETURN_INT32(1); /* result for empty pattern */
3397
3398 p1 = VARDATA_ANY(t1);
3399 p2 = VARDATA_ANY(t2);
3400
3401 pos = 0;
3402 px = (len1 - len2);
3403 for (p = 0; p <= px; p++)
3404 {
3405 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3406 {
3407 pos = p + 1;
3408 break;
3409 };
3410 p1++;
3411 };
3412
3413 PG_RETURN_INT32(pos);
3414 }
3415
3416 /*-------------------------------------------------------------
3417 * byteaGetByte
3418 *
3419 * this routine treats "bytea" as an array of bytes.
3420 * It returns the Nth byte (a number between 0 and 255).
3421 *-------------------------------------------------------------
3422 */
3423 Datum
byteaGetByte(PG_FUNCTION_ARGS)3424 byteaGetByte(PG_FUNCTION_ARGS)
3425 {
3426 bytea *v = PG_GETARG_BYTEA_PP(0);
3427 int32 n = PG_GETARG_INT32(1);
3428 int len;
3429 int byte;
3430
3431 len = VARSIZE_ANY_EXHDR(v);
3432
3433 if (n < 0 || n >= len)
3434 ereport(ERROR,
3435 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3436 errmsg("index %d out of valid range, 0..%d",
3437 n, len - 1)));
3438
3439 byte = ((unsigned char *) VARDATA_ANY(v))[n];
3440
3441 PG_RETURN_INT32(byte);
3442 }
3443
3444 /*-------------------------------------------------------------
3445 * byteaGetBit
3446 *
3447 * This routine treats a "bytea" type like an array of bits.
3448 * It returns the value of the Nth bit (0 or 1).
3449 *
3450 *-------------------------------------------------------------
3451 */
3452 Datum
byteaGetBit(PG_FUNCTION_ARGS)3453 byteaGetBit(PG_FUNCTION_ARGS)
3454 {
3455 bytea *v = PG_GETARG_BYTEA_PP(0);
3456 int32 n = PG_GETARG_INT32(1);
3457 int byteNo,
3458 bitNo;
3459 int len;
3460 int byte;
3461
3462 len = VARSIZE_ANY_EXHDR(v);
3463
3464 /* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
3465 if (n < 0 || n >= (int64) len * 8)
3466 ereport(ERROR,
3467 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3468 errmsg("index %d out of valid range, 0..%d",
3469 n, (int) Min((int64) len * 8 - 1, INT_MAX))));
3470
3471 byteNo = n / 8;
3472 bitNo = n % 8;
3473
3474 byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3475
3476 if (byte & (1 << bitNo))
3477 PG_RETURN_INT32(1);
3478 else
3479 PG_RETURN_INT32(0);
3480 }
3481
3482 /*-------------------------------------------------------------
3483 * byteaSetByte
3484 *
3485 * Given an instance of type 'bytea' creates a new one with
3486 * the Nth byte set to the given value.
3487 *
3488 *-------------------------------------------------------------
3489 */
3490 Datum
byteaSetByte(PG_FUNCTION_ARGS)3491 byteaSetByte(PG_FUNCTION_ARGS)
3492 {
3493 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3494 int32 n = PG_GETARG_INT32(1);
3495 int32 newByte = PG_GETARG_INT32(2);
3496 int len;
3497
3498 len = VARSIZE(res) - VARHDRSZ;
3499
3500 if (n < 0 || n >= len)
3501 ereport(ERROR,
3502 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3503 errmsg("index %d out of valid range, 0..%d",
3504 n, len - 1)));
3505
3506 /*
3507 * Now set the byte.
3508 */
3509 ((unsigned char *) VARDATA(res))[n] = newByte;
3510
3511 PG_RETURN_BYTEA_P(res);
3512 }
3513
3514 /*-------------------------------------------------------------
3515 * byteaSetBit
3516 *
3517 * Given an instance of type 'bytea' creates a new one with
3518 * the Nth bit set to the given value.
3519 *
3520 *-------------------------------------------------------------
3521 */
3522 Datum
byteaSetBit(PG_FUNCTION_ARGS)3523 byteaSetBit(PG_FUNCTION_ARGS)
3524 {
3525 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3526 int32 n = PG_GETARG_INT32(1);
3527 int32 newBit = PG_GETARG_INT32(2);
3528 int len;
3529 int oldByte,
3530 newByte;
3531 int byteNo,
3532 bitNo;
3533
3534 len = VARSIZE(res) - VARHDRSZ;
3535
3536 /* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
3537 if (n < 0 || n >= (int64) len * 8)
3538 ereport(ERROR,
3539 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3540 errmsg("index %d out of valid range, 0..%d",
3541 n, (int) Min((int64) len * 8 - 1, INT_MAX))));
3542
3543 byteNo = n / 8;
3544 bitNo = n % 8;
3545
3546 /*
3547 * sanity check!
3548 */
3549 if (newBit != 0 && newBit != 1)
3550 ereport(ERROR,
3551 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3552 errmsg("new bit must be 0 or 1")));
3553
3554 /*
3555 * Update the byte.
3556 */
3557 oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3558
3559 if (newBit == 0)
3560 newByte = oldByte & (~(1 << bitNo));
3561 else
3562 newByte = oldByte | (1 << bitNo);
3563
3564 ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3565
3566 PG_RETURN_BYTEA_P(res);
3567 }
3568
3569
3570 /* text_name()
3571 * Converts a text type to a Name type.
3572 */
3573 Datum
text_name(PG_FUNCTION_ARGS)3574 text_name(PG_FUNCTION_ARGS)
3575 {
3576 text *s = PG_GETARG_TEXT_PP(0);
3577 Name result;
3578 int len;
3579
3580 len = VARSIZE_ANY_EXHDR(s);
3581
3582 /* Truncate oversize input */
3583 if (len >= NAMEDATALEN)
3584 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3585
3586 /* We use palloc0 here to ensure result is zero-padded */
3587 result = (Name) palloc0(NAMEDATALEN);
3588 memcpy(NameStr(*result), VARDATA_ANY(s), len);
3589
3590 PG_RETURN_NAME(result);
3591 }
3592
3593 /* name_text()
3594 * Converts a Name type to a text type.
3595 */
3596 Datum
name_text(PG_FUNCTION_ARGS)3597 name_text(PG_FUNCTION_ARGS)
3598 {
3599 Name s = PG_GETARG_NAME(0);
3600
3601 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3602 }
3603
3604
3605 /*
3606 * textToQualifiedNameList - convert a text object to list of names
3607 *
3608 * This implements the input parsing needed by nextval() and other
3609 * functions that take a text parameter representing a qualified name.
3610 * We split the name at dots, downcase if not double-quoted, and
3611 * truncate names if they're too long.
3612 */
3613 List *
textToQualifiedNameList(text * textval)3614 textToQualifiedNameList(text *textval)
3615 {
3616 char *rawname;
3617 List *result = NIL;
3618 List *namelist;
3619 ListCell *l;
3620
3621 /* Convert to C string (handles possible detoasting). */
3622 /* Note we rely on being able to modify rawname below. */
3623 rawname = text_to_cstring(textval);
3624
3625 if (!SplitIdentifierString(rawname, '.', &namelist))
3626 ereport(ERROR,
3627 (errcode(ERRCODE_INVALID_NAME),
3628 errmsg("invalid name syntax")));
3629
3630 if (namelist == NIL)
3631 ereport(ERROR,
3632 (errcode(ERRCODE_INVALID_NAME),
3633 errmsg("invalid name syntax")));
3634
3635 foreach(l, namelist)
3636 {
3637 char *curname = (char *) lfirst(l);
3638
3639 result = lappend(result, makeString(pstrdup(curname)));
3640 }
3641
3642 pfree(rawname);
3643 list_free(namelist);
3644
3645 return result;
3646 }
3647
3648 /*
3649 * SplitIdentifierString --- parse a string containing identifiers
3650 *
3651 * This is the guts of textToQualifiedNameList, and is exported for use in
3652 * other situations such as parsing GUC variables. In the GUC case, it's
3653 * important to avoid memory leaks, so the API is designed to minimize the
3654 * amount of stuff that needs to be allocated and freed.
3655 *
3656 * Inputs:
3657 * rawstring: the input string; must be overwritable! On return, it's
3658 * been modified to contain the separated identifiers.
3659 * separator: the separator punctuation expected between identifiers
3660 * (typically '.' or ','). Whitespace may also appear around
3661 * identifiers.
3662 * Outputs:
3663 * namelist: filled with a palloc'd list of pointers to identifiers within
3664 * rawstring. Caller should list_free() this even on error return.
3665 *
3666 * Returns true if okay, false if there is a syntax error in the string.
3667 *
3668 * Note that an empty string is considered okay here, though not in
3669 * textToQualifiedNameList.
3670 */
3671 bool
SplitIdentifierString(char * rawstring,char separator,List ** namelist)3672 SplitIdentifierString(char *rawstring, char separator,
3673 List **namelist)
3674 {
3675 char *nextp = rawstring;
3676 bool done = false;
3677
3678 *namelist = NIL;
3679
3680 while (scanner_isspace(*nextp))
3681 nextp++; /* skip leading whitespace */
3682
3683 if (*nextp == '\0')
3684 return true; /* allow empty string */
3685
3686 /* At the top of the loop, we are at start of a new identifier. */
3687 do
3688 {
3689 char *curname;
3690 char *endp;
3691
3692 if (*nextp == '"')
3693 {
3694 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3695 curname = nextp + 1;
3696 for (;;)
3697 {
3698 endp = strchr(nextp + 1, '"');
3699 if (endp == NULL)
3700 return false; /* mismatched quotes */
3701 if (endp[1] != '"')
3702 break; /* found end of quoted name */
3703 /* Collapse adjacent quotes into one quote, and look again */
3704 memmove(endp, endp + 1, strlen(endp));
3705 nextp = endp;
3706 }
3707 /* endp now points at the terminating quote */
3708 nextp = endp + 1;
3709 }
3710 else
3711 {
3712 /* Unquoted name --- extends to separator or whitespace */
3713 char *downname;
3714 int len;
3715
3716 curname = nextp;
3717 while (*nextp && *nextp != separator &&
3718 !scanner_isspace(*nextp))
3719 nextp++;
3720 endp = nextp;
3721 if (curname == nextp)
3722 return false; /* empty unquoted name not allowed */
3723
3724 /*
3725 * Downcase the identifier, using same code as main lexer does.
3726 *
3727 * XXX because we want to overwrite the input in-place, we cannot
3728 * support a downcasing transformation that increases the string
3729 * length. This is not a problem given the current implementation
3730 * of downcase_truncate_identifier, but we'll probably have to do
3731 * something about this someday.
3732 */
3733 len = endp - curname;
3734 downname = downcase_truncate_identifier(curname, len, false);
3735 Assert(strlen(downname) <= len);
3736 strncpy(curname, downname, len); /* strncpy is required here */
3737 pfree(downname);
3738 }
3739
3740 while (scanner_isspace(*nextp))
3741 nextp++; /* skip trailing whitespace */
3742
3743 if (*nextp == separator)
3744 {
3745 nextp++;
3746 while (scanner_isspace(*nextp))
3747 nextp++; /* skip leading whitespace for next */
3748 /* we expect another name, so done remains false */
3749 }
3750 else if (*nextp == '\0')
3751 done = true;
3752 else
3753 return false; /* invalid syntax */
3754
3755 /* Now safe to overwrite separator with a null */
3756 *endp = '\0';
3757
3758 /* Truncate name if it's overlength */
3759 truncate_identifier(curname, strlen(curname), false);
3760
3761 /*
3762 * Finished isolating current name --- add it to list
3763 */
3764 *namelist = lappend(*namelist, curname);
3765
3766 /* Loop back if we didn't reach end of string */
3767 } while (!done);
3768
3769 return true;
3770 }
3771
3772
3773 /*
3774 * SplitDirectoriesString --- parse a string containing file/directory names
3775 *
3776 * This works fine on file names too; the function name is historical.
3777 *
3778 * This is similar to SplitIdentifierString, except that the parsing
3779 * rules are meant to handle pathnames instead of identifiers: there is
3780 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3781 * and we apply canonicalize_path() to each extracted string. Because of the
3782 * last, the returned strings are separately palloc'd rather than being
3783 * pointers into rawstring --- but we still scribble on rawstring.
3784 *
3785 * Inputs:
3786 * rawstring: the input string; must be modifiable!
3787 * separator: the separator punctuation expected between directories
3788 * (typically ',' or ';'). Whitespace may also appear around
3789 * directories.
3790 * Outputs:
3791 * namelist: filled with a palloc'd list of directory names.
3792 * Caller should list_free_deep() this even on error return.
3793 *
3794 * Returns true if okay, false if there is a syntax error in the string.
3795 *
3796 * Note that an empty string is considered okay here.
3797 */
3798 bool
SplitDirectoriesString(char * rawstring,char separator,List ** namelist)3799 SplitDirectoriesString(char *rawstring, char separator,
3800 List **namelist)
3801 {
3802 char *nextp = rawstring;
3803 bool done = false;
3804
3805 *namelist = NIL;
3806
3807 while (scanner_isspace(*nextp))
3808 nextp++; /* skip leading whitespace */
3809
3810 if (*nextp == '\0')
3811 return true; /* allow empty string */
3812
3813 /* At the top of the loop, we are at start of a new directory. */
3814 do
3815 {
3816 char *curname;
3817 char *endp;
3818
3819 if (*nextp == '"')
3820 {
3821 /* Quoted name --- collapse quote-quote pairs */
3822 curname = nextp + 1;
3823 for (;;)
3824 {
3825 endp = strchr(nextp + 1, '"');
3826 if (endp == NULL)
3827 return false; /* mismatched quotes */
3828 if (endp[1] != '"')
3829 break; /* found end of quoted name */
3830 /* Collapse adjacent quotes into one quote, and look again */
3831 memmove(endp, endp + 1, strlen(endp));
3832 nextp = endp;
3833 }
3834 /* endp now points at the terminating quote */
3835 nextp = endp + 1;
3836 }
3837 else
3838 {
3839 /* Unquoted name --- extends to separator or end of string */
3840 curname = endp = nextp;
3841 while (*nextp && *nextp != separator)
3842 {
3843 /* trailing whitespace should not be included in name */
3844 if (!scanner_isspace(*nextp))
3845 endp = nextp + 1;
3846 nextp++;
3847 }
3848 if (curname == endp)
3849 return false; /* empty unquoted name not allowed */
3850 }
3851
3852 while (scanner_isspace(*nextp))
3853 nextp++; /* skip trailing whitespace */
3854
3855 if (*nextp == separator)
3856 {
3857 nextp++;
3858 while (scanner_isspace(*nextp))
3859 nextp++; /* skip leading whitespace for next */
3860 /* we expect another name, so done remains false */
3861 }
3862 else if (*nextp == '\0')
3863 done = true;
3864 else
3865 return false; /* invalid syntax */
3866
3867 /* Now safe to overwrite separator with a null */
3868 *endp = '\0';
3869
3870 /* Truncate path if it's overlength */
3871 if (strlen(curname) >= MAXPGPATH)
3872 curname[MAXPGPATH - 1] = '\0';
3873
3874 /*
3875 * Finished isolating current name --- add it to list
3876 */
3877 curname = pstrdup(curname);
3878 canonicalize_path(curname);
3879 *namelist = lappend(*namelist, curname);
3880
3881 /* Loop back if we didn't reach end of string */
3882 } while (!done);
3883
3884 return true;
3885 }
3886
3887
3888 /*
3889 * SplitGUCList --- parse a string containing identifiers or file names
3890 *
3891 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3892 * presuming whether the elements will be taken as identifiers or file names.
3893 * We assume the input has already been through flatten_set_variable_args(),
3894 * so that we need never downcase (if appropriate, that was done already).
3895 * Nor do we ever truncate, since we don't know the correct max length.
3896 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3897 * because any embedded whitespace should have led to double-quoting).
3898 * Otherwise the API is identical to SplitIdentifierString.
3899 *
3900 * XXX it's annoying to have so many copies of this string-splitting logic.
3901 * However, it's not clear that having one function with a bunch of option
3902 * flags would be much better.
3903 *
3904 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3905 * Be sure to update that if you have to change this.
3906 *
3907 * Inputs:
3908 * rawstring: the input string; must be overwritable! On return, it's
3909 * been modified to contain the separated identifiers.
3910 * separator: the separator punctuation expected between identifiers
3911 * (typically '.' or ','). Whitespace may also appear around
3912 * identifiers.
3913 * Outputs:
3914 * namelist: filled with a palloc'd list of pointers to identifiers within
3915 * rawstring. Caller should list_free() this even on error return.
3916 *
3917 * Returns true if okay, false if there is a syntax error in the string.
3918 */
3919 bool
SplitGUCList(char * rawstring,char separator,List ** namelist)3920 SplitGUCList(char *rawstring, char separator,
3921 List **namelist)
3922 {
3923 char *nextp = rawstring;
3924 bool done = false;
3925
3926 *namelist = NIL;
3927
3928 while (scanner_isspace(*nextp))
3929 nextp++; /* skip leading whitespace */
3930
3931 if (*nextp == '\0')
3932 return true; /* allow empty string */
3933
3934 /* At the top of the loop, we are at start of a new identifier. */
3935 do
3936 {
3937 char *curname;
3938 char *endp;
3939
3940 if (*nextp == '"')
3941 {
3942 /* Quoted name --- collapse quote-quote pairs */
3943 curname = nextp + 1;
3944 for (;;)
3945 {
3946 endp = strchr(nextp + 1, '"');
3947 if (endp == NULL)
3948 return false; /* mismatched quotes */
3949 if (endp[1] != '"')
3950 break; /* found end of quoted name */
3951 /* Collapse adjacent quotes into one quote, and look again */
3952 memmove(endp, endp + 1, strlen(endp));
3953 nextp = endp;
3954 }
3955 /* endp now points at the terminating quote */
3956 nextp = endp + 1;
3957 }
3958 else
3959 {
3960 /* Unquoted name --- extends to separator or whitespace */
3961 curname = nextp;
3962 while (*nextp && *nextp != separator &&
3963 !scanner_isspace(*nextp))
3964 nextp++;
3965 endp = nextp;
3966 if (curname == nextp)
3967 return false; /* empty unquoted name not allowed */
3968 }
3969
3970 while (scanner_isspace(*nextp))
3971 nextp++; /* skip trailing whitespace */
3972
3973 if (*nextp == separator)
3974 {
3975 nextp++;
3976 while (scanner_isspace(*nextp))
3977 nextp++; /* skip leading whitespace for next */
3978 /* we expect another name, so done remains false */
3979 }
3980 else if (*nextp == '\0')
3981 done = true;
3982 else
3983 return false; /* invalid syntax */
3984
3985 /* Now safe to overwrite separator with a null */
3986 *endp = '\0';
3987
3988 /*
3989 * Finished isolating current name --- add it to list
3990 */
3991 *namelist = lappend(*namelist, curname);
3992
3993 /* Loop back if we didn't reach end of string */
3994 } while (!done);
3995
3996 return true;
3997 }
3998
3999
4000 /*****************************************************************************
4001 * Comparison Functions used for bytea
4002 *
4003 * Note: btree indexes need these routines not to leak memory; therefore,
4004 * be careful to free working copies of toasted datums. Most places don't
4005 * need to be so careful.
4006 *****************************************************************************/
4007
4008 Datum
byteaeq(PG_FUNCTION_ARGS)4009 byteaeq(PG_FUNCTION_ARGS)
4010 {
4011 Datum arg1 = PG_GETARG_DATUM(0);
4012 Datum arg2 = PG_GETARG_DATUM(1);
4013 bool result;
4014 Size len1,
4015 len2;
4016
4017 /*
4018 * We can use a fast path for unequal lengths, which might save us from
4019 * having to detoast one or both values.
4020 */
4021 len1 = toast_raw_datum_size(arg1);
4022 len2 = toast_raw_datum_size(arg2);
4023 if (len1 != len2)
4024 result = false;
4025 else
4026 {
4027 bytea *barg1 = DatumGetByteaPP(arg1);
4028 bytea *barg2 = DatumGetByteaPP(arg2);
4029
4030 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4031 len1 - VARHDRSZ) == 0);
4032
4033 PG_FREE_IF_COPY(barg1, 0);
4034 PG_FREE_IF_COPY(barg2, 1);
4035 }
4036
4037 PG_RETURN_BOOL(result);
4038 }
4039
4040 Datum
byteane(PG_FUNCTION_ARGS)4041 byteane(PG_FUNCTION_ARGS)
4042 {
4043 Datum arg1 = PG_GETARG_DATUM(0);
4044 Datum arg2 = PG_GETARG_DATUM(1);
4045 bool result;
4046 Size len1,
4047 len2;
4048
4049 /*
4050 * We can use a fast path for unequal lengths, which might save us from
4051 * having to detoast one or both values.
4052 */
4053 len1 = toast_raw_datum_size(arg1);
4054 len2 = toast_raw_datum_size(arg2);
4055 if (len1 != len2)
4056 result = true;
4057 else
4058 {
4059 bytea *barg1 = DatumGetByteaPP(arg1);
4060 bytea *barg2 = DatumGetByteaPP(arg2);
4061
4062 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4063 len1 - VARHDRSZ) != 0);
4064
4065 PG_FREE_IF_COPY(barg1, 0);
4066 PG_FREE_IF_COPY(barg2, 1);
4067 }
4068
4069 PG_RETURN_BOOL(result);
4070 }
4071
4072 Datum
bytealt(PG_FUNCTION_ARGS)4073 bytealt(PG_FUNCTION_ARGS)
4074 {
4075 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4076 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4077 int len1,
4078 len2;
4079 int cmp;
4080
4081 len1 = VARSIZE_ANY_EXHDR(arg1);
4082 len2 = VARSIZE_ANY_EXHDR(arg2);
4083
4084 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4085
4086 PG_FREE_IF_COPY(arg1, 0);
4087 PG_FREE_IF_COPY(arg2, 1);
4088
4089 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4090 }
4091
4092 Datum
byteale(PG_FUNCTION_ARGS)4093 byteale(PG_FUNCTION_ARGS)
4094 {
4095 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4096 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4097 int len1,
4098 len2;
4099 int cmp;
4100
4101 len1 = VARSIZE_ANY_EXHDR(arg1);
4102 len2 = VARSIZE_ANY_EXHDR(arg2);
4103
4104 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4105
4106 PG_FREE_IF_COPY(arg1, 0);
4107 PG_FREE_IF_COPY(arg2, 1);
4108
4109 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4110 }
4111
4112 Datum
byteagt(PG_FUNCTION_ARGS)4113 byteagt(PG_FUNCTION_ARGS)
4114 {
4115 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4116 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4117 int len1,
4118 len2;
4119 int cmp;
4120
4121 len1 = VARSIZE_ANY_EXHDR(arg1);
4122 len2 = VARSIZE_ANY_EXHDR(arg2);
4123
4124 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4125
4126 PG_FREE_IF_COPY(arg1, 0);
4127 PG_FREE_IF_COPY(arg2, 1);
4128
4129 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4130 }
4131
4132 Datum
byteage(PG_FUNCTION_ARGS)4133 byteage(PG_FUNCTION_ARGS)
4134 {
4135 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4136 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4137 int len1,
4138 len2;
4139 int cmp;
4140
4141 len1 = VARSIZE_ANY_EXHDR(arg1);
4142 len2 = VARSIZE_ANY_EXHDR(arg2);
4143
4144 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4145
4146 PG_FREE_IF_COPY(arg1, 0);
4147 PG_FREE_IF_COPY(arg2, 1);
4148
4149 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4150 }
4151
4152 Datum
byteacmp(PG_FUNCTION_ARGS)4153 byteacmp(PG_FUNCTION_ARGS)
4154 {
4155 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4156 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4157 int len1,
4158 len2;
4159 int cmp;
4160
4161 len1 = VARSIZE_ANY_EXHDR(arg1);
4162 len2 = VARSIZE_ANY_EXHDR(arg2);
4163
4164 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4165 if ((cmp == 0) && (len1 != len2))
4166 cmp = (len1 < len2) ? -1 : 1;
4167
4168 PG_FREE_IF_COPY(arg1, 0);
4169 PG_FREE_IF_COPY(arg2, 1);
4170
4171 PG_RETURN_INT32(cmp);
4172 }
4173
4174 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)4175 bytea_sortsupport(PG_FUNCTION_ARGS)
4176 {
4177 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4178 MemoryContext oldcontext;
4179
4180 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4181
4182 /* Use generic string SortSupport, forcing "C" collation */
4183 varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4184
4185 MemoryContextSwitchTo(oldcontext);
4186
4187 PG_RETURN_VOID();
4188 }
4189
4190 /*
4191 * appendStringInfoText
4192 *
4193 * Append a text to str.
4194 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4195 */
4196 static void
appendStringInfoText(StringInfo str,const text * t)4197 appendStringInfoText(StringInfo str, const text *t)
4198 {
4199 appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4200 }
4201
4202 /*
4203 * replace_text
4204 * replace all occurrences of 'old_sub_str' in 'orig_str'
4205 * with 'new_sub_str' to form 'new_str'
4206 *
4207 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4208 * otherwise returns 'new_str'
4209 */
4210 Datum
replace_text(PG_FUNCTION_ARGS)4211 replace_text(PG_FUNCTION_ARGS)
4212 {
4213 text *src_text = PG_GETARG_TEXT_PP(0);
4214 text *from_sub_text = PG_GETARG_TEXT_PP(1);
4215 text *to_sub_text = PG_GETARG_TEXT_PP(2);
4216 int src_text_len;
4217 int from_sub_text_len;
4218 TextPositionState state;
4219 text *ret_text;
4220 int chunk_len;
4221 char *curr_ptr;
4222 char *start_ptr;
4223 StringInfoData str;
4224 bool found;
4225
4226 src_text_len = VARSIZE_ANY_EXHDR(src_text);
4227 from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4228
4229 /* Return unmodified source string if empty source or pattern */
4230 if (src_text_len < 1 || from_sub_text_len < 1)
4231 {
4232 PG_RETURN_TEXT_P(src_text);
4233 }
4234
4235 text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4236
4237 found = text_position_next(&state);
4238
4239 /* When the from_sub_text is not found, there is nothing to do. */
4240 if (!found)
4241 {
4242 text_position_cleanup(&state);
4243 PG_RETURN_TEXT_P(src_text);
4244 }
4245 curr_ptr = text_position_get_match_ptr(&state);
4246 start_ptr = VARDATA_ANY(src_text);
4247
4248 initStringInfo(&str);
4249
4250 do
4251 {
4252 CHECK_FOR_INTERRUPTS();
4253
4254 /* copy the data skipped over by last text_position_next() */
4255 chunk_len = curr_ptr - start_ptr;
4256 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4257
4258 appendStringInfoText(&str, to_sub_text);
4259
4260 start_ptr = curr_ptr + from_sub_text_len;
4261
4262 found = text_position_next(&state);
4263 if (found)
4264 curr_ptr = text_position_get_match_ptr(&state);
4265 }
4266 while (found);
4267
4268 /* copy trailing data */
4269 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4270 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4271
4272 text_position_cleanup(&state);
4273
4274 ret_text = cstring_to_text_with_len(str.data, str.len);
4275 pfree(str.data);
4276
4277 PG_RETURN_TEXT_P(ret_text);
4278 }
4279
4280 /*
4281 * check_replace_text_has_escape_char
4282 *
4283 * check whether replace_text contains escape char.
4284 */
4285 static bool
check_replace_text_has_escape_char(const text * replace_text)4286 check_replace_text_has_escape_char(const text *replace_text)
4287 {
4288 const char *p = VARDATA_ANY(replace_text);
4289 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4290
4291 if (pg_database_encoding_max_length() == 1)
4292 {
4293 for (; p < p_end; p++)
4294 {
4295 if (*p == '\\')
4296 return true;
4297 }
4298 }
4299 else
4300 {
4301 for (; p < p_end; p += pg_mblen(p))
4302 {
4303 if (*p == '\\')
4304 return true;
4305 }
4306 }
4307
4308 return false;
4309 }
4310
4311 /*
4312 * appendStringInfoRegexpSubstr
4313 *
4314 * Append replace_text to str, substituting regexp back references for
4315 * \n escapes. start_ptr is the start of the match in the source string,
4316 * at logical character position data_pos.
4317 */
4318 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)4319 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4320 regmatch_t *pmatch,
4321 char *start_ptr, int data_pos)
4322 {
4323 const char *p = VARDATA_ANY(replace_text);
4324 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4325 int eml = pg_database_encoding_max_length();
4326
4327 for (;;)
4328 {
4329 const char *chunk_start = p;
4330 int so;
4331 int eo;
4332
4333 /* Find next escape char. */
4334 if (eml == 1)
4335 {
4336 for (; p < p_end && *p != '\\'; p++)
4337 /* nothing */ ;
4338 }
4339 else
4340 {
4341 for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4342 /* nothing */ ;
4343 }
4344
4345 /* Copy the text we just scanned over, if any. */
4346 if (p > chunk_start)
4347 appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4348
4349 /* Done if at end of string, else advance over escape char. */
4350 if (p >= p_end)
4351 break;
4352 p++;
4353
4354 if (p >= p_end)
4355 {
4356 /* Escape at very end of input. Treat same as unexpected char */
4357 appendStringInfoChar(str, '\\');
4358 break;
4359 }
4360
4361 if (*p >= '1' && *p <= '9')
4362 {
4363 /* Use the back reference of regexp. */
4364 int idx = *p - '0';
4365
4366 so = pmatch[idx].rm_so;
4367 eo = pmatch[idx].rm_eo;
4368 p++;
4369 }
4370 else if (*p == '&')
4371 {
4372 /* Use the entire matched string. */
4373 so = pmatch[0].rm_so;
4374 eo = pmatch[0].rm_eo;
4375 p++;
4376 }
4377 else if (*p == '\\')
4378 {
4379 /* \\ means transfer one \ to output. */
4380 appendStringInfoChar(str, '\\');
4381 p++;
4382 continue;
4383 }
4384 else
4385 {
4386 /*
4387 * If escape char is not followed by any expected char, just treat
4388 * it as ordinary data to copy. (XXX would it be better to throw
4389 * an error?)
4390 */
4391 appendStringInfoChar(str, '\\');
4392 continue;
4393 }
4394
4395 if (so != -1 && eo != -1)
4396 {
4397 /*
4398 * Copy the text that is back reference of regexp. Note so and eo
4399 * are counted in characters not bytes.
4400 */
4401 char *chunk_start;
4402 int chunk_len;
4403
4404 Assert(so >= data_pos);
4405 chunk_start = start_ptr;
4406 chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4407 chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4408 appendBinaryStringInfo(str, chunk_start, chunk_len);
4409 }
4410 }
4411 }
4412
4413 #define REGEXP_REPLACE_BACKREF_CNT 10
4414
4415 /*
4416 * replace_text_regexp
4417 *
4418 * replace text that matches to regexp in src_text to replace_text.
4419 *
4420 * Note: to avoid having to include regex.h in builtins.h, we declare
4421 * the regexp argument as void *, but really it's regex_t *.
4422 */
4423 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)4424 replace_text_regexp(text *src_text, void *regexp,
4425 text *replace_text, bool glob)
4426 {
4427 text *ret_text;
4428 regex_t *re = (regex_t *) regexp;
4429 int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4430 StringInfoData buf;
4431 regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
4432 pg_wchar *data;
4433 size_t data_len;
4434 int search_start;
4435 int data_pos;
4436 char *start_ptr;
4437 bool have_escape;
4438
4439 initStringInfo(&buf);
4440
4441 /* Convert data string to wide characters. */
4442 data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4443 data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4444
4445 /* Check whether replace_text has escape char. */
4446 have_escape = check_replace_text_has_escape_char(replace_text);
4447
4448 /* start_ptr points to the data_pos'th character of src_text */
4449 start_ptr = (char *) VARDATA_ANY(src_text);
4450 data_pos = 0;
4451
4452 search_start = 0;
4453 while (search_start <= data_len)
4454 {
4455 int regexec_result;
4456
4457 CHECK_FOR_INTERRUPTS();
4458
4459 regexec_result = pg_regexec(re,
4460 data,
4461 data_len,
4462 search_start,
4463 NULL, /* no details */
4464 REGEXP_REPLACE_BACKREF_CNT,
4465 pmatch,
4466 0);
4467
4468 if (regexec_result == REG_NOMATCH)
4469 break;
4470
4471 if (regexec_result != REG_OKAY)
4472 {
4473 char errMsg[100];
4474
4475 CHECK_FOR_INTERRUPTS();
4476 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4477 ereport(ERROR,
4478 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4479 errmsg("regular expression failed: %s", errMsg)));
4480 }
4481
4482 /*
4483 * Copy the text to the left of the match position. Note we are given
4484 * character not byte indexes.
4485 */
4486 if (pmatch[0].rm_so - data_pos > 0)
4487 {
4488 int chunk_len;
4489
4490 chunk_len = charlen_to_bytelen(start_ptr,
4491 pmatch[0].rm_so - data_pos);
4492 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4493
4494 /*
4495 * Advance start_ptr over that text, to avoid multiple rescans of
4496 * it if the replace_text contains multiple back-references.
4497 */
4498 start_ptr += chunk_len;
4499 data_pos = pmatch[0].rm_so;
4500 }
4501
4502 /*
4503 * Copy the replace_text. Process back references when the
4504 * replace_text has escape characters.
4505 */
4506 if (have_escape)
4507 appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4508 start_ptr, data_pos);
4509 else
4510 appendStringInfoText(&buf, replace_text);
4511
4512 /* Advance start_ptr and data_pos over the matched text. */
4513 start_ptr += charlen_to_bytelen(start_ptr,
4514 pmatch[0].rm_eo - data_pos);
4515 data_pos = pmatch[0].rm_eo;
4516
4517 /*
4518 * When global option is off, replace the first instance only.
4519 */
4520 if (!glob)
4521 break;
4522
4523 /*
4524 * Advance search position. Normally we start the next search at the
4525 * end of the previous match; but if the match was of zero length, we
4526 * have to advance by one character, or we'd just find the same match
4527 * again.
4528 */
4529 search_start = data_pos;
4530 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4531 search_start++;
4532 }
4533
4534 /*
4535 * Copy the text to the right of the last match.
4536 */
4537 if (data_pos < data_len)
4538 {
4539 int chunk_len;
4540
4541 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4542 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4543 }
4544
4545 ret_text = cstring_to_text_with_len(buf.data, buf.len);
4546 pfree(buf.data);
4547 pfree(data);
4548
4549 return ret_text;
4550 }
4551
4552 /*
4553 * split_text
4554 * parse input string
4555 * return ord item (1 based)
4556 * based on provided field separator
4557 */
4558 Datum
split_text(PG_FUNCTION_ARGS)4559 split_text(PG_FUNCTION_ARGS)
4560 {
4561 text *inputstring = PG_GETARG_TEXT_PP(0);
4562 text *fldsep = PG_GETARG_TEXT_PP(1);
4563 int fldnum = PG_GETARG_INT32(2);
4564 int inputstring_len;
4565 int fldsep_len;
4566 TextPositionState state;
4567 char *start_ptr;
4568 char *end_ptr;
4569 text *result_text;
4570 bool found;
4571
4572 /* field number is 1 based */
4573 if (fldnum < 1)
4574 ereport(ERROR,
4575 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4576 errmsg("field position must be greater than zero")));
4577
4578 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4579 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4580
4581 /* return empty string for empty input string */
4582 if (inputstring_len < 1)
4583 PG_RETURN_TEXT_P(cstring_to_text(""));
4584
4585 /* empty field separator */
4586 if (fldsep_len < 1)
4587 {
4588 text_position_cleanup(&state);
4589 /* if first field, return input string, else empty string */
4590 if (fldnum == 1)
4591 PG_RETURN_TEXT_P(inputstring);
4592 else
4593 PG_RETURN_TEXT_P(cstring_to_text(""));
4594 }
4595
4596 text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4597
4598 /* identify bounds of first field */
4599 start_ptr = VARDATA_ANY(inputstring);
4600 found = text_position_next(&state);
4601
4602 /* special case if fldsep not found at all */
4603 if (!found)
4604 {
4605 text_position_cleanup(&state);
4606 /* if field 1 requested, return input string, else empty string */
4607 if (fldnum == 1)
4608 PG_RETURN_TEXT_P(inputstring);
4609 else
4610 PG_RETURN_TEXT_P(cstring_to_text(""));
4611 }
4612 end_ptr = text_position_get_match_ptr(&state);
4613
4614 while (found && --fldnum > 0)
4615 {
4616 /* identify bounds of next field */
4617 start_ptr = end_ptr + fldsep_len;
4618 found = text_position_next(&state);
4619 if (found)
4620 end_ptr = text_position_get_match_ptr(&state);
4621 }
4622
4623 text_position_cleanup(&state);
4624
4625 if (fldnum > 0)
4626 {
4627 /* N'th field separator not found */
4628 /* if last field requested, return it, else empty string */
4629 if (fldnum == 1)
4630 {
4631 int last_len = start_ptr - VARDATA_ANY(inputstring);
4632
4633 result_text = cstring_to_text_with_len(start_ptr,
4634 inputstring_len - last_len);
4635 }
4636 else
4637 result_text = cstring_to_text("");
4638 }
4639 else
4640 {
4641 /* non-last field requested */
4642 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4643 }
4644
4645 PG_RETURN_TEXT_P(result_text);
4646 }
4647
4648 /*
4649 * Convenience function to return true when two text params are equal.
4650 */
4651 static bool
text_isequal(text * txt1,text * txt2,Oid collid)4652 text_isequal(text *txt1, text *txt2, Oid collid)
4653 {
4654 return DatumGetBool(DirectFunctionCall2Coll(texteq,
4655 collid,
4656 PointerGetDatum(txt1),
4657 PointerGetDatum(txt2)));
4658 }
4659
4660 /*
4661 * text_to_array
4662 * parse input string and return text array of elements,
4663 * based on provided field separator
4664 */
4665 Datum
text_to_array(PG_FUNCTION_ARGS)4666 text_to_array(PG_FUNCTION_ARGS)
4667 {
4668 return text_to_array_internal(fcinfo);
4669 }
4670
4671 /*
4672 * text_to_array_null
4673 * parse input string and return text array of elements,
4674 * based on provided field separator and null string
4675 *
4676 * This is a separate entry point only to prevent the regression tests from
4677 * complaining about different argument sets for the same internal function.
4678 */
4679 Datum
text_to_array_null(PG_FUNCTION_ARGS)4680 text_to_array_null(PG_FUNCTION_ARGS)
4681 {
4682 return text_to_array_internal(fcinfo);
4683 }
4684
4685 /*
4686 * common code for text_to_array and text_to_array_null functions
4687 *
4688 * These are not strict so we have to test for null inputs explicitly.
4689 */
4690 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4691 text_to_array_internal(PG_FUNCTION_ARGS)
4692 {
4693 text *inputstring;
4694 text *fldsep;
4695 text *null_string;
4696 int inputstring_len;
4697 int fldsep_len;
4698 char *start_ptr;
4699 text *result_text;
4700 bool is_null;
4701 ArrayBuildState *astate = NULL;
4702
4703 /* when input string is NULL, then result is NULL too */
4704 if (PG_ARGISNULL(0))
4705 PG_RETURN_NULL();
4706
4707 inputstring = PG_GETARG_TEXT_PP(0);
4708
4709 /* fldsep can be NULL */
4710 if (!PG_ARGISNULL(1))
4711 fldsep = PG_GETARG_TEXT_PP(1);
4712 else
4713 fldsep = NULL;
4714
4715 /* null_string can be NULL or omitted */
4716 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4717 null_string = PG_GETARG_TEXT_PP(2);
4718 else
4719 null_string = NULL;
4720
4721 if (fldsep != NULL)
4722 {
4723 /*
4724 * Normal case with non-null fldsep. Use the text_position machinery
4725 * to search for occurrences of fldsep.
4726 */
4727 TextPositionState state;
4728
4729 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4730 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4731
4732 /* return empty array for empty input string */
4733 if (inputstring_len < 1)
4734 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4735
4736 /*
4737 * empty field separator: return the input string as a one-element
4738 * array
4739 */
4740 if (fldsep_len < 1)
4741 {
4742 Datum elems[1];
4743 bool nulls[1];
4744 int dims[1];
4745 int lbs[1];
4746
4747 /* single element can be a NULL too */
4748 is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
4749
4750 elems[0] = PointerGetDatum(inputstring);
4751 nulls[0] = is_null;
4752 dims[0] = 1;
4753 lbs[0] = 1;
4754 /* XXX: this hardcodes assumptions about the text type */
4755 PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
4756 1, dims, lbs,
4757 TEXTOID, -1, false, 'i'));
4758 }
4759
4760 text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4761
4762 start_ptr = VARDATA_ANY(inputstring);
4763
4764 for (;;)
4765 {
4766 bool found;
4767 char *end_ptr;
4768 int chunk_len;
4769
4770 CHECK_FOR_INTERRUPTS();
4771
4772 found = text_position_next(&state);
4773 if (!found)
4774 {
4775 /* fetch last field */
4776 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4777 end_ptr = NULL; /* not used, but some compilers complain */
4778 }
4779 else
4780 {
4781 /* fetch non-last field */
4782 end_ptr = text_position_get_match_ptr(&state);
4783 chunk_len = end_ptr - start_ptr;
4784 }
4785
4786 /* must build a temp text datum to pass to accumArrayResult */
4787 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4788 is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4789
4790 /* stash away this field */
4791 astate = accumArrayResult(astate,
4792 PointerGetDatum(result_text),
4793 is_null,
4794 TEXTOID,
4795 CurrentMemoryContext);
4796
4797 pfree(result_text);
4798
4799 if (!found)
4800 break;
4801
4802 start_ptr = end_ptr + fldsep_len;
4803 }
4804
4805 text_position_cleanup(&state);
4806 }
4807 else
4808 {
4809 /*
4810 * When fldsep is NULL, each character in the inputstring becomes an
4811 * element in the result array. The separator is effectively the
4812 * space between characters.
4813 */
4814 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4815
4816 /* return empty array for empty input string */
4817 if (inputstring_len < 1)
4818 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4819
4820 start_ptr = VARDATA_ANY(inputstring);
4821
4822 while (inputstring_len > 0)
4823 {
4824 int chunk_len = pg_mblen(start_ptr);
4825
4826 CHECK_FOR_INTERRUPTS();
4827
4828 /* must build a temp text datum to pass to accumArrayResult */
4829 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4830 is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4831
4832 /* stash away this field */
4833 astate = accumArrayResult(astate,
4834 PointerGetDatum(result_text),
4835 is_null,
4836 TEXTOID,
4837 CurrentMemoryContext);
4838
4839 pfree(result_text);
4840
4841 start_ptr += chunk_len;
4842 inputstring_len -= chunk_len;
4843 }
4844 }
4845
4846 PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4847 CurrentMemoryContext));
4848 }
4849
4850 /*
4851 * array_to_text
4852 * concatenate Cstring representation of input array elements
4853 * using provided field separator
4854 */
4855 Datum
array_to_text(PG_FUNCTION_ARGS)4856 array_to_text(PG_FUNCTION_ARGS)
4857 {
4858 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4859 char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4860
4861 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4862 }
4863
4864 /*
4865 * array_to_text_null
4866 * concatenate Cstring representation of input array elements
4867 * using provided field separator and null string
4868 *
4869 * This version is not strict so we have to test for null inputs explicitly.
4870 */
4871 Datum
array_to_text_null(PG_FUNCTION_ARGS)4872 array_to_text_null(PG_FUNCTION_ARGS)
4873 {
4874 ArrayType *v;
4875 char *fldsep;
4876 char *null_string;
4877
4878 /* returns NULL when first or second parameter is NULL */
4879 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4880 PG_RETURN_NULL();
4881
4882 v = PG_GETARG_ARRAYTYPE_P(0);
4883 fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4884
4885 /* NULL null string is passed through as a null pointer */
4886 if (!PG_ARGISNULL(2))
4887 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4888 else
4889 null_string = NULL;
4890
4891 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4892 }
4893
4894 /*
4895 * common code for array_to_text and array_to_text_null functions
4896 */
4897 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4898 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4899 const char *fldsep, const char *null_string)
4900 {
4901 text *result;
4902 int nitems,
4903 *dims,
4904 ndims;
4905 Oid element_type;
4906 int typlen;
4907 bool typbyval;
4908 char typalign;
4909 StringInfoData buf;
4910 bool printed = false;
4911 char *p;
4912 bits8 *bitmap;
4913 int bitmask;
4914 int i;
4915 ArrayMetaState *my_extra;
4916
4917 ndims = ARR_NDIM(v);
4918 dims = ARR_DIMS(v);
4919 nitems = ArrayGetNItems(ndims, dims);
4920
4921 /* if there are no elements, return an empty string */
4922 if (nitems == 0)
4923 return cstring_to_text_with_len("", 0);
4924
4925 element_type = ARR_ELEMTYPE(v);
4926 initStringInfo(&buf);
4927
4928 /*
4929 * We arrange to look up info about element type, including its output
4930 * conversion proc, only once per series of calls, assuming the element
4931 * type doesn't change underneath us.
4932 */
4933 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4934 if (my_extra == NULL)
4935 {
4936 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4937 sizeof(ArrayMetaState));
4938 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4939 my_extra->element_type = ~element_type;
4940 }
4941
4942 if (my_extra->element_type != element_type)
4943 {
4944 /*
4945 * Get info about element type, including its output conversion proc
4946 */
4947 get_type_io_data(element_type, IOFunc_output,
4948 &my_extra->typlen, &my_extra->typbyval,
4949 &my_extra->typalign, &my_extra->typdelim,
4950 &my_extra->typioparam, &my_extra->typiofunc);
4951 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4952 fcinfo->flinfo->fn_mcxt);
4953 my_extra->element_type = element_type;
4954 }
4955 typlen = my_extra->typlen;
4956 typbyval = my_extra->typbyval;
4957 typalign = my_extra->typalign;
4958
4959 p = ARR_DATA_PTR(v);
4960 bitmap = ARR_NULLBITMAP(v);
4961 bitmask = 1;
4962
4963 for (i = 0; i < nitems; i++)
4964 {
4965 Datum itemvalue;
4966 char *value;
4967
4968 /* Get source element, checking for NULL */
4969 if (bitmap && (*bitmap & bitmask) == 0)
4970 {
4971 /* if null_string is NULL, we just ignore null elements */
4972 if (null_string != NULL)
4973 {
4974 if (printed)
4975 appendStringInfo(&buf, "%s%s", fldsep, null_string);
4976 else
4977 appendStringInfoString(&buf, null_string);
4978 printed = true;
4979 }
4980 }
4981 else
4982 {
4983 itemvalue = fetch_att(p, typbyval, typlen);
4984
4985 value = OutputFunctionCall(&my_extra->proc, itemvalue);
4986
4987 if (printed)
4988 appendStringInfo(&buf, "%s%s", fldsep, value);
4989 else
4990 appendStringInfoString(&buf, value);
4991 printed = true;
4992
4993 p = att_addlength_pointer(p, typlen, p);
4994 p = (char *) att_align_nominal(p, typalign);
4995 }
4996
4997 /* advance bitmap pointer if any */
4998 if (bitmap)
4999 {
5000 bitmask <<= 1;
5001 if (bitmask == 0x100)
5002 {
5003 bitmap++;
5004 bitmask = 1;
5005 }
5006 }
5007 }
5008
5009 result = cstring_to_text_with_len(buf.data, buf.len);
5010 pfree(buf.data);
5011
5012 return result;
5013 }
5014
5015 #define HEXBASE 16
5016 /*
5017 * Convert an int32 to a string containing a base 16 (hex) representation of
5018 * the number.
5019 */
5020 Datum
to_hex32(PG_FUNCTION_ARGS)5021 to_hex32(PG_FUNCTION_ARGS)
5022 {
5023 uint32 value = (uint32) PG_GETARG_INT32(0);
5024 char *ptr;
5025 const char *digits = "0123456789abcdef";
5026 char buf[32]; /* bigger than needed, but reasonable */
5027
5028 ptr = buf + sizeof(buf) - 1;
5029 *ptr = '\0';
5030
5031 do
5032 {
5033 *--ptr = digits[value % HEXBASE];
5034 value /= HEXBASE;
5035 } while (ptr > buf && value);
5036
5037 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5038 }
5039
5040 /*
5041 * Convert an int64 to a string containing a base 16 (hex) representation of
5042 * the number.
5043 */
5044 Datum
to_hex64(PG_FUNCTION_ARGS)5045 to_hex64(PG_FUNCTION_ARGS)
5046 {
5047 uint64 value = (uint64) PG_GETARG_INT64(0);
5048 char *ptr;
5049 const char *digits = "0123456789abcdef";
5050 char buf[32]; /* bigger than needed, but reasonable */
5051
5052 ptr = buf + sizeof(buf) - 1;
5053 *ptr = '\0';
5054
5055 do
5056 {
5057 *--ptr = digits[value % HEXBASE];
5058 value /= HEXBASE;
5059 } while (ptr > buf && value);
5060
5061 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5062 }
5063
5064 /*
5065 * Return the size of a datum, possibly compressed
5066 *
5067 * Works on any data type
5068 */
5069 Datum
pg_column_size(PG_FUNCTION_ARGS)5070 pg_column_size(PG_FUNCTION_ARGS)
5071 {
5072 Datum value = PG_GETARG_DATUM(0);
5073 int32 result;
5074 int typlen;
5075
5076 /* On first call, get the input type's typlen, and save at *fn_extra */
5077 if (fcinfo->flinfo->fn_extra == NULL)
5078 {
5079 /* Lookup the datatype of the supplied argument */
5080 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5081
5082 typlen = get_typlen(argtypeid);
5083 if (typlen == 0) /* should not happen */
5084 elog(ERROR, "cache lookup failed for type %u", argtypeid);
5085
5086 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5087 sizeof(int));
5088 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5089 }
5090 else
5091 typlen = *((int *) fcinfo->flinfo->fn_extra);
5092
5093 if (typlen == -1)
5094 {
5095 /* varlena type, possibly toasted */
5096 result = toast_datum_size(value);
5097 }
5098 else if (typlen == -2)
5099 {
5100 /* cstring */
5101 result = strlen(DatumGetCString(value)) + 1;
5102 }
5103 else
5104 {
5105 /* ordinary fixed-width type */
5106 result = typlen;
5107 }
5108
5109 PG_RETURN_INT32(result);
5110 }
5111
5112 /*
5113 * string_agg - Concatenates values and returns string.
5114 *
5115 * Syntax: string_agg(value text, delimiter text) RETURNS text
5116 *
5117 * Note: Any NULL values are ignored. The first-call delimiter isn't
5118 * actually used at all, and on subsequent calls the delimiter precedes
5119 * the associated value.
5120 */
5121
5122 /* subroutine to initialize state */
5123 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)5124 makeStringAggState(FunctionCallInfo fcinfo)
5125 {
5126 StringInfo state;
5127 MemoryContext aggcontext;
5128 MemoryContext oldcontext;
5129
5130 if (!AggCheckCallContext(fcinfo, &aggcontext))
5131 {
5132 /* cannot be called directly because of internal-type argument */
5133 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5134 }
5135
5136 /*
5137 * Create state in aggregate context. It'll stay there across subsequent
5138 * calls.
5139 */
5140 oldcontext = MemoryContextSwitchTo(aggcontext);
5141 state = makeStringInfo();
5142 MemoryContextSwitchTo(oldcontext);
5143
5144 return state;
5145 }
5146
5147 Datum
string_agg_transfn(PG_FUNCTION_ARGS)5148 string_agg_transfn(PG_FUNCTION_ARGS)
5149 {
5150 StringInfo state;
5151
5152 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5153
5154 /* Append the value unless null. */
5155 if (!PG_ARGISNULL(1))
5156 {
5157 /* On the first time through, we ignore the delimiter. */
5158 if (state == NULL)
5159 state = makeStringAggState(fcinfo);
5160 else if (!PG_ARGISNULL(2))
5161 appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5162
5163 appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5164 }
5165
5166 /*
5167 * The transition type for string_agg() is declared to be "internal",
5168 * which is a pass-by-value type the same size as a pointer.
5169 */
5170 PG_RETURN_POINTER(state);
5171 }
5172
5173 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)5174 string_agg_finalfn(PG_FUNCTION_ARGS)
5175 {
5176 StringInfo state;
5177
5178 /* cannot be called directly because of internal-type argument */
5179 Assert(AggCheckCallContext(fcinfo, NULL));
5180
5181 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5182
5183 if (state != NULL)
5184 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5185 else
5186 PG_RETURN_NULL();
5187 }
5188
5189 /*
5190 * Prepare cache with fmgr info for the output functions of the datatypes of
5191 * the arguments of a concat-like function, beginning with argument "argidx".
5192 * (Arguments before that will have corresponding slots in the resulting
5193 * FmgrInfo array, but we don't fill those slots.)
5194 */
5195 static FmgrInfo *
build_concat_foutcache(FunctionCallInfo fcinfo,int argidx)5196 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5197 {
5198 FmgrInfo *foutcache;
5199 int i;
5200
5201 /* We keep the info in fn_mcxt so it survives across calls */
5202 foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5203 PG_NARGS() * sizeof(FmgrInfo));
5204
5205 for (i = argidx; i < PG_NARGS(); i++)
5206 {
5207 Oid valtype;
5208 Oid typOutput;
5209 bool typIsVarlena;
5210
5211 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5212 if (!OidIsValid(valtype))
5213 elog(ERROR, "could not determine data type of concat() input");
5214
5215 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5216 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5217 }
5218
5219 fcinfo->flinfo->fn_extra = foutcache;
5220
5221 return foutcache;
5222 }
5223
5224 /*
5225 * Implementation of both concat() and concat_ws().
5226 *
5227 * sepstr is the separator string to place between values.
5228 * argidx identifies the first argument to concatenate (counting from zero);
5229 * note that this must be constant across any one series of calls.
5230 *
5231 * Returns NULL if result should be NULL, else text value.
5232 */
5233 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)5234 concat_internal(const char *sepstr, int argidx,
5235 FunctionCallInfo fcinfo)
5236 {
5237 text *result;
5238 StringInfoData str;
5239 FmgrInfo *foutcache;
5240 bool first_arg = true;
5241 int i;
5242
5243 /*
5244 * concat(VARIADIC some-array) is essentially equivalent to
5245 * array_to_text(), ie concat the array elements with the given separator.
5246 * So we just pass the case off to that code.
5247 */
5248 if (get_fn_expr_variadic(fcinfo->flinfo))
5249 {
5250 ArrayType *arr;
5251
5252 /* Should have just the one argument */
5253 Assert(argidx == PG_NARGS() - 1);
5254
5255 /* concat(VARIADIC NULL) is defined as NULL */
5256 if (PG_ARGISNULL(argidx))
5257 return NULL;
5258
5259 /*
5260 * Non-null argument had better be an array. We assume that any call
5261 * context that could let get_fn_expr_variadic return true will have
5262 * checked that a VARIADIC-labeled parameter actually is an array. So
5263 * it should be okay to just Assert that it's an array rather than
5264 * doing a full-fledged error check.
5265 */
5266 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5267
5268 /* OK, safe to fetch the array value */
5269 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5270
5271 /*
5272 * And serialize the array. We tell array_to_text to ignore null
5273 * elements, which matches the behavior of the loop below.
5274 */
5275 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5276 }
5277
5278 /* Normal case without explicit VARIADIC marker */
5279 initStringInfo(&str);
5280
5281 /* Get output function info, building it if first time through */
5282 foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5283 if (foutcache == NULL)
5284 foutcache = build_concat_foutcache(fcinfo, argidx);
5285
5286 for (i = argidx; i < PG_NARGS(); i++)
5287 {
5288 if (!PG_ARGISNULL(i))
5289 {
5290 Datum value = PG_GETARG_DATUM(i);
5291
5292 /* add separator if appropriate */
5293 if (first_arg)
5294 first_arg = false;
5295 else
5296 appendStringInfoString(&str, sepstr);
5297
5298 /* call the appropriate type output function, append the result */
5299 appendStringInfoString(&str,
5300 OutputFunctionCall(&foutcache[i], value));
5301 }
5302 }
5303
5304 result = cstring_to_text_with_len(str.data, str.len);
5305 pfree(str.data);
5306
5307 return result;
5308 }
5309
5310 /*
5311 * Concatenate all arguments. NULL arguments are ignored.
5312 */
5313 Datum
text_concat(PG_FUNCTION_ARGS)5314 text_concat(PG_FUNCTION_ARGS)
5315 {
5316 text *result;
5317
5318 result = concat_internal("", 0, fcinfo);
5319 if (result == NULL)
5320 PG_RETURN_NULL();
5321 PG_RETURN_TEXT_P(result);
5322 }
5323
5324 /*
5325 * Concatenate all but first argument value with separators. The first
5326 * parameter is used as the separator. NULL arguments are ignored.
5327 */
5328 Datum
text_concat_ws(PG_FUNCTION_ARGS)5329 text_concat_ws(PG_FUNCTION_ARGS)
5330 {
5331 char *sep;
5332 text *result;
5333
5334 /* return NULL when separator is NULL */
5335 if (PG_ARGISNULL(0))
5336 PG_RETURN_NULL();
5337 sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5338
5339 result = concat_internal(sep, 1, fcinfo);
5340 if (result == NULL)
5341 PG_RETURN_NULL();
5342 PG_RETURN_TEXT_P(result);
5343 }
5344
5345 /*
5346 * Return first n characters in the string. When n is negative,
5347 * return all but last |n| characters.
5348 */
5349 Datum
text_left(PG_FUNCTION_ARGS)5350 text_left(PG_FUNCTION_ARGS)
5351 {
5352 int n = PG_GETARG_INT32(1);
5353
5354 if (n < 0)
5355 {
5356 text *str = PG_GETARG_TEXT_PP(0);
5357 const char *p = VARDATA_ANY(str);
5358 int len = VARSIZE_ANY_EXHDR(str);
5359 int rlen;
5360
5361 n = pg_mbstrlen_with_len(p, len) + n;
5362 rlen = pg_mbcharcliplen(p, len, n);
5363 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5364 }
5365 else
5366 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5367 }
5368
5369 /*
5370 * Return last n characters in the string. When n is negative,
5371 * return all but first |n| characters.
5372 */
5373 Datum
text_right(PG_FUNCTION_ARGS)5374 text_right(PG_FUNCTION_ARGS)
5375 {
5376 text *str = PG_GETARG_TEXT_PP(0);
5377 const char *p = VARDATA_ANY(str);
5378 int len = VARSIZE_ANY_EXHDR(str);
5379 int n = PG_GETARG_INT32(1);
5380 int off;
5381
5382 if (n < 0)
5383 n = -n;
5384 else
5385 n = pg_mbstrlen_with_len(p, len) - n;
5386 off = pg_mbcharcliplen(p, len, n);
5387
5388 PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5389 }
5390
5391 /*
5392 * Return reversed string
5393 */
5394 Datum
text_reverse(PG_FUNCTION_ARGS)5395 text_reverse(PG_FUNCTION_ARGS)
5396 {
5397 text *str = PG_GETARG_TEXT_PP(0);
5398 const char *p = VARDATA_ANY(str);
5399 int len = VARSIZE_ANY_EXHDR(str);
5400 const char *endp = p + len;
5401 text *result;
5402 char *dst;
5403
5404 result = palloc(len + VARHDRSZ);
5405 dst = (char *) VARDATA(result) + len;
5406 SET_VARSIZE(result, len + VARHDRSZ);
5407
5408 if (pg_database_encoding_max_length() > 1)
5409 {
5410 /* multibyte version */
5411 while (p < endp)
5412 {
5413 int sz;
5414
5415 sz = pg_mblen(p);
5416 dst -= sz;
5417 memcpy(dst, p, sz);
5418 p += sz;
5419 }
5420 }
5421 else
5422 {
5423 /* single byte version */
5424 while (p < endp)
5425 *(--dst) = *p++;
5426 }
5427
5428 PG_RETURN_TEXT_P(result);
5429 }
5430
5431
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Advance "ptr" by one (it is modified in place) and raise an
 * ERRCODE_INVALID_PARAMETER_VALUE error if it has then reached or passed
 * "end_ptr".  Wrapped in do { } while (0) so that it can be used as a
 * single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5445
5446 /*
5447 * Returns a formatted string
5448 */
5449 Datum
text_format(PG_FUNCTION_ARGS)5450 text_format(PG_FUNCTION_ARGS)
5451 {
5452 text *fmt;
5453 StringInfoData str;
5454 const char *cp;
5455 const char *start_ptr;
5456 const char *end_ptr;
5457 text *result;
5458 int arg;
5459 bool funcvariadic;
5460 int nargs;
5461 Datum *elements = NULL;
5462 bool *nulls = NULL;
5463 Oid element_type = InvalidOid;
5464 Oid prev_type = InvalidOid;
5465 Oid prev_width_type = InvalidOid;
5466 FmgrInfo typoutputfinfo;
5467 FmgrInfo typoutputinfo_width;
5468
5469 /* When format string is null, immediately return null */
5470 if (PG_ARGISNULL(0))
5471 PG_RETURN_NULL();
5472
5473 /* If argument is marked VARIADIC, expand array into elements */
5474 if (get_fn_expr_variadic(fcinfo->flinfo))
5475 {
5476 ArrayType *arr;
5477 int16 elmlen;
5478 bool elmbyval;
5479 char elmalign;
5480 int nitems;
5481
5482 /* Should have just the one argument */
5483 Assert(PG_NARGS() == 2);
5484
5485 /* If argument is NULL, we treat it as zero-length array */
5486 if (PG_ARGISNULL(1))
5487 nitems = 0;
5488 else
5489 {
5490 /*
5491 * Non-null argument had better be an array. We assume that any
5492 * call context that could let get_fn_expr_variadic return true
5493 * will have checked that a VARIADIC-labeled parameter actually is
5494 * an array. So it should be okay to just Assert that it's an
5495 * array rather than doing a full-fledged error check.
5496 */
5497 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5498
5499 /* OK, safe to fetch the array value */
5500 arr = PG_GETARG_ARRAYTYPE_P(1);
5501
5502 /* Get info about array element type */
5503 element_type = ARR_ELEMTYPE(arr);
5504 get_typlenbyvalalign(element_type,
5505 &elmlen, &elmbyval, &elmalign);
5506
5507 /* Extract all array elements */
5508 deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5509 &elements, &nulls, &nitems);
5510 }
5511
5512 nargs = nitems + 1;
5513 funcvariadic = true;
5514 }
5515 else
5516 {
5517 /* Non-variadic case, we'll process the arguments individually */
5518 nargs = PG_NARGS();
5519 funcvariadic = false;
5520 }
5521
5522 /* Setup for main loop. */
5523 fmt = PG_GETARG_TEXT_PP(0);
5524 start_ptr = VARDATA_ANY(fmt);
5525 end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5526 initStringInfo(&str);
5527 arg = 1; /* next argument position to print */
5528
5529 /* Scan format string, looking for conversion specifiers. */
5530 for (cp = start_ptr; cp < end_ptr; cp++)
5531 {
5532 int argpos;
5533 int widthpos;
5534 int flags;
5535 int width;
5536 Datum value;
5537 bool isNull;
5538 Oid typid;
5539
5540 /*
5541 * If it's not the start of a conversion specifier, just copy it to
5542 * the output buffer.
5543 */
5544 if (*cp != '%')
5545 {
5546 appendStringInfoCharMacro(&str, *cp);
5547 continue;
5548 }
5549
5550 ADVANCE_PARSE_POINTER(cp, end_ptr);
5551
5552 /* Easy case: %% outputs a single % */
5553 if (*cp == '%')
5554 {
5555 appendStringInfoCharMacro(&str, *cp);
5556 continue;
5557 }
5558
5559 /* Parse the optional portions of the format specifier */
5560 cp = text_format_parse_format(cp, end_ptr,
5561 &argpos, &widthpos,
5562 &flags, &width);
5563
5564 /*
5565 * Next we should see the main conversion specifier. Whether or not
5566 * an argument position was present, it's known that at least one
5567 * character remains in the string at this point. Experience suggests
5568 * that it's worth checking that that character is one of the expected
5569 * ones before we try to fetch arguments, so as to produce the least
5570 * confusing response to a mis-formatted specifier.
5571 */
5572 if (strchr("sIL", *cp) == NULL)
5573 ereport(ERROR,
5574 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5575 errmsg("unrecognized format() type specifier \"%c\"",
5576 *cp),
5577 errhint("For a single \"%%\" use \"%%%%\".")));
5578
5579 /* If indirect width was specified, get its value */
5580 if (widthpos >= 0)
5581 {
5582 /* Collect the specified or next argument position */
5583 if (widthpos > 0)
5584 arg = widthpos;
5585 if (arg >= nargs)
5586 ereport(ERROR,
5587 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5588 errmsg("too few arguments for format()")));
5589
5590 /* Get the value and type of the selected argument */
5591 if (!funcvariadic)
5592 {
5593 value = PG_GETARG_DATUM(arg);
5594 isNull = PG_ARGISNULL(arg);
5595 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5596 }
5597 else
5598 {
5599 value = elements[arg - 1];
5600 isNull = nulls[arg - 1];
5601 typid = element_type;
5602 }
5603 if (!OidIsValid(typid))
5604 elog(ERROR, "could not determine data type of format() input");
5605
5606 arg++;
5607
5608 /* We can treat NULL width the same as zero */
5609 if (isNull)
5610 width = 0;
5611 else if (typid == INT4OID)
5612 width = DatumGetInt32(value);
5613 else if (typid == INT2OID)
5614 width = DatumGetInt16(value);
5615 else
5616 {
5617 /* For less-usual datatypes, convert to text then to int */
5618 char *str;
5619
5620 if (typid != prev_width_type)
5621 {
5622 Oid typoutputfunc;
5623 bool typIsVarlena;
5624
5625 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5626 fmgr_info(typoutputfunc, &typoutputinfo_width);
5627 prev_width_type = typid;
5628 }
5629
5630 str = OutputFunctionCall(&typoutputinfo_width, value);
5631
5632 /* pg_strtoint32 will complain about bad data or overflow */
5633 width = pg_strtoint32(str);
5634
5635 pfree(str);
5636 }
5637 }
5638
5639 /* Collect the specified or next argument position */
5640 if (argpos > 0)
5641 arg = argpos;
5642 if (arg >= nargs)
5643 ereport(ERROR,
5644 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5645 errmsg("too few arguments for format()")));
5646
5647 /* Get the value and type of the selected argument */
5648 if (!funcvariadic)
5649 {
5650 value = PG_GETARG_DATUM(arg);
5651 isNull = PG_ARGISNULL(arg);
5652 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5653 }
5654 else
5655 {
5656 value = elements[arg - 1];
5657 isNull = nulls[arg - 1];
5658 typid = element_type;
5659 }
5660 if (!OidIsValid(typid))
5661 elog(ERROR, "could not determine data type of format() input");
5662
5663 arg++;
5664
5665 /*
5666 * Get the appropriate typOutput function, reusing previous one if
5667 * same type as previous argument. That's particularly useful in the
5668 * variadic-array case, but often saves work even for ordinary calls.
5669 */
5670 if (typid != prev_type)
5671 {
5672 Oid typoutputfunc;
5673 bool typIsVarlena;
5674
5675 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5676 fmgr_info(typoutputfunc, &typoutputfinfo);
5677 prev_type = typid;
5678 }
5679
5680 /*
5681 * And now we can format the value.
5682 */
5683 switch (*cp)
5684 {
5685 case 's':
5686 case 'I':
5687 case 'L':
5688 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5689 value, isNull,
5690 flags, width);
5691 break;
5692 default:
5693 /* should not get here, because of previous check */
5694 ereport(ERROR,
5695 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5696 errmsg("unrecognized format() type specifier \"%c\"",
5697 *cp),
5698 errhint("For a single \"%%\" use \"%%%%\".")));
5699 break;
5700 }
5701 }
5702
5703 /* Don't need deconstruct_array results anymore. */
5704 if (elements != NULL)
5705 pfree(elements);
5706 if (nulls != NULL)
5707 pfree(nulls);
5708
5709 /* Generate results. */
5710 result = cstring_to_text_with_len(str.data, str.len);
5711 pfree(str.data);
5712
5713 PG_RETURN_TEXT_P(result);
5714 }
5715
5716 /*
5717 * Parse contiguous digits as a decimal number.
5718 *
5719 * Returns true if some digits could be parsed.
5720 * The value is returned into *value, and *ptr is advanced to the next
5721 * character to be parsed.
5722 *
5723 * Note parsing invariant: at least one character is known available before
5724 * string end (end_ptr) at entry, and this is still true at exit.
5725 */
5726 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5727 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5728 {
5729 bool found = false;
5730 const char *cp = *ptr;
5731 int val = 0;
5732
5733 while (*cp >= '0' && *cp <= '9')
5734 {
5735 int8 digit = (*cp - '0');
5736
5737 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5738 unlikely(pg_add_s32_overflow(val, digit, &val)))
5739 ereport(ERROR,
5740 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5741 errmsg("number is out of range")));
5742 ADVANCE_PARSE_POINTER(cp, end_ptr);
5743 found = true;
5744 }
5745
5746 *ptr = cp;
5747 *value = val;
5748
5749 return found;
5750 }
5751
5752 /*
5753 * Parse a format specifier (generally following the SUS printf spec).
5754 *
5755 * We have already advanced over the initial '%', and we are looking for
5756 * [argpos][flags][width]type (but the type character is not consumed here).
5757 *
5758 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5759 * Output parameters:
5760 * argpos: argument position for value to be printed. -1 means unspecified.
5761 * widthpos: argument position for width. Zero means the argument position
5762 * was unspecified (ie, take the next arg) and -1 means no width
5763 * argument (width was omitted or specified as a constant).
5764 * flags: bitmask of flags.
5765 * width: directly-specified width value. Zero means the width was omitted
5766 * (note it's not necessary to distinguish this case from an explicit
5767 * zero width value).
5768 *
5769 * The function result is the next character position to be parsed, ie, the
5770 * location where the type character is/should be.
5771 *
5772 * Note parsing invariant: at least one character is known available before
5773 * string end (end_ptr) at entry, and this is still true at exit.
5774 */
5775 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5776 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5777 int *argpos, int *widthpos,
5778 int *flags, int *width)
5779 {
5780 const char *cp = start_ptr;
5781 int n;
5782
5783 /* set defaults for output parameters */
5784 *argpos = -1;
5785 *widthpos = -1;
5786 *flags = 0;
5787 *width = 0;
5788
5789 /* try to identify first number */
5790 if (text_format_parse_digits(&cp, end_ptr, &n))
5791 {
5792 if (*cp != '$')
5793 {
5794 /* Must be just a width and a type, so we're done */
5795 *width = n;
5796 return cp;
5797 }
5798 /* The number was argument position */
5799 *argpos = n;
5800 /* Explicit 0 for argument index is immediately refused */
5801 if (n == 0)
5802 ereport(ERROR,
5803 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5804 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5805 ADVANCE_PARSE_POINTER(cp, end_ptr);
5806 }
5807
5808 /* Handle flags (only minus is supported now) */
5809 while (*cp == '-')
5810 {
5811 *flags |= TEXT_FORMAT_FLAG_MINUS;
5812 ADVANCE_PARSE_POINTER(cp, end_ptr);
5813 }
5814
5815 if (*cp == '*')
5816 {
5817 /* Handle indirect width */
5818 ADVANCE_PARSE_POINTER(cp, end_ptr);
5819 if (text_format_parse_digits(&cp, end_ptr, &n))
5820 {
5821 /* number in this position must be closed by $ */
5822 if (*cp != '$')
5823 ereport(ERROR,
5824 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5825 errmsg("width argument position must be ended by \"$\"")));
5826 /* The number was width argument position */
5827 *widthpos = n;
5828 /* Explicit 0 for argument index is immediately refused */
5829 if (n == 0)
5830 ereport(ERROR,
5831 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5832 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5833 ADVANCE_PARSE_POINTER(cp, end_ptr);
5834 }
5835 else
5836 *widthpos = 0; /* width's argument position is unspecified */
5837 }
5838 else
5839 {
5840 /* Check for direct width specification */
5841 if (text_format_parse_digits(&cp, end_ptr, &n))
5842 *width = n;
5843 }
5844
5845 /* cp should now be pointing at type character */
5846 return cp;
5847 }
5848
5849 /*
5850 * Format a %s, %I, or %L conversion
5851 */
5852 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5853 text_format_string_conversion(StringInfo buf, char conversion,
5854 FmgrInfo *typOutputInfo,
5855 Datum value, bool isNull,
5856 int flags, int width)
5857 {
5858 char *str;
5859
5860 /* Handle NULL arguments before trying to stringify the value. */
5861 if (isNull)
5862 {
5863 if (conversion == 's')
5864 text_format_append_string(buf, "", flags, width);
5865 else if (conversion == 'L')
5866 text_format_append_string(buf, "NULL", flags, width);
5867 else if (conversion == 'I')
5868 ereport(ERROR,
5869 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5870 errmsg("null values cannot be formatted as an SQL identifier")));
5871 return;
5872 }
5873
5874 /* Stringify. */
5875 str = OutputFunctionCall(typOutputInfo, value);
5876
5877 /* Escape. */
5878 if (conversion == 'I')
5879 {
5880 /* quote_identifier may or may not allocate a new string. */
5881 text_format_append_string(buf, quote_identifier(str), flags, width);
5882 }
5883 else if (conversion == 'L')
5884 {
5885 char *qstr = quote_literal_cstr(str);
5886
5887 text_format_append_string(buf, qstr, flags, width);
5888 /* quote_literal_cstr() always allocates a new string */
5889 pfree(qstr);
5890 }
5891 else
5892 text_format_append_string(buf, str, flags, width);
5893
5894 /* Cleanup. */
5895 pfree(str);
5896 }
5897
5898 /*
5899 * Append str to buf, padding as directed by flags/width
5900 */
5901 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5902 text_format_append_string(StringInfo buf, const char *str,
5903 int flags, int width)
5904 {
5905 bool align_to_left = false;
5906 int len;
5907
5908 /* fast path for typical easy case */
5909 if (width == 0)
5910 {
5911 appendStringInfoString(buf, str);
5912 return;
5913 }
5914
5915 if (width < 0)
5916 {
5917 /* Negative width: implicit '-' flag, then take absolute value */
5918 align_to_left = true;
5919 /* -INT_MIN is undefined */
5920 if (width <= INT_MIN)
5921 ereport(ERROR,
5922 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5923 errmsg("number is out of range")));
5924 width = -width;
5925 }
5926 else if (flags & TEXT_FORMAT_FLAG_MINUS)
5927 align_to_left = true;
5928
5929 len = pg_mbstrlen(str);
5930 if (align_to_left)
5931 {
5932 /* left justify */
5933 appendStringInfoString(buf, str);
5934 if (len < width)
5935 appendStringInfoSpaces(buf, width - len);
5936 }
5937 else
5938 {
5939 /* right justify */
5940 if (len < width)
5941 appendStringInfoSpaces(buf, width - len);
5942 appendStringInfoString(buf, str);
5943 }
5944 }
5945
5946 /*
5947 * text_format_nv - nonvariadic wrapper for text_format function.
5948 *
5949 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5950 * which checks that all built-in functions that share the implementing C
5951 * function take the same number of arguments.
5952 */
5953 Datum
text_format_nv(PG_FUNCTION_ARGS)5954 text_format_nv(PG_FUNCTION_ARGS)
5955 {
5956 return text_format(fcinfo);
5957 }
5958
/*
 * Helper function for Levenshtein distance functions. Faster than memcmp(),
 * for this use case.
 *
 * Returns true when the first len bytes of s1 and s2 are identical.  The
 * bytes are compared from the last one back towards the first.  A len of
 * zero or less compares nothing and yields true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5974
5975 /* Expand each Levenshtein distance variant */
5976 #include "levenshtein.c"
5977 #define LEVENSHTEIN_LESS_EQUAL
5978 #include "levenshtein.c"
5979