1 /*-------------------------------------------------------------------------
2 *
3 * varlena.c
4 * Functions for the variable-length built-in types.
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/varlena.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <limits.h>
19
20 #include "access/detoast.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/hashfn.h"
24 #include "common/int.h"
25 #include "common/unicode_norm.h"
26 #include "lib/hyperloglog.h"
27 #include "libpq/pqformat.h"
28 #include "miscadmin.h"
29 #include "parser/scansup.h"
30 #include "port/pg_bswap.h"
31 #include "regex/regex.h"
32 #include "utils/builtins.h"
33 #include "utils/bytea.h"
34 #include "utils/lsyscache.h"
35 #include "utils/memutils.h"
36 #include "utils/pg_locale.h"
37 #include "utils/sortsupport.h"
38 #include "utils/varlena.h"
39
40
/*
 * GUC variable: selects the textual format produced by byteaout().
 * byteaout() recognizes BYTEA_OUTPUT_HEX and BYTEA_OUTPUT_ESCAPE and
 * raises an error for any other value.
 */
int			bytea_output = BYTEA_OUTPUT_HEX;

/*
 * File-local aliases for the generic varlena struct.  They are the target
 * types of the DatumGetUnknownP*() and DatumGetVarStringP*() macros below.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
46
/*
 * State for text_position_* functions.
 *
 * A caller (see text_position()) declares one of these, passes it to
 * text_position_setup(), then hands the same pointer to
 * text_position_next() and text_position_get_match_ptr()/_pos(), and
 * finally to text_position_cleanup().
 */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */

	/*
	 * NOTE(review): undocumented in the original.  Presumably true when one
	 * character's byte sequence can occur inside another character, so that
	 * byte-level matches need a character-boundary check -- confirm against
	 * text_position_setup().
	 */
	bool		is_multibyte_char_in_char;

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
75
/*
 * Working state for sorting variable-length string types.
 *
 * NOTE(review): presumably this is the private state used by the
 * *fastcmp_* comparators and the varstr_abbrev_* callbacks declared below;
 * the code that allocates and fills it is outside this part of the file.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* allocated size of buf1 (TODO confirm) */
	int			buflen2;		/* allocated size of buf2 (TODO confirm) */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* presumably: is the collation "C"? (TODO
								 * confirm) */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* locale object used for comparisons (TODO
								 * confirm) */
} VarStringSortSupport;
94
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Datum accessors for the file-local "unknown" alias of struct varlena.
 * Each one is a thin cast around the corresponding generic detoast or
 * fmgr macro.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Datum accessors for the file-local VarString alias; the "PP" form uses
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
109
/* Sort-support comparators and abbreviated-key callbacks */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* Helpers for the text type: length, concatenation, substring, search */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
							int32 start,
							int32 length,
							bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int	text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int	text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static void check_collation_set(Oid collid);
static int	text_cmp(text *arg1, text *arg2, Oid collid);

/* Helpers for the bytea type, parallel to the text ones above */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
							  int S,
							  int L,
							  bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* StringInfo, array conversion and string-aggregate helpers */
static void appendStringInfoText(StringInfo str, const text *t);
static Datum text_to_array_internal(PG_FUNCTION_ARGS);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
									const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);

/* text_format_* helpers: format-specifier parsing and padded output */
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
									 int *value);
static const char *text_format_parse_format(const char *start_ptr,
											const char *end_ptr,
											int *argpos, int *widthpos,
											int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
										  FmgrInfo *typOutputInfo,
										  Datum value, bool isNull,
										  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
									  int flags, int width);
158
159
160 /*****************************************************************************
161 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
162 *****************************************************************************/
163
164 /*
165 * cstring_to_text
166 *
167 * Create a text value from a null-terminated C string.
168 *
169 * The new text value is freshly palloc'd with a full-size VARHDR.
170 */
171 text *
cstring_to_text(const char * s)172 cstring_to_text(const char *s)
173 {
174 return cstring_to_text_with_len(s, strlen(s));
175 }
176
177 /*
178 * cstring_to_text_with_len
179 *
180 * Same as cstring_to_text except the caller specifies the string length;
181 * the string need not be null_terminated.
182 */
183 text *
cstring_to_text_with_len(const char * s,int len)184 cstring_to_text_with_len(const char *s, int len)
185 {
186 text *result = (text *) palloc(len + VARHDRSZ);
187
188 SET_VARSIZE(result, len + VARHDRSZ);
189 memcpy(VARDATA(result), s, len);
190
191 return result;
192 }
193
194 /*
195 * text_to_cstring
196 *
197 * Create a palloc'd, null-terminated C string from a text value.
198 *
199 * We support being passed a compressed or toasted text value.
200 * This is a bit bogus since such values shouldn't really be referred to as
201 * "text *", but it seems useful for robustness. If we didn't handle that
202 * case here, we'd need another routine that did, anyway.
203 */
204 char *
text_to_cstring(const text * t)205 text_to_cstring(const text *t)
206 {
207 /* must cast away the const, unfortunately */
208 text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
209 int len = VARSIZE_ANY_EXHDR(tunpacked);
210 char *result;
211
212 result = (char *) palloc(len + 1);
213 memcpy(result, VARDATA_ANY(tunpacked), len);
214 result[len] = '\0';
215
216 if (tunpacked != t)
217 pfree(tunpacked);
218
219 return result;
220 }
221
222 /*
223 * text_to_cstring_buffer
224 *
225 * Copy a text value into a caller-supplied buffer of size dst_len.
226 *
227 * The text string is truncated if necessary to fit. The result is
228 * guaranteed null-terminated (unless dst_len == 0).
229 *
230 * We support being passed a compressed or toasted text value.
231 * This is a bit bogus since such values shouldn't really be referred to as
232 * "text *", but it seems useful for robustness. If we didn't handle that
233 * case here, we'd need another routine that did, anyway.
234 */
235 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)236 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
237 {
238 /* must cast away the const, unfortunately */
239 text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
240 size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
241
242 if (dst_len > 0)
243 {
244 dst_len--;
245 if (dst_len >= src_len)
246 dst_len = src_len;
247 else /* ensure truncation is encoding-safe */
248 dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
249 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
250 dst[dst_len] = '\0';
251 }
252
253 if (srcunpacked != src)
254 pfree(srcunpacked);
255 }
256
257
258 /*****************************************************************************
259 * USER I/O ROUTINES *
260 *****************************************************************************/
261
262
/*
 * VAL maps an ASCII digit character to its numeric value; DIG maps a
 * numeric value back to the corresponding ASCII digit character.  Neither
 * macro range-checks its argument.
 */
#define VAL(CH)			((CH) - '0')
#define DIG(VAL)		((VAL) + '0')
265
266 /*
267 * byteain - converts from printable representation of byte array
268 *
269 * Non-printable characters must be passed as '\nnn' (octal) and are
270 * converted to internal form. '\' must be passed as '\\'.
271 * ereport(ERROR, ...) if bad form.
272 *
273 * BUGS:
274 * The input is scanned twice.
275 * The error checking of input is minimal.
276 */
277 Datum
byteain(PG_FUNCTION_ARGS)278 byteain(PG_FUNCTION_ARGS)
279 {
280 char *inputText = PG_GETARG_CSTRING(0);
281 char *tp;
282 char *rp;
283 int bc;
284 bytea *result;
285
286 /* Recognize hex input */
287 if (inputText[0] == '\\' && inputText[1] == 'x')
288 {
289 size_t len = strlen(inputText);
290
291 bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
292 result = palloc(bc);
293 bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
294 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
295
296 PG_RETURN_BYTEA_P(result);
297 }
298
299 /* Else, it's the traditional escaped style */
300 for (bc = 0, tp = inputText; *tp != '\0'; bc++)
301 {
302 if (tp[0] != '\\')
303 tp++;
304 else if ((tp[0] == '\\') &&
305 (tp[1] >= '0' && tp[1] <= '3') &&
306 (tp[2] >= '0' && tp[2] <= '7') &&
307 (tp[3] >= '0' && tp[3] <= '7'))
308 tp += 4;
309 else if ((tp[0] == '\\') &&
310 (tp[1] == '\\'))
311 tp += 2;
312 else
313 {
314 /*
315 * one backslash, not followed by another or ### valid octal
316 */
317 ereport(ERROR,
318 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
319 errmsg("invalid input syntax for type %s", "bytea")));
320 }
321 }
322
323 bc += VARHDRSZ;
324
325 result = (bytea *) palloc(bc);
326 SET_VARSIZE(result, bc);
327
328 tp = inputText;
329 rp = VARDATA(result);
330 while (*tp != '\0')
331 {
332 if (tp[0] != '\\')
333 *rp++ = *tp++;
334 else if ((tp[0] == '\\') &&
335 (tp[1] >= '0' && tp[1] <= '3') &&
336 (tp[2] >= '0' && tp[2] <= '7') &&
337 (tp[3] >= '0' && tp[3] <= '7'))
338 {
339 bc = VAL(tp[1]);
340 bc <<= 3;
341 bc += VAL(tp[2]);
342 bc <<= 3;
343 *rp++ = bc + VAL(tp[3]);
344
345 tp += 4;
346 }
347 else if ((tp[0] == '\\') &&
348 (tp[1] == '\\'))
349 {
350 *rp++ = '\\';
351 tp += 2;
352 }
353 else
354 {
355 /*
356 * We should never get here. The first pass should not allow it.
357 */
358 ereport(ERROR,
359 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
360 errmsg("invalid input syntax for type %s", "bytea")));
361 }
362 }
363
364 PG_RETURN_BYTEA_P(result);
365 }
366
367 /*
368 * byteaout - converts to printable representation of byte array
369 *
370 * In the traditional escaped format, non-printable characters are
371 * printed as '\nnn' (octal) and '\' as '\\'.
372 */
373 Datum
byteaout(PG_FUNCTION_ARGS)374 byteaout(PG_FUNCTION_ARGS)
375 {
376 bytea *vlena = PG_GETARG_BYTEA_PP(0);
377 char *result;
378 char *rp;
379
380 if (bytea_output == BYTEA_OUTPUT_HEX)
381 {
382 /* Print hex format */
383 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
384 *rp++ = '\\';
385 *rp++ = 'x';
386 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
387 }
388 else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
389 {
390 /* Print traditional escaped format */
391 char *vp;
392 uint64 len;
393 int i;
394
395 len = 1; /* empty string has 1 char */
396 vp = VARDATA_ANY(vlena);
397 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
398 {
399 if (*vp == '\\')
400 len += 2;
401 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
402 len += 4;
403 else
404 len++;
405 }
406
407 /*
408 * In principle len can't overflow uint32 if the input fit in 1GB, but
409 * for safety let's check rather than relying on palloc's internal
410 * check.
411 */
412 if (len > MaxAllocSize)
413 ereport(ERROR,
414 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
415 errmsg_internal("result of bytea output conversion is too large")));
416 rp = result = (char *) palloc(len);
417
418 vp = VARDATA_ANY(vlena);
419 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
420 {
421 if (*vp == '\\')
422 {
423 *rp++ = '\\';
424 *rp++ = '\\';
425 }
426 else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
427 {
428 int val; /* holds unprintable chars */
429
430 val = *vp;
431 rp[0] = '\\';
432 rp[3] = DIG(val & 07);
433 val >>= 3;
434 rp[2] = DIG(val & 07);
435 val >>= 3;
436 rp[1] = DIG(val & 03);
437 rp += 4;
438 }
439 else
440 *rp++ = *vp;
441 }
442 }
443 else
444 {
445 elog(ERROR, "unrecognized bytea_output setting: %d",
446 bytea_output);
447 rp = result = NULL; /* keep compiler quiet */
448 }
449 *rp = '\0';
450 PG_RETURN_CSTRING(result);
451 }
452
453 /*
454 * bytearecv - converts external binary format to bytea
455 */
456 Datum
bytearecv(PG_FUNCTION_ARGS)457 bytearecv(PG_FUNCTION_ARGS)
458 {
459 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
460 bytea *result;
461 int nbytes;
462
463 nbytes = buf->len - buf->cursor;
464 result = (bytea *) palloc(nbytes + VARHDRSZ);
465 SET_VARSIZE(result, nbytes + VARHDRSZ);
466 pq_copymsgbytes(buf, VARDATA(result), nbytes);
467 PG_RETURN_BYTEA_P(result);
468 }
469
470 /*
471 * byteasend - converts bytea to binary format
472 *
473 * This is a special case: just copy the input...
474 */
475 Datum
byteasend(PG_FUNCTION_ARGS)476 byteasend(PG_FUNCTION_ARGS)
477 {
478 bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
479
480 PG_RETURN_BYTEA_P(vlena);
481 }
482
483 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)484 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
485 {
486 StringInfo state;
487
488 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
489
490 /* Append the value unless null. */
491 if (!PG_ARGISNULL(1))
492 {
493 bytea *value = PG_GETARG_BYTEA_PP(1);
494
495 /* On the first time through, we ignore the delimiter. */
496 if (state == NULL)
497 state = makeStringAggState(fcinfo);
498 else if (!PG_ARGISNULL(2))
499 {
500 bytea *delim = PG_GETARG_BYTEA_PP(2);
501
502 appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
503 }
504
505 appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
506 }
507
508 /*
509 * The transition type for string_agg() is declared to be "internal",
510 * which is a pass-by-value type the same size as a pointer.
511 */
512 PG_RETURN_POINTER(state);
513 }
514
515 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)516 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
517 {
518 StringInfo state;
519
520 /* cannot be called directly because of internal-type argument */
521 Assert(AggCheckCallContext(fcinfo, NULL));
522
523 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
524
525 if (state != NULL)
526 {
527 bytea *result;
528
529 result = (bytea *) palloc(state->len + VARHDRSZ);
530 SET_VARSIZE(result, state->len + VARHDRSZ);
531 memcpy(VARDATA(result), state->data, state->len);
532 PG_RETURN_BYTEA_P(result);
533 }
534 else
535 PG_RETURN_NULL();
536 }
537
538 /*
539 * textin - converts "..." to internal representation
540 */
541 Datum
textin(PG_FUNCTION_ARGS)542 textin(PG_FUNCTION_ARGS)
543 {
544 char *inputText = PG_GETARG_CSTRING(0);
545
546 PG_RETURN_TEXT_P(cstring_to_text(inputText));
547 }
548
549 /*
550 * textout - converts internal representation to "..."
551 */
552 Datum
textout(PG_FUNCTION_ARGS)553 textout(PG_FUNCTION_ARGS)
554 {
555 Datum txt = PG_GETARG_DATUM(0);
556
557 PG_RETURN_CSTRING(TextDatumGetCString(txt));
558 }
559
560 /*
561 * textrecv - converts external binary format to text
562 */
563 Datum
textrecv(PG_FUNCTION_ARGS)564 textrecv(PG_FUNCTION_ARGS)
565 {
566 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
567 text *result;
568 char *str;
569 int nbytes;
570
571 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
572
573 result = cstring_to_text_with_len(str, nbytes);
574 pfree(str);
575 PG_RETURN_TEXT_P(result);
576 }
577
578 /*
579 * textsend - converts text to binary format
580 */
581 Datum
textsend(PG_FUNCTION_ARGS)582 textsend(PG_FUNCTION_ARGS)
583 {
584 text *t = PG_GETARG_TEXT_PP(0);
585 StringInfoData buf;
586
587 pq_begintypsend(&buf);
588 pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
589 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
590 }
591
592
593 /*
594 * unknownin - converts "..." to internal representation
595 */
596 Datum
unknownin(PG_FUNCTION_ARGS)597 unknownin(PG_FUNCTION_ARGS)
598 {
599 char *str = PG_GETARG_CSTRING(0);
600
601 /* representation is same as cstring */
602 PG_RETURN_CSTRING(pstrdup(str));
603 }
604
605 /*
606 * unknownout - converts internal representation to "..."
607 */
608 Datum
unknownout(PG_FUNCTION_ARGS)609 unknownout(PG_FUNCTION_ARGS)
610 {
611 /* representation is same as cstring */
612 char *str = PG_GETARG_CSTRING(0);
613
614 PG_RETURN_CSTRING(pstrdup(str));
615 }
616
617 /*
618 * unknownrecv - converts external binary format to unknown
619 */
620 Datum
unknownrecv(PG_FUNCTION_ARGS)621 unknownrecv(PG_FUNCTION_ARGS)
622 {
623 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
624 char *str;
625 int nbytes;
626
627 str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
628 /* representation is same as cstring */
629 PG_RETURN_CSTRING(str);
630 }
631
632 /*
633 * unknownsend - converts unknown to binary format
634 */
635 Datum
unknownsend(PG_FUNCTION_ARGS)636 unknownsend(PG_FUNCTION_ARGS)
637 {
638 /* representation is same as cstring */
639 char *str = PG_GETARG_CSTRING(0);
640 StringInfoData buf;
641
642 pq_begintypsend(&buf);
643 pq_sendtext(&buf, str, strlen(str));
644 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
645 }
646
647
648 /* ========== PUBLIC ROUTINES ========== */
649
650 /*
651 * textlen -
652 * returns the logical length of a text*
653 * (which is less than the VARSIZE of the text*)
654 */
655 Datum
textlen(PG_FUNCTION_ARGS)656 textlen(PG_FUNCTION_ARGS)
657 {
658 Datum str = PG_GETARG_DATUM(0);
659
660 /* try to avoid decompressing argument */
661 PG_RETURN_INT32(text_length(str));
662 }
663
664 /*
665 * text_length -
666 * Does the real work for textlen()
667 *
668 * This is broken out so it can be called directly by other string processing
669 * functions. Note that the argument is passed as a Datum, to indicate that
670 * it may still be in compressed form. We can avoid decompressing it at all
671 * in some cases.
672 */
673 static int32
text_length(Datum str)674 text_length(Datum str)
675 {
676 /* fastpath when max encoding length is one */
677 if (pg_database_encoding_max_length() == 1)
678 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
679 else
680 {
681 text *t = DatumGetTextPP(str);
682
683 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
684 VARSIZE_ANY_EXHDR(t)));
685 }
686 }
687
688 /*
689 * textoctetlen -
690 * returns the physical length of a text*
691 * (which is less than the VARSIZE of the text*)
692 */
693 Datum
textoctetlen(PG_FUNCTION_ARGS)694 textoctetlen(PG_FUNCTION_ARGS)
695 {
696 Datum str = PG_GETARG_DATUM(0);
697
698 /* We need not detoast the input at all */
699 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
700 }
701
702 /*
703 * textcat -
704 * takes two text* and returns a text* that is the concatenation of
705 * the two.
706 *
707 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
708 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
709 * Allocate space for output in all cases.
710 * XXX - thomas 1997-07-10
711 */
712 Datum
textcat(PG_FUNCTION_ARGS)713 textcat(PG_FUNCTION_ARGS)
714 {
715 text *t1 = PG_GETARG_TEXT_PP(0);
716 text *t2 = PG_GETARG_TEXT_PP(1);
717
718 PG_RETURN_TEXT_P(text_catenate(t1, t2));
719 }
720
721 /*
722 * text_catenate
723 * Guts of textcat(), broken out so it can be used by other functions
724 *
725 * Arguments can be in short-header form, but not compressed or out-of-line
726 */
727 static text *
text_catenate(text * t1,text * t2)728 text_catenate(text *t1, text *t2)
729 {
730 text *result;
731 int len1,
732 len2,
733 len;
734 char *ptr;
735
736 len1 = VARSIZE_ANY_EXHDR(t1);
737 len2 = VARSIZE_ANY_EXHDR(t2);
738
739 /* paranoia ... probably should throw error instead? */
740 if (len1 < 0)
741 len1 = 0;
742 if (len2 < 0)
743 len2 = 0;
744
745 len = len1 + len2 + VARHDRSZ;
746 result = (text *) palloc(len);
747
748 /* Set size of result string... */
749 SET_VARSIZE(result, len);
750
751 /* Fill data field of result string... */
752 ptr = VARDATA(result);
753 if (len1 > 0)
754 memcpy(ptr, VARDATA_ANY(t1), len1);
755 if (len2 > 0)
756 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
757
758 return result;
759 }
760
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must guarantee that n characters really are present; no
 * bounds check is made, and the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Optimization for single-byte encodings */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Step over one character at a time, using its encoded length */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
786
787 /*
788 * text_substr()
789 * Return a substring starting at the specified position.
790 * - thomas 1997-12-31
791 *
792 * Input:
793 * - string
794 * - starting position (is one-based)
795 * - string length
796 *
 * If the starting position is zero or less, then return from the start of the string
 * adjusting the length to be consistent with the "negative start" per SQL.
 * If the length is less than zero, an error is raised; when no length is given
 * (see text_substr_no_len), the remaining string is returned.
800 *
801 * Added multibyte support.
802 * - Tatsuo Ishii 1998-4-21
803 * Changed behavior if starting position is less than one to conform to SQL behavior.
804 * Formerly returned the entire string; now returns a portion.
805 * - Thomas Lockhart 1998-12-10
806 * Now uses faster TOAST-slicing interface
807 * - John Gray 2002-02-22
808 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
809 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
810 * error; if E < 1, return '', not entire string). Fixed MB related bug when
811 * S > LC and < LC + 4 sometimes garbage characters are returned.
812 * - Joe Conway 2002-08-10
813 */
814 Datum
text_substr(PG_FUNCTION_ARGS)815 text_substr(PG_FUNCTION_ARGS)
816 {
817 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
818 PG_GETARG_INT32(1),
819 PG_GETARG_INT32(2),
820 false));
821 }
822
823 /*
824 * text_substr_no_len -
825 * Wrapper to avoid opr_sanity failure due to
826 * one function accepting a different number of args.
827 */
828 Datum
text_substr_no_len(PG_FUNCTION_ARGS)829 text_substr_no_len(PG_FUNCTION_ARGS)
830 {
831 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
832 PG_GETARG_INT32(1),
833 -1, true));
834 }
835
/*
 * text_substring -
 *	Does the real work for text_substr() and text_substr_no_len()
 *
 *	This is broken out so it can be called directly by other string processing
 *	functions.  Note that the argument is passed as a Datum, to indicate that
 *	it may still be in compressed/toasted form.  We can avoid detoasting all
 *	of it in some cases.
 *
 *	Arguments:
 *	  str - the source text, possibly still toasted
 *	  start - one-based start position in characters; may be zero or negative
 *	  length - requested length in characters; ignored when
 *			   length_not_specified is true, an error when negative
 *	  length_not_specified - true to take everything from start to the end
 *
 *	Two strategies are used.  For single-byte encodings, character offsets
 *	are byte offsets, so the request maps directly onto a slice fetch.  For
 *	multibyte encodings, a slice starting at byte zero is fetched (or the
 *	source is used in place if it is not toasted) and walked character by
 *	character to find the byte range.
 *
 *	The result is always a freshly palloc'd datum.
 */
static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
{
	int32		eml = pg_database_encoding_max_length();
	int32		S = start;		/* start position */
	int32		S1;				/* adjusted start position */
	int32		L1;				/* adjusted substring length */
	int32		E;				/* end position */

	/*
	 * SQL99 says S can be zero or negative, but we still must fetch from the
	 * start of the string.
	 */
	S1 = Max(S, 1);

	/* life is easy if the encoding max length is 1 */
	if (eml == 1)
	{
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			L1 = -1;			/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/* length measured from the clamped start, not the original S */
			L1 = E - S1;
		}

		/*
		 * If the start position is past the end of the string, SQL99 says to
		 * return a zero-length string -- DatumGetTextPSlice() will do that
		 * for us.  We need only convert S1 to zero-based starting position.
		 */
		return DatumGetTextPSlice(str, S1 - 1, L1);
	}
	else if (eml > 1)
	{
		/*
		 * When encoding max length is > 1, we can't get LC without
		 * detoasting, so we'll grab a conservatively large slice now and go
		 * back later to do the right thing
		 */
		int32		slice_start;
		int32		slice_size;
		int32		slice_strlen;
		text	   *slice;
		int32		E1;
		int32		i;
		char	   *p;
		char	   *s;
		text	   *ret;

		/*
		 * We need to start at position zero because there is no way to know
		 * in advance which byte offset corresponds to the supplied start
		 * position.
		 */
		slice_start = 0;

		if (length_not_specified)	/* special case - get length to end of
									 * string */
			slice_size = L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			slice_size = L1 = -1;	/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			slice_size = L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/*
			 * if E is past the end of the string, the tuple toaster will
			 * truncate the length for us
			 */
			L1 = E - S1;

			/*
			 * Total slice size in bytes can't be any longer than the start
			 * position plus substring length times the encoding max length.
			 * If that overflows, we can just use -1.
			 */
			if (pg_mul_s32_overflow(E, eml, &slice_size))
				slice_size = -1;
		}

		/*
		 * If we're working with an untoasted source, no need to do an extra
		 * copying step.
		 */
		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
			slice = DatumGetTextPSlice(str, slice_start, slice_size);
		else
			slice = (text *) DatumGetPointer(str);

		/* see if we got back an empty string */
		if (VARSIZE_ANY_EXHDR(slice) == 0)
		{
			/* free the slice only if it is a copy, never the caller's datum */
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/* Now we can get the actual length of the slice in MB characters */
		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
											VARSIZE_ANY_EXHDR(slice));

		/*
		 * Check that the start position wasn't > slice_strlen. If so, SQL99
		 * says to return a zero-length string.
		 */
		if (S1 > slice_strlen)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/*
		 * Adjust L1 and E1 now that we know the slice string length. Again
		 * remember that S1 is one based, and slice_start is zero based.
		 */
		if (L1 > -1)
			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
		else
			E1 = slice_start + 1 + slice_strlen;

		/*
		 * Find the start position in the slice; remember S1 is not zero based
		 */
		p = VARDATA_ANY(slice);
		for (i = 0; i < S1 - 1; i++)
			p += pg_mblen(p);

		/* hang onto a pointer to our start position */
		s = p;

		/*
		 * Count the actual bytes used by the substring of the requested
		 * length.
		 */
		for (i = S1; i < E1; i++)
			p += pg_mblen(p);

		/* copy the byte range [s, p) into a new text value */
		ret = (text *) palloc(VARHDRSZ + (p - s));
		SET_VARSIZE(ret, VARHDRSZ + (p - s));
		memcpy(VARDATA(ret), s, (p - s));

		if (slice != (text *) DatumGetPointer(str))
			pfree(slice);

		return ret;
	}
	else
		elog(ERROR, "invalid backend encoding: encoding max length < 1");

	/* not reached: suppress compiler warning */
	return NULL;
}
1046
1047 /*
1048 * textoverlay
1049 * Replace specified substring of first string with second
1050 *
1051 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1052 * This code is a direct implementation of what the standard says.
1053 */
1054 Datum
textoverlay(PG_FUNCTION_ARGS)1055 textoverlay(PG_FUNCTION_ARGS)
1056 {
1057 text *t1 = PG_GETARG_TEXT_PP(0);
1058 text *t2 = PG_GETARG_TEXT_PP(1);
1059 int sp = PG_GETARG_INT32(2); /* substring start position */
1060 int sl = PG_GETARG_INT32(3); /* substring length */
1061
1062 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1063 }
1064
1065 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1066 textoverlay_no_len(PG_FUNCTION_ARGS)
1067 {
1068 text *t1 = PG_GETARG_TEXT_PP(0);
1069 text *t2 = PG_GETARG_TEXT_PP(1);
1070 int sp = PG_GETARG_INT32(2); /* substring start position */
1071 int sl;
1072
1073 sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1074 PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1075 }
1076
1077 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1078 text_overlay(text *t1, text *t2, int sp, int sl)
1079 {
1080 text *result;
1081 text *s1;
1082 text *s2;
1083 int sp_pl_sl;
1084
1085 /*
1086 * Check for possible integer-overflow cases. For negative sp, throw a
1087 * "substring length" error because that's what should be expected
1088 * according to the spec's definition of OVERLAY().
1089 */
1090 if (sp <= 0)
1091 ereport(ERROR,
1092 (errcode(ERRCODE_SUBSTRING_ERROR),
1093 errmsg("negative substring length not allowed")));
1094 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1095 ereport(ERROR,
1096 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1097 errmsg("integer out of range")));
1098
1099 s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1100 s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1101 result = text_catenate(s1, t2);
1102 result = text_catenate(result, s2);
1103
1104 return result;
1105 }
1106
1107 /*
1108 * textpos -
1109 * Return the position of the specified substring.
1110 * Implements the SQL POSITION() function.
1111 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1112 * - thomas 1997-07-27
1113 */
1114 Datum
textpos(PG_FUNCTION_ARGS)1115 textpos(PG_FUNCTION_ARGS)
1116 {
1117 text *str = PG_GETARG_TEXT_PP(0);
1118 text *search_str = PG_GETARG_TEXT_PP(1);
1119
1120 PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1121 }
1122
1123 /*
1124 * text_position -
1125 * Does the real work for textpos()
1126 *
1127 * Inputs:
1128 * t1 - string to be searched
1129 * t2 - pattern to match within t1
1130 * Result:
1131 * Character index of the first matched char, starting from 1,
1132 * or 0 if no match.
1133 *
1134 * This is broken out so it can be called directly by other string processing
1135 * functions.
1136 */
1137 static int
text_position(text * t1,text * t2,Oid collid)1138 text_position(text *t1, text *t2, Oid collid)
1139 {
1140 TextPositionState state;
1141 int result;
1142
1143 /* Empty needle always matches at position 1 */
1144 if (VARSIZE_ANY_EXHDR(t2) < 1)
1145 return 1;
1146
1147 /* Otherwise, can't match if haystack is shorter than needle */
1148 if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1149 return 0;
1150
1151 text_position_setup(t1, t2, collid, &state);
1152 if (!text_position_next(&state))
1153 result = 0;
1154 else
1155 result = text_position_get_match_pos(&state);
1156 text_position_cleanup(&state);
1157 return result;
1158 }
1159
1160
1161 /*
1162 * text_position_setup, text_position_next, text_position_cleanup -
1163 * Component steps of text_position()
1164 *
1165 * These are broken out so that a string can be efficiently searched for
1166 * multiple occurrences of the same pattern. text_position_next may be
1167 * called multiple times, and it advances to the next match on each call.
1168 * text_position_get_match_ptr() and text_position_get_match_pos() return
1169 * a pointer or 1-based character position of the last match, respectively.
1170 *
1171 * The "state" variable is normally just a local variable in the caller.
1172 *
1173 * NOTE: text_position_next skips over the matched portion. For example,
1174 * searching for "xx" in "xxx" returns only one match, not two.
1175 */
1176
1177 static void
text_position_setup(text * t1,text * t2,Oid collid,TextPositionState * state)1178 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1179 {
1180 int len1 = VARSIZE_ANY_EXHDR(t1);
1181 int len2 = VARSIZE_ANY_EXHDR(t2);
1182 pg_locale_t mylocale = 0;
1183
1184 check_collation_set(collid);
1185
1186 if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1187 mylocale = pg_newlocale_from_collation(collid);
1188
1189 if (mylocale && !mylocale->deterministic)
1190 ereport(ERROR,
1191 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1192 errmsg("nondeterministic collations are not supported for substring searches")));
1193
1194 Assert(len1 > 0);
1195 Assert(len2 > 0);
1196
1197 /*
1198 * Even with a multi-byte encoding, we perform the search using the raw
1199 * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1200 * because in UTF-8 the byte sequence of one character cannot contain
1201 * another character. For other multi-byte encodings, we do the search
1202 * initially as a simple byte search, ignoring multibyte issues, but
1203 * verify afterwards that the match we found is at a character boundary,
1204 * and continue the search if it was a false match.
1205 */
1206 if (pg_database_encoding_max_length() == 1)
1207 {
1208 state->is_multibyte = false;
1209 state->is_multibyte_char_in_char = false;
1210 }
1211 else if (GetDatabaseEncoding() == PG_UTF8)
1212 {
1213 state->is_multibyte = true;
1214 state->is_multibyte_char_in_char = false;
1215 }
1216 else
1217 {
1218 state->is_multibyte = true;
1219 state->is_multibyte_char_in_char = true;
1220 }
1221
1222 state->str1 = VARDATA_ANY(t1);
1223 state->str2 = VARDATA_ANY(t2);
1224 state->len1 = len1;
1225 state->len2 = len2;
1226 state->last_match = NULL;
1227 state->refpoint = state->str1;
1228 state->refpos = 0;
1229
1230 /*
1231 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1232 * notes we use the terminology that the "haystack" is the string to be
1233 * searched (t1) and the "needle" is the pattern being sought (t2).
1234 *
1235 * If the needle is empty or bigger than the haystack then there is no
1236 * point in wasting cycles initializing the table. We also choose not to
1237 * use B-M-H for needles of length 1, since the skip table can't possibly
1238 * save anything in that case.
1239 */
1240 if (len1 >= len2 && len2 > 1)
1241 {
1242 int searchlength = len1 - len2;
1243 int skiptablemask;
1244 int last;
1245 int i;
1246 const char *str2 = state->str2;
1247
1248 /*
1249 * First we must determine how much of the skip table to use. The
1250 * declaration of TextPositionState allows up to 256 elements, but for
1251 * short search problems we don't really want to have to initialize so
1252 * many elements --- it would take too long in comparison to the
1253 * actual search time. So we choose a useful skip table size based on
1254 * the haystack length minus the needle length. The closer the needle
1255 * length is to the haystack length the less useful skipping becomes.
1256 *
1257 * Note: since we use bit-masking to select table elements, the skip
1258 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1259 */
1260 if (searchlength < 16)
1261 skiptablemask = 3;
1262 else if (searchlength < 64)
1263 skiptablemask = 7;
1264 else if (searchlength < 128)
1265 skiptablemask = 15;
1266 else if (searchlength < 512)
1267 skiptablemask = 31;
1268 else if (searchlength < 2048)
1269 skiptablemask = 63;
1270 else if (searchlength < 4096)
1271 skiptablemask = 127;
1272 else
1273 skiptablemask = 255;
1274 state->skiptablemask = skiptablemask;
1275
1276 /*
1277 * Initialize the skip table. We set all elements to the needle
1278 * length, since this is the correct skip distance for any character
1279 * not found in the needle.
1280 */
1281 for (i = 0; i <= skiptablemask; i++)
1282 state->skiptable[i] = len2;
1283
1284 /*
1285 * Now examine the needle. For each character except the last one,
1286 * set the corresponding table element to the appropriate skip
1287 * distance. Note that when two characters share the same skip table
1288 * entry, the one later in the needle must determine the skip
1289 * distance.
1290 */
1291 last = len2 - 1;
1292
1293 for (i = 0; i < last; i++)
1294 state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1295 }
1296 }
1297
1298 /*
1299 * Advance to the next match, starting from the end of the previous match
1300 * (or the beginning of the string, on first call). Returns true if a match
1301 * is found.
1302 *
1303 * Note that this refuses to match an empty-string needle. Most callers
1304 * will have handled that case specially and we'll never see it here.
1305 */
1306 static bool
text_position_next(TextPositionState * state)1307 text_position_next(TextPositionState *state)
1308 {
1309 int needle_len = state->len2;
1310 char *start_ptr;
1311 char *matchptr;
1312
1313 if (needle_len <= 0)
1314 return false; /* result for empty pattern */
1315
1316 /* Start from the point right after the previous match. */
1317 if (state->last_match)
1318 start_ptr = state->last_match + needle_len;
1319 else
1320 start_ptr = state->str1;
1321
1322 retry:
1323 matchptr = text_position_next_internal(start_ptr, state);
1324
1325 if (!matchptr)
1326 return false;
1327
1328 /*
1329 * Found a match for the byte sequence. If this is a multibyte encoding,
1330 * where one character's byte sequence can appear inside a longer
1331 * multi-byte character, we need to verify that the match was at a
1332 * character boundary, not in the middle of a multi-byte character.
1333 */
1334 if (state->is_multibyte_char_in_char)
1335 {
1336 /* Walk one character at a time, until we reach the match. */
1337
1338 /* the search should never move backwards. */
1339 Assert(state->refpoint <= matchptr);
1340
1341 while (state->refpoint < matchptr)
1342 {
1343 /* step to next character. */
1344 state->refpoint += pg_mblen(state->refpoint);
1345 state->refpos++;
1346
1347 /*
1348 * If we stepped over the match's start position, then it was a
1349 * false positive, where the byte sequence appeared in the middle
1350 * of a multi-byte character. Skip it, and continue the search at
1351 * the next character boundary.
1352 */
1353 if (state->refpoint > matchptr)
1354 {
1355 start_ptr = state->refpoint;
1356 goto retry;
1357 }
1358 }
1359 }
1360
1361 state->last_match = matchptr;
1362 return true;
1363 }
1364
1365 /*
1366 * Subroutine of text_position_next(). This searches for the raw byte
1367 * sequence, ignoring any multi-byte encoding issues. Returns the first
1368 * match starting at 'start_ptr', or NULL if no match is found.
1369 */
1370 static char *
text_position_next_internal(char * start_ptr,TextPositionState * state)1371 text_position_next_internal(char *start_ptr, TextPositionState *state)
1372 {
1373 int haystack_len = state->len1;
1374 int needle_len = state->len2;
1375 int skiptablemask = state->skiptablemask;
1376 const char *haystack = state->str1;
1377 const char *needle = state->str2;
1378 const char *haystack_end = &haystack[haystack_len];
1379 const char *hptr;
1380
1381 Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1382
1383 if (needle_len == 1)
1384 {
1385 /* No point in using B-M-H for a one-character needle */
1386 char nchar = *needle;
1387
1388 hptr = start_ptr;
1389 while (hptr < haystack_end)
1390 {
1391 if (*hptr == nchar)
1392 return (char *) hptr;
1393 hptr++;
1394 }
1395 }
1396 else
1397 {
1398 const char *needle_last = &needle[needle_len - 1];
1399
1400 /* Start at startpos plus the length of the needle */
1401 hptr = start_ptr + needle_len - 1;
1402 while (hptr < haystack_end)
1403 {
1404 /* Match the needle scanning *backward* */
1405 const char *nptr;
1406 const char *p;
1407
1408 nptr = needle_last;
1409 p = hptr;
1410 while (*nptr == *p)
1411 {
1412 /* Matched it all? If so, return 1-based position */
1413 if (nptr == needle)
1414 return (char *) p;
1415 nptr--, p--;
1416 }
1417
1418 /*
1419 * No match, so use the haystack char at hptr to decide how far to
1420 * advance. If the needle had any occurrence of that character
1421 * (or more precisely, one sharing the same skiptable entry)
1422 * before its last character, then we advance far enough to align
1423 * the last such needle character with that haystack position.
1424 * Otherwise we can advance by the whole needle length.
1425 */
1426 hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1427 }
1428 }
1429
1430 return 0; /* not found */
1431 }
1432
1433 /*
1434 * Return a pointer to the current match.
1435 *
1436 * The returned pointer points into correct position in the original
1437 * the haystack string.
1438 */
1439 static char *
text_position_get_match_ptr(TextPositionState * state)1440 text_position_get_match_ptr(TextPositionState *state)
1441 {
1442 return state->last_match;
1443 }
1444
1445 /*
1446 * Return the offset of the current match.
1447 *
1448 * The offset is in characters, 1-based.
1449 */
1450 static int
text_position_get_match_pos(TextPositionState * state)1451 text_position_get_match_pos(TextPositionState *state)
1452 {
1453 if (!state->is_multibyte)
1454 return state->last_match - state->str1 + 1;
1455 else
1456 {
1457 /* Convert the byte position to char position. */
1458 while (state->refpoint < state->last_match)
1459 {
1460 state->refpoint += pg_mblen(state->refpoint);
1461 state->refpos++;
1462 }
1463 Assert(state->refpoint == state->last_match);
1464 return state->refpos + 1;
1465 }
1466 }
1467
/*
 * text_position_cleanup -
 *	Last of the component steps of text_position(); called once the caller
 *	is done with a search state.
 *
 * Currently a no-op: the body is empty and *state is left untouched.
 */
static void
text_position_cleanup(TextPositionState *state)
{
	/* no cleanup needed */
}
1473
1474 static void
check_collation_set(Oid collid)1475 check_collation_set(Oid collid)
1476 {
1477 if (!OidIsValid(collid))
1478 {
1479 /*
1480 * This typically means that the parser could not resolve a conflict
1481 * of implicit collations, so report it that way.
1482 */
1483 ereport(ERROR,
1484 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1485 errmsg("could not determine which collation to use for string comparison"),
1486 errhint("Use the COLLATE clause to set the collation explicitly.")));
1487 }
1488 }
1489
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 * to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * Inputs:
 *	arg1, len1 - first string and its length in bytes
 *	arg2, len2 - second string and its length in bytes
 *	collid - collation to compare under; an invalid OID is rejected by
 *			 check_collation_set()
 *
 * Paths taken, in order of precedence:
 *	- C collation: memcmp(), with the shorter string sorting first when one
 *	  string is a prefix of the other.
 *	- Byte-identical inputs: reported equal without any locale-aware call.
 *	- WIN32 builds with a UTF-8 database and a libc (or default) locale:
 *	  both strings are converted to UTF-16 and compared with
 *	  wcscoll()/wcscoll_l().
 *	- Everything else: NUL-terminated copies are made, then ICU,
 *	  strcoll_l() or strcoll() is used depending on the locale.
 * When the locale-aware comparison says "equal" and the collation is the
 * default or a deterministic one, the tie is broken by a bytewise comparison.
 *
 * Note: many functions that depend on this are marked leakproof; therefore,
 * avoid reporting the actual contents of the input when throwing errors.
 * All errors herein should be things that can't happen except on corrupt
 * data, anyway; otherwise we will have trouble with indexing strings that
 * would cause them.
 */
int
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
{
	int			result;

	check_collation_set(collid);

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		/* Stack buffers used for short strings; longer ones are palloc'd */
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;
		pg_locale_t mylocale = 0;	/* stays 0 for the default collation */

		if (collid != DEFAULT_COLLATION_OID)
			mylocale = pg_newlocale_from_collation(collid);

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;	/* size in bytes of the buffer at a1p */
			int			a2len;	/* size in bytes of the buffer at a2p */
			int			r;

			/*
			 * Choose output buffers.  A palloc'd buffer is sized at two
			 * bytes per input byte plus two more for the terminator; the
			 * stack buffer is used only when the input is shorter than half
			 * of TEXTBUFLEN.
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* the conversion API does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* r is used as the index at which to terminate the output */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* cleared so that %m below reflects only the comparison call */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/* Break tie if necessary. */
			if (result == 0 &&
				(!mylocale || mylocale->deterministic))
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			/* Release whichever buffers were palloc'd */
			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/* Make NUL-terminated copies, on the stack when they fit */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				/* In UTF-8 the original inputs are handed to ICU directly */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					/* Otherwise convert both inputs to UChar strings first */
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/* Break tie if necessary. */
		if (result == 0 &&
			(!mylocale || mylocale->deterministic))
			result = strcmp(a1p, a2p);

		/* Release whichever copies were palloc'd */
		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1714
1715 /* text_cmp()
1716 * Internal comparison function for text strings.
1717 * Returns -1, 0 or 1
1718 */
1719 static int
text_cmp(text * arg1,text * arg2,Oid collid)1720 text_cmp(text *arg1, text *arg2, Oid collid)
1721 {
1722 char *a1p,
1723 *a2p;
1724 int len1,
1725 len2;
1726
1727 a1p = VARDATA_ANY(arg1);
1728 a2p = VARDATA_ANY(arg2);
1729
1730 len1 = VARSIZE_ANY_EXHDR(arg1);
1731 len2 = VARSIZE_ANY_EXHDR(arg2);
1732
1733 return varstr_cmp(a1p, len1, a2p, len2, collid);
1734 }
1735
1736 /*
1737 * Comparison functions for text strings.
1738 *
1739 * Note: btree indexes need these routines not to leak memory; therefore,
1740 * be careful to free working copies of toasted datums. Most places don't
1741 * need to be so careful.
1742 */
1743
1744 Datum
texteq(PG_FUNCTION_ARGS)1745 texteq(PG_FUNCTION_ARGS)
1746 {
1747 Oid collid = PG_GET_COLLATION();
1748 bool result;
1749
1750 check_collation_set(collid);
1751
1752 if (lc_collate_is_c(collid) ||
1753 collid == DEFAULT_COLLATION_OID ||
1754 pg_newlocale_from_collation(collid)->deterministic)
1755 {
1756 Datum arg1 = PG_GETARG_DATUM(0);
1757 Datum arg2 = PG_GETARG_DATUM(1);
1758 Size len1,
1759 len2;
1760
1761 /*
1762 * Since we only care about equality or not-equality, we can avoid all
1763 * the expense of strcoll() here, and just do bitwise comparison. In
1764 * fact, we don't even have to do a bitwise comparison if we can show
1765 * the lengths of the strings are unequal; which might save us from
1766 * having to detoast one or both values.
1767 */
1768 len1 = toast_raw_datum_size(arg1);
1769 len2 = toast_raw_datum_size(arg2);
1770 if (len1 != len2)
1771 result = false;
1772 else
1773 {
1774 text *targ1 = DatumGetTextPP(arg1);
1775 text *targ2 = DatumGetTextPP(arg2);
1776
1777 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1778 len1 - VARHDRSZ) == 0);
1779
1780 PG_FREE_IF_COPY(targ1, 0);
1781 PG_FREE_IF_COPY(targ2, 1);
1782 }
1783 }
1784 else
1785 {
1786 text *arg1 = PG_GETARG_TEXT_PP(0);
1787 text *arg2 = PG_GETARG_TEXT_PP(1);
1788
1789 result = (text_cmp(arg1, arg2, collid) == 0);
1790
1791 PG_FREE_IF_COPY(arg1, 0);
1792 PG_FREE_IF_COPY(arg2, 1);
1793 }
1794
1795 PG_RETURN_BOOL(result);
1796 }
1797
1798 Datum
textne(PG_FUNCTION_ARGS)1799 textne(PG_FUNCTION_ARGS)
1800 {
1801 Oid collid = PG_GET_COLLATION();
1802 bool result;
1803
1804 check_collation_set(collid);
1805
1806 if (lc_collate_is_c(collid) ||
1807 collid == DEFAULT_COLLATION_OID ||
1808 pg_newlocale_from_collation(collid)->deterministic)
1809 {
1810 Datum arg1 = PG_GETARG_DATUM(0);
1811 Datum arg2 = PG_GETARG_DATUM(1);
1812 Size len1,
1813 len2;
1814
1815 /* See comment in texteq() */
1816 len1 = toast_raw_datum_size(arg1);
1817 len2 = toast_raw_datum_size(arg2);
1818 if (len1 != len2)
1819 result = true;
1820 else
1821 {
1822 text *targ1 = DatumGetTextPP(arg1);
1823 text *targ2 = DatumGetTextPP(arg2);
1824
1825 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1826 len1 - VARHDRSZ) != 0);
1827
1828 PG_FREE_IF_COPY(targ1, 0);
1829 PG_FREE_IF_COPY(targ2, 1);
1830 }
1831 }
1832 else
1833 {
1834 text *arg1 = PG_GETARG_TEXT_PP(0);
1835 text *arg2 = PG_GETARG_TEXT_PP(1);
1836
1837 result = (text_cmp(arg1, arg2, collid) != 0);
1838
1839 PG_FREE_IF_COPY(arg1, 0);
1840 PG_FREE_IF_COPY(arg2, 1);
1841 }
1842
1843 PG_RETURN_BOOL(result);
1844 }
1845
1846 Datum
text_lt(PG_FUNCTION_ARGS)1847 text_lt(PG_FUNCTION_ARGS)
1848 {
1849 text *arg1 = PG_GETARG_TEXT_PP(0);
1850 text *arg2 = PG_GETARG_TEXT_PP(1);
1851 bool result;
1852
1853 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1854
1855 PG_FREE_IF_COPY(arg1, 0);
1856 PG_FREE_IF_COPY(arg2, 1);
1857
1858 PG_RETURN_BOOL(result);
1859 }
1860
1861 Datum
text_le(PG_FUNCTION_ARGS)1862 text_le(PG_FUNCTION_ARGS)
1863 {
1864 text *arg1 = PG_GETARG_TEXT_PP(0);
1865 text *arg2 = PG_GETARG_TEXT_PP(1);
1866 bool result;
1867
1868 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1869
1870 PG_FREE_IF_COPY(arg1, 0);
1871 PG_FREE_IF_COPY(arg2, 1);
1872
1873 PG_RETURN_BOOL(result);
1874 }
1875
1876 Datum
text_gt(PG_FUNCTION_ARGS)1877 text_gt(PG_FUNCTION_ARGS)
1878 {
1879 text *arg1 = PG_GETARG_TEXT_PP(0);
1880 text *arg2 = PG_GETARG_TEXT_PP(1);
1881 bool result;
1882
1883 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1884
1885 PG_FREE_IF_COPY(arg1, 0);
1886 PG_FREE_IF_COPY(arg2, 1);
1887
1888 PG_RETURN_BOOL(result);
1889 }
1890
1891 Datum
text_ge(PG_FUNCTION_ARGS)1892 text_ge(PG_FUNCTION_ARGS)
1893 {
1894 text *arg1 = PG_GETARG_TEXT_PP(0);
1895 text *arg2 = PG_GETARG_TEXT_PP(1);
1896 bool result;
1897
1898 result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1899
1900 PG_FREE_IF_COPY(arg1, 0);
1901 PG_FREE_IF_COPY(arg2, 1);
1902
1903 PG_RETURN_BOOL(result);
1904 }
1905
1906 Datum
text_starts_with(PG_FUNCTION_ARGS)1907 text_starts_with(PG_FUNCTION_ARGS)
1908 {
1909 Datum arg1 = PG_GETARG_DATUM(0);
1910 Datum arg2 = PG_GETARG_DATUM(1);
1911 Oid collid = PG_GET_COLLATION();
1912 pg_locale_t mylocale = 0;
1913 bool result;
1914 Size len1,
1915 len2;
1916
1917 check_collation_set(collid);
1918
1919 if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1920 mylocale = pg_newlocale_from_collation(collid);
1921
1922 if (mylocale && !mylocale->deterministic)
1923 ereport(ERROR,
1924 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1925 errmsg("nondeterministic collations are not supported for substring searches")));
1926
1927 len1 = toast_raw_datum_size(arg1);
1928 len2 = toast_raw_datum_size(arg2);
1929 if (len2 > len1)
1930 result = false;
1931 else
1932 {
1933 text *targ1 = text_substring(arg1, 1, len2, false);
1934 text *targ2 = DatumGetTextPP(arg2);
1935
1936 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1937 VARSIZE_ANY_EXHDR(targ2)) == 0);
1938
1939 PG_FREE_IF_COPY(targ1, 0);
1940 PG_FREE_IF_COPY(targ2, 1);
1941 }
1942
1943 PG_RETURN_BOOL(result);
1944 }
1945
1946 Datum
bttextcmp(PG_FUNCTION_ARGS)1947 bttextcmp(PG_FUNCTION_ARGS)
1948 {
1949 text *arg1 = PG_GETARG_TEXT_PP(0);
1950 text *arg2 = PG_GETARG_TEXT_PP(1);
1951 int32 result;
1952
1953 result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1954
1955 PG_FREE_IF_COPY(arg1, 0);
1956 PG_FREE_IF_COPY(arg2, 1);
1957
1958 PG_RETURN_INT32(result);
1959 }
1960
1961 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1962 bttextsortsupport(PG_FUNCTION_ARGS)
1963 {
1964 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1965 Oid collid = ssup->ssup_collation;
1966 MemoryContext oldcontext;
1967
1968 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1969
1970 /* Use generic string SortSupport */
1971 varstr_sortsupport(ssup, TEXTOID, collid);
1972
1973 MemoryContextSwitchTo(oldcontext);
1974
1975 PG_RETURN_VOID();
1976 }
1977
/*
 * Generic sortsupport interface for character type's operator classes.
 * Includes locale support, and support for BpChar semantics (i.e. removing
 * trailing spaces before comparison).
 *
 * Inputs:
 *	ssup - SortSupport struct to fill in (comparator, and optionally the
 *		   abbreviated-key callbacks and ssup_extra)
 *	typid - type being sorted; BPCHAROID and NAMEOID select specialized
 *			comparators, anything else gets the generic varlena ones
 *	collid - collation to sort under; an invalid OID is rejected by
 *			 check_collation_set()
 *
 * Memory for the VarStringSortSupport state is palloc'd in whatever memory
 * context is current when this is called.
 *
 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
 * same representation.  Callers that always use the C collation (e.g.
 * non-collatable type callers like bytea) may have NUL bytes in their strings;
 * this will not work with any other collation, though.
 */
void
varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
{
	bool		abbreviate = ssup->abbreviate;	/* caller's request; may be
												 * overridden below */
	bool		collate_c = false;
	VarStringSortSupport *sss;
	pg_locale_t locale = 0;

	check_collation_set(collid);

	/*
	 * If possible, set ssup->comparator to a function which can be used to
	 * directly compare two datums.  If we can do this, we'll avoid the
	 * overhead of a trip through the fmgr layer for every comparison, which
	 * can be substantial.
	 *
	 * Most typically, we'll set the comparator to varlenafastcmp_locale,
	 * which uses strcoll() to perform comparisons.  We use that for the
	 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
	 * LC_COLLATE = C, we can make things quite a bit faster with
	 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
	 * memcmp() rather than strcoll().
	 */
	if (lc_collate_is_c(collid))
	{
		if (typid == BPCHAROID)
			ssup->comparator = bpcharfastcmp_c;
		else if (typid == NAMEOID)
		{
			ssup->comparator = namefastcmp_c;
			/* Not supporting abbreviation with type NAME, for now */
			abbreviate = false;
		}
		else
			ssup->comparator = varstrfastcmp_c;

		collate_c = true;
	}
	else
	{
		/*
		 * We need a collation-sensitive comparison.  To make things faster,
		 * we'll figure out the collation based on the locale id and cache the
		 * result.  (For the default collation, locale is left as 0.)
		 */
		if (collid != DEFAULT_COLLATION_OID)
			locale = pg_newlocale_from_collation(collid);

		/*
		 * There is a further exception on Windows.  When the database
		 * encoding is UTF-8 and we are not using the C collation, complex
		 * hacks are required.  We don't currently have a comparator that
		 * handles that case, so we fall back on the slow method of having the
		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
		 * trampoline.  ICU locales work just the same on Windows, however.
		 *
		 * Note that this returns before ssup->comparator or ssup_extra has
		 * been assigned.
		 */
#ifdef WIN32
		if (GetDatabaseEncoding() == PG_UTF8 &&
			!(locale && locale->provider == COLLPROVIDER_ICU))
			return;
#endif

		/*
		 * We use varlenafastcmp_locale except for type NAME.
		 */
		if (typid == NAMEOID)
		{
			ssup->comparator = namefastcmp_locale;
			/* Not supporting abbreviation with type NAME, for now */
			abbreviate = false;
		}
		else
			ssup->comparator = varlenafastcmp_locale;
	}

	/*
	 * Unfortunately, it seems that abbreviation for non-C collations is
	 * broken on many common platforms; testing of multiple versions of glibc
	 * reveals that, for many locales, strcoll() and strxfrm() do not return
	 * consistent results, which is fatal to this optimization.  While no
	 * libc other than glibc and Cygwin's has so far been shown to have a
	 * problem, we take the conservative course of action for right now and
	 * disable this categorically.  (Users who are certain this isn't a
	 * problem on their system can define TRUST_STRXFRM.)
	 *
	 * Even apart from the risk of broken locales, it's possible that there
	 * are platforms where the use of abbreviated keys should be disabled at
	 * compile time.  Having only 4 byte datums could make worst-case
	 * performance drastically more likely, for example.  Moreover, macOS's
	 * strxfrm() implementation is known to not effectively concentrate a
	 * significant amount of entropy from the original string in earlier
	 * transformed blobs.  It's possible that other supported platforms are
	 * similarly encumbered.  So, if we ever get past disabling this
	 * categorically, we may still want or need to disable it for particular
	 * platforms.
	 *
	 * The test below leaves abbreviation enabled for the C collation and for
	 * ICU-provided locales.
	 */
#ifndef TRUST_STRXFRM
	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
		abbreviate = false;
#endif

	/*
	 * If we're using abbreviated keys, or if we're using a locale-aware
	 * comparison, we need to initialize a VarStringSortSupport object.  Both
	 * cases will make use of the temporary buffers we initialize here for
	 * scratch space (and to detect requirement for BpChar semantics from
	 * caller), and the abbreviation case requires additional state.
	 */
	if (abbreviate || !collate_c)
	{
		sss = palloc(sizeof(VarStringSortSupport));
		/* Two scratch buffers, each starting out at TEXTBUFLEN bytes */
		sss->buf1 = palloc(TEXTBUFLEN);
		sss->buflen1 = TEXTBUFLEN;
		sss->buf2 = palloc(TEXTBUFLEN);
		sss->buflen2 = TEXTBUFLEN;
		/* Start with invalid values */
		sss->last_len1 = -1;
		sss->last_len2 = -1;
		/* Initialize */
		sss->last_returned = 0;
		sss->locale = locale;

		/*
		 * To avoid somehow confusing a strxfrm() blob and an original string,
		 * constantly keep track of the variety of data that buf1 and buf2
		 * currently contain.
		 *
		 * Comparisons may be interleaved with conversion calls.  Frequently,
		 * conversions and comparisons are batched into two distinct phases,
		 * but the correctness of caching cannot hinge upon this.  For
		 * comparison caching, buffer state is only trusted if cache_blob is
		 * found set to false, whereas strxfrm() caching only trusts the state
		 * when cache_blob is found set to true.
		 *
		 * Arbitrarily initialize cache_blob to true.
		 */
		sss->cache_blob = true;
		sss->collate_c = collate_c;
		sss->typid = typid;
		ssup->ssup_extra = sss;

		/*
		 * If possible, plan to use the abbreviated keys optimization.  The
		 * core code may switch back to authoritative comparator should
		 * abbreviation be aborted.
		 *
		 * The comparator chosen above is kept as abbrev_full_comparator, and
		 * varstrcmp_abbrev takes its place as the primary comparator.
		 */
		if (abbreviate)
		{
			sss->prop_card = 0.20;
			initHyperLogLog(&sss->abbr_card, 10);
			initHyperLogLog(&sss->full_card, 10);
			ssup->abbrev_full_comparator = ssup->comparator;
			ssup->comparator = varstrcmp_abbrev;
			ssup->abbrev_converter = varstr_abbrev_convert;
			ssup->abbrev_abort = varstr_abbrev_abort;
		}
	}
}
2146
2147 /*
2148 * sortsupport comparison func (for C locale case)
2149 */
2150 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)2151 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2152 {
2153 VarString *arg1 = DatumGetVarStringPP(x);
2154 VarString *arg2 = DatumGetVarStringPP(y);
2155 char *a1p,
2156 *a2p;
2157 int len1,
2158 len2,
2159 result;
2160
2161 a1p = VARDATA_ANY(arg1);
2162 a2p = VARDATA_ANY(arg2);
2163
2164 len1 = VARSIZE_ANY_EXHDR(arg1);
2165 len2 = VARSIZE_ANY_EXHDR(arg2);
2166
2167 result = memcmp(a1p, a2p, Min(len1, len2));
2168 if ((result == 0) && (len1 != len2))
2169 result = (len1 < len2) ? -1 : 1;
2170
2171 /* We can't afford to leak memory here. */
2172 if (PointerGetDatum(arg1) != x)
2173 pfree(arg1);
2174 if (PointerGetDatum(arg2) != y)
2175 pfree(arg2);
2176
2177 return result;
2178 }
2179
2180 /*
2181 * sortsupport comparison func (for BpChar C locale case)
2182 *
2183 * BpChar outsources its sortsupport to this module. Specialization for the
2184 * varstr_sortsupport BpChar case, modeled on
2185 * internal_bpchar_pattern_compare().
2186 */
2187 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)2188 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2189 {
2190 BpChar *arg1 = DatumGetBpCharPP(x);
2191 BpChar *arg2 = DatumGetBpCharPP(y);
2192 char *a1p,
2193 *a2p;
2194 int len1,
2195 len2,
2196 result;
2197
2198 a1p = VARDATA_ANY(arg1);
2199 a2p = VARDATA_ANY(arg2);
2200
2201 len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2202 len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2203
2204 result = memcmp(a1p, a2p, Min(len1, len2));
2205 if ((result == 0) && (len1 != len2))
2206 result = (len1 < len2) ? -1 : 1;
2207
2208 /* We can't afford to leak memory here. */
2209 if (PointerGetDatum(arg1) != x)
2210 pfree(arg1);
2211 if (PointerGetDatum(arg2) != y)
2212 pfree(arg2);
2213
2214 return result;
2215 }
2216
2217 /*
2218 * sortsupport comparison func (for NAME C locale case)
2219 */
2220 static int
namefastcmp_c(Datum x,Datum y,SortSupport ssup)2221 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2222 {
2223 Name arg1 = DatumGetName(x);
2224 Name arg2 = DatumGetName(y);
2225
2226 return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2227 }
2228
2229 /*
2230 * sortsupport comparison func (for locale case with all varlena types)
2231 */
2232 static int
varlenafastcmp_locale(Datum x,Datum y,SortSupport ssup)2233 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2234 {
2235 VarString *arg1 = DatumGetVarStringPP(x);
2236 VarString *arg2 = DatumGetVarStringPP(y);
2237 char *a1p,
2238 *a2p;
2239 int len1,
2240 len2,
2241 result;
2242
2243 a1p = VARDATA_ANY(arg1);
2244 a2p = VARDATA_ANY(arg2);
2245
2246 len1 = VARSIZE_ANY_EXHDR(arg1);
2247 len2 = VARSIZE_ANY_EXHDR(arg2);
2248
2249 result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2250
2251 /* We can't afford to leak memory here. */
2252 if (PointerGetDatum(arg1) != x)
2253 pfree(arg1);
2254 if (PointerGetDatum(arg2) != y)
2255 pfree(arg2);
2256
2257 return result;
2258 }
2259
2260 /*
2261 * sortsupport comparison func (for locale case with NAME type)
2262 */
2263 static int
namefastcmp_locale(Datum x,Datum y,SortSupport ssup)2264 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2265 {
2266 Name arg1 = DatumGetName(x);
2267 Name arg2 = DatumGetName(y);
2268
2269 return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2270 NameStr(*arg2), strlen(NameStr(*arg2)),
2271 ssup);
2272 }
2273
/*
 * sortsupport comparison func for locale cases
 *
 * Shared worker behind varlenafastcmp_locale() and namefastcmp_locale().
 *
 * a1p/len1 and a2p/len2 give each input as a pointer plus a length in bytes.
 * ssup->ssup_extra must point to the VarStringSortSupport state, whose
 * buf1/buf2 scratch buffers hold NUL-terminated copies of the most recently
 * compared inputs and double as a one-entry comparison cache.
 *
 * Returns 0 at once when the inputs are byte-for-byte identical.  Otherwise
 * returns the result of the collation-aware comparison (ICU, strcoll_l() or
 * strcoll(), depending on sss->locale), with a strcmp() tie-break applied
 * when there is no locale object or the locale is deterministic.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	int			result;
	bool		arg1_match;

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		return 0;
	}

	if (sss->typid == BPCHAROID)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each scratch buffer can hold its input plus a terminating
	 * NUL.  A buffer that is too small is replaced by one at least twice as
	 * large (capped at MaxAllocSize, but never smaller than len + 1),
	 * allocated in ssup->ssup_cxt.
	 */
	if (len1 >= sss->buflen1)
	{
		pfree(sss->buf1);
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		pfree(sss->buf2);
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		/* buf1 does not hold this input: refresh it and NUL-terminate */
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		/* buf2 does not hold this input: refresh it and NUL-terminate */
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/* Use result cached following last actual strcoll() call */
		return sss->last_returned;
	}

	if (sss->locale)
	{
		if (sss->locale->provider == COLLPROVIDER_ICU)
		{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
			if (GetDatabaseEncoding() == PG_UTF8)
			{
				UErrorCode	status;

				/* UTF8 database: ICU can compare the input bytes directly */
				status = U_ZERO_ERROR;
				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
										  a1p, len1,
										  a2p, len2,
										  &status);
				if (U_FAILURE(status))
					ereport(ERROR,
							(errmsg("collation failed: %s", u_errorName(status))));
			}
			else
#endif
			{
				int32_t		ulen1,
							ulen2;
				UChar	   *uchar1,
						   *uchar2;

				/* Otherwise convert both inputs to UChar strings first */
				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
				ulen2 = icu_to_uchar(&uchar2, a2p, len2);

				result = ucol_strcoll(sss->locale->info.icu.ucol,
									  uchar1, ulen1,
									  uchar2, ulen2);

				pfree(uchar1);
				pfree(uchar2);
			}
#else							/* not USE_ICU */
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif							/* not USE_ICU */
		}
		else
		{
#ifdef HAVE_LOCALE_T
			/* Compare the NUL-terminated copies under the locale_t */
			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
		}
	}
	else
		result = strcoll(sss->buf1, sss->buf2);

	/* Break tie if necessary. */
	if (result == 0 &&
		(!sss->locale || sss->locale->deterministic))
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
	return result;
}
2425
2426 /*
2427 * Abbreviated key comparison func
2428 */
2429 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2430 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2431 {
2432 /*
2433 * When 0 is returned, the core system will call varstrfastcmp_c()
2434 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a
2435 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2436 * authoritatively, for the same reason that there is a strcoll()
2437 * tie-breaker call to strcmp() in varstr_cmp().
2438 */
2439 if (x > y)
2440 return 1;
2441 else if (x == y)
2442 return 0;
2443 else
2444 return -1;
2445 }
2446
/*
 * Conversion routine for sortsupport.  Converts original to abbreviated key
 * representation.  Our encoding strategy is simple -- pack the first 8 bytes
 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
 * stored in reverse order), and treat it as an unsigned integer.  When the "C"
 * locale is used, or in case of bytea, just memcpy() from original instead.
 *
 * original: the authoritative datum to abbreviate.
 * ssup: its ssup_extra must point to the VarStringSortSupport state.  This
 * function reuses buf1/buf2 as scratch space (buf1 holds a NUL-terminated
 * copy of the input, buf2 the transformed blob) and feeds the HyperLogLog
 * counters full_card and abbr_card that varstr_abbrev_abort() reads.
 *
 * Returns the abbreviated key: up to sizeof(Datum) leading bytes of the
 * original (C collation) or of its transformed blob, zero-padded, and passed
 * through DatumBigEndianToNative().
 */
static Datum
varstr_abbrev_convert(Datum original, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	VarString  *authoritative = DatumGetVarStringPP(original);
	char	   *authoritative_data = VARDATA_ANY(authoritative);

	/* working state */
	Datum		res;
	char	   *pres;
	int			len;
	uint32		hash;

	pres = (char *) &res;
	/* memset(), so any non-overwritten bytes are NUL */
	memset(pres, 0, sizeof(Datum));
	len = VARSIZE_ANY_EXHDR(authoritative);

	/* Get number of bytes, ignoring trailing spaces */
	if (sss->typid == BPCHAROID)
		len = bpchartruelen(authoritative_data, len);

	/*
	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
	 * abbreviate keys.  The full comparator for the C locale is always
	 * memcmp().  It would be incorrect to allow bytea callers (callers that
	 * always force the C collation -- bytea isn't a collatable type, but this
	 * approach is convenient) to use strxfrm().  This is because bytea
	 * strings may contain NUL bytes.  Besides, this should be faster, too.
	 *
	 * More generally, it's okay that bytea callers can have NUL bytes in
	 * strings because varstrcmp_abbrev() need not make a distinction between
	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
	 * authoritative representation.  Hopefully a comparison at or past one
	 * abbreviated key's terminating NUL byte will resolve the comparison
	 * without consulting the authoritative representation; specifically, some
	 * later non-NUL byte in the longer string can resolve the comparison
	 * against a subsequent terminating NUL in the shorter string.  There will
	 * usually be what is effectively a "length-wise" resolution there and
	 * then.
	 *
	 * If that doesn't work out -- if all bytes in the longer string
	 * positioned at or past the offset of the smaller string's (first)
	 * terminating NUL are actually representative of NUL bytes in the
	 * authoritative binary string (perhaps with some *terminating* NUL bytes
	 * towards the end of the longer string iff it happens to still be small)
	 * -- then an authoritative tie-breaker will happen, and do the right
	 * thing: explicitly consider string length.
	 */
	if (sss->collate_c)
		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
	else
	{
		Size		bsize;
#ifdef USE_ICU
		int32_t		ulen = -1;
		UChar	   *uchar = NULL;
#endif

		/*
		 * We're not using the C collation, so fall back on strxfrm or ICU
		 * analogs.
		 */

		/* By convention, we use buffer 1 to store and NUL-terminate */
		if (len >= sss->buflen1)
		{
			pfree(sss->buf1);
			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
			sss->buf1 = palloc(sss->buflen1);
		}

		/* Might be able to reuse strxfrm() blob from last call */
		if (sss->last_len1 == len && sss->cache_blob &&
			memcmp(sss->buf1, authoritative_data, len) == 0)
		{
			/* buf2 still holds the blob for this input; last_len2 is its size */
			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
			/* No change affecting cardinality, so no hashing required */
			goto done;
		}

		memcpy(sss->buf1, authoritative_data, len);

		/*
		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
		 * necessary for ICU, but doesn't hurt.
		 */
		sss->buf1[len] = '\0';
		sss->last_len1 = len;

#ifdef USE_ICU
		/* When using ICU and not UTF8, convert string to UChar. */
		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
			GetDatabaseEncoding() != PG_UTF8)
			ulen = icu_to_uchar(&uchar, sss->buf1, len);
#endif

		/*
		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
		 * and try again.  Both of these functions have the result buffer
		 * content undefined if the result did not fit, so we need to retry
		 * until everything fits, even though we only need the first few bytes
		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
		 * ask for as many bytes as we actually need.
		 */
		for (;;)
		{
#ifdef USE_ICU
			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
			{
				/*
				 * When using UTF8, use the iteration interface so we only
				 * need to produce as many bytes as we actually need.
				 */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UCharIterator iter;
					uint32_t	state[2];
					UErrorCode	status;

					uiter_setUTF8(&iter, sss->buf1, len);
					state[0] = state[1] = 0;	/* won't need that again */
					status = U_ZERO_ERROR;
					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
												 &iter,
												 state,
												 (uint8_t *) sss->buf2,
												 Min(sizeof(Datum), sss->buflen2),
												 &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("sort key generation failed: %s",
										u_errorName(status))));
				}
				else
					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
											uchar, ulen,
											(uint8_t *) sss->buf2, sss->buflen2);
			}
			else
#endif
#ifdef HAVE_LOCALE_T
			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
				bsize = strxfrm_l(sss->buf2, sss->buf1,
								  sss->buflen2, sss->locale->info.lt);
			else
#endif
				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);

			/* Record the blob size; a result that fits ends the loop */
			sss->last_len2 = bsize;
			if (bsize < sss->buflen2)
				break;

			/*
			 * Grow buffer and retry.
			 */
			pfree(sss->buf2);
			sss->buflen2 = Max(bsize + 1,
							   Min(sss->buflen2 * 2, MaxAllocSize));
			sss->buf2 = palloc(sss->buflen2);
		}

		/*
		 * Every Datum byte is always compared.  This is safe because the
		 * strxfrm() blob is itself NUL terminated, leaving no danger of
		 * misinterpreting any NUL bytes not intended to be interpreted as
		 * logically representing termination.
		 *
		 * (Actually, even if there were NUL bytes in the blob it would be
		 * okay.  See remarks on bytea case above.)
		 */
		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));

#ifdef USE_ICU
		/* Release the UChar conversion, if one was made above */
		if (uchar)
			pfree(uchar);
#endif
	}

	/*
	 * Maintain approximate cardinality of both abbreviated keys and original,
	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
	 * the worst case, where we do many string transformations for no saving
	 * in full strcoll()-based comparisons.  These statistics are used by
	 * varstr_abbrev_abort().
	 *
	 * First, Hash key proper, or a significant fraction of it.  Mix in length
	 * in order to compensate for cases where differences are past
	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
	 */
	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
								   Min(len, PG_CACHE_LINE_SIZE)));

	if (len > PG_CACHE_LINE_SIZE)
		hash ^= DatumGetUInt32(hash_uint32((uint32) len));

	addHyperLogLog(&sss->full_card, hash);

	/* Hash abbreviated key */
#if SIZEOF_DATUM == 8
	{
		uint32		lohalf,
					hihalf;

		/* Fold the two 32-bit halves of the key together before hashing */
		lohalf = (uint32) res;
		hihalf = (uint32) (res >> 32);
		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
	}
#else							/* SIZEOF_DATUM != 8 */
	hash = DatumGetUInt32(hash_uint32((uint32) res));
#endif

	addHyperLogLog(&sss->abbr_card, hash);

	/* Cache result, perhaps saving an expensive strxfrm() call next time */
	sss->cache_blob = true;
done:

	/*
	 * Byteswap on little-endian machines.
	 *
	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
	 * comparator) works correctly on all platforms.  If we didn't do this,
	 * the comparator would have to call memcmp() with a pair of pointers to
	 * the first byte of each abbreviated key, which is slower.
	 */
	res = DatumBigEndianToNative(res);

	/* Don't leak memory here */
	if (PointerGetDatum(authoritative) != original)
		pfree(authoritative);

	return res;
}
2688
2689 /*
2690 * Callback for estimating effectiveness of abbreviated key optimization, using
2691 * heuristic rules. Returns value indicating if the abbreviation optimization
2692 * should be aborted, based on its projected effectiveness.
2693 */
2694 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2695 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2696 {
2697 VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2698 double abbrev_distinct,
2699 key_distinct;
2700
2701 Assert(ssup->abbreviate);
2702
2703 /* Have a little patience */
2704 if (memtupcount < 100)
2705 return false;
2706
2707 abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2708 key_distinct = estimateHyperLogLog(&sss->full_card);
2709
2710 /*
2711 * Clamp cardinality estimates to at least one distinct value. While
2712 * NULLs are generally disregarded, if only NULL values were seen so far,
2713 * that might misrepresent costs if we failed to clamp.
2714 */
2715 if (abbrev_distinct <= 1.0)
2716 abbrev_distinct = 1.0;
2717
2718 if (key_distinct <= 1.0)
2719 key_distinct = 1.0;
2720
2721 /*
2722 * In the worst case all abbreviated keys are identical, while at the same
2723 * time there are differences within full key strings not captured in
2724 * abbreviations.
2725 */
2726 #ifdef TRACE_SORT
2727 if (trace_sort)
2728 {
2729 double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2730
2731 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2732 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2733 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2734 sss->prop_card);
2735 }
2736 #endif
2737
2738 /*
2739 * If the number of distinct abbreviated keys approximately matches the
2740 * number of distinct authoritative original keys, that's reason enough to
2741 * proceed. We can win even with a very low cardinality set if most
2742 * tie-breakers only memcmp(). This is by far the most important
2743 * consideration.
2744 *
2745 * While comparisons that are resolved at the abbreviated key level are
2746 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2747 * those two outcomes are so much cheaper than a full strcoll() once
2748 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2749 * cardinality against the overall size of the set in order to more
2750 * accurately model costs. Assume that an abbreviated comparison, and an
2751 * abbreviated comparison with a cheap memcmp()-based authoritative
2752 * resolution are equivalent.
2753 */
2754 if (abbrev_distinct > key_distinct * sss->prop_card)
2755 {
2756 /*
2757 * When we have exceeded 10,000 tuples, decay required cardinality
2758 * aggressively for next call.
2759 *
2760 * This is useful because the number of comparisons required on
2761 * average increases at a linearithmic rate, and at roughly 10,000
2762 * tuples that factor will start to dominate over the linear costs of
2763 * string transformation (this is a conservative estimate). The decay
2764 * rate is chosen to be a little less aggressive than halving -- which
2765 * (since we're called at points at which memtupcount has doubled)
2766 * would never see the cost model actually abort past the first call
2767 * following a decay. This decay rate is mostly a precaution against
2768 * a sudden, violent swing in how well abbreviated cardinality tracks
2769 * full key cardinality. The decay also serves to prevent a marginal
2770 * case from being aborted too late, when too much has already been
2771 * invested in string transformation.
2772 *
2773 * It's possible for sets of several million distinct strings with
2774 * mere tens of thousands of distinct abbreviated keys to still
2775 * benefit very significantly. This will generally occur provided
2776 * each abbreviated key is a proxy for a roughly uniform number of the
2777 * set's full keys. If it isn't so, we hope to catch that early and
2778 * abort. If it isn't caught early, by the time the problem is
2779 * apparent it's probably not worth aborting.
2780 */
2781 if (memtupcount > 10000)
2782 sss->prop_card *= 0.65;
2783
2784 return false;
2785 }
2786
2787 /*
2788 * Abort abbreviation strategy.
2789 *
2790 * The worst case, where all abbreviated keys are identical while all
2791 * original strings differ will typically only see a regression of about
2792 * 10% in execution time for small to medium sized lists of strings.
2793 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2794 * often expect very large improvements, particularly with sets of strings
2795 * of moderately high to high abbreviated cardinality. There is little to
2796 * lose but much to gain, which our strategy reflects.
2797 */
2798 #ifdef TRACE_SORT
2799 if (trace_sort)
2800 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2801 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2802 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2803 #endif
2804
2805 return true;
2806 }
2807
2808 /*
2809 * Generic equalimage support function for character type's operator classes.
2810 * Disables the use of deduplication with nondeterministic collations.
2811 */
2812 Datum
btvarstrequalimage(PG_FUNCTION_ARGS)2813 btvarstrequalimage(PG_FUNCTION_ARGS)
2814 {
2815 /* Oid opcintype = PG_GETARG_OID(0); */
2816 Oid collid = PG_GET_COLLATION();
2817
2818 check_collation_set(collid);
2819
2820 if (lc_collate_is_c(collid) ||
2821 collid == DEFAULT_COLLATION_OID ||
2822 get_collation_isdeterministic(collid))
2823 PG_RETURN_BOOL(true);
2824 else
2825 PG_RETURN_BOOL(false);
2826 }
2827
2828 Datum
text_larger(PG_FUNCTION_ARGS)2829 text_larger(PG_FUNCTION_ARGS)
2830 {
2831 text *arg1 = PG_GETARG_TEXT_PP(0);
2832 text *arg2 = PG_GETARG_TEXT_PP(1);
2833 text *result;
2834
2835 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2836
2837 PG_RETURN_TEXT_P(result);
2838 }
2839
2840 Datum
text_smaller(PG_FUNCTION_ARGS)2841 text_smaller(PG_FUNCTION_ARGS)
2842 {
2843 text *arg1 = PG_GETARG_TEXT_PP(0);
2844 text *arg2 = PG_GETARG_TEXT_PP(1);
2845 text *result;
2846
2847 result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2848
2849 PG_RETURN_TEXT_P(result);
2850 }
2851
2852
2853 /*
2854 * Cross-type comparison functions for types text and name.
2855 */
2856
2857 Datum
nameeqtext(PG_FUNCTION_ARGS)2858 nameeqtext(PG_FUNCTION_ARGS)
2859 {
2860 Name arg1 = PG_GETARG_NAME(0);
2861 text *arg2 = PG_GETARG_TEXT_PP(1);
2862 size_t len1 = strlen(NameStr(*arg1));
2863 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2864 Oid collid = PG_GET_COLLATION();
2865 bool result;
2866
2867 check_collation_set(collid);
2868
2869 if (collid == C_COLLATION_OID)
2870 result = (len1 == len2 &&
2871 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2872 else
2873 result = (varstr_cmp(NameStr(*arg1), len1,
2874 VARDATA_ANY(arg2), len2,
2875 collid) == 0);
2876
2877 PG_FREE_IF_COPY(arg2, 1);
2878
2879 PG_RETURN_BOOL(result);
2880 }
2881
2882 Datum
texteqname(PG_FUNCTION_ARGS)2883 texteqname(PG_FUNCTION_ARGS)
2884 {
2885 text *arg1 = PG_GETARG_TEXT_PP(0);
2886 Name arg2 = PG_GETARG_NAME(1);
2887 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2888 size_t len2 = strlen(NameStr(*arg2));
2889 Oid collid = PG_GET_COLLATION();
2890 bool result;
2891
2892 check_collation_set(collid);
2893
2894 if (collid == C_COLLATION_OID)
2895 result = (len1 == len2 &&
2896 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2897 else
2898 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2899 NameStr(*arg2), len2,
2900 collid) == 0);
2901
2902 PG_FREE_IF_COPY(arg1, 0);
2903
2904 PG_RETURN_BOOL(result);
2905 }
2906
2907 Datum
namenetext(PG_FUNCTION_ARGS)2908 namenetext(PG_FUNCTION_ARGS)
2909 {
2910 Name arg1 = PG_GETARG_NAME(0);
2911 text *arg2 = PG_GETARG_TEXT_PP(1);
2912 size_t len1 = strlen(NameStr(*arg1));
2913 size_t len2 = VARSIZE_ANY_EXHDR(arg2);
2914 Oid collid = PG_GET_COLLATION();
2915 bool result;
2916
2917 check_collation_set(collid);
2918
2919 if (collid == C_COLLATION_OID)
2920 result = !(len1 == len2 &&
2921 memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2922 else
2923 result = !(varstr_cmp(NameStr(*arg1), len1,
2924 VARDATA_ANY(arg2), len2,
2925 collid) == 0);
2926
2927 PG_FREE_IF_COPY(arg2, 1);
2928
2929 PG_RETURN_BOOL(result);
2930 }
2931
2932 Datum
textnename(PG_FUNCTION_ARGS)2933 textnename(PG_FUNCTION_ARGS)
2934 {
2935 text *arg1 = PG_GETARG_TEXT_PP(0);
2936 Name arg2 = PG_GETARG_NAME(1);
2937 size_t len1 = VARSIZE_ANY_EXHDR(arg1);
2938 size_t len2 = strlen(NameStr(*arg2));
2939 Oid collid = PG_GET_COLLATION();
2940 bool result;
2941
2942 check_collation_set(collid);
2943
2944 if (collid == C_COLLATION_OID)
2945 result = !(len1 == len2 &&
2946 memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2947 else
2948 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2949 NameStr(*arg2), len2,
2950 collid) == 0);
2951
2952 PG_FREE_IF_COPY(arg1, 0);
2953
2954 PG_RETURN_BOOL(result);
2955 }
2956
2957 Datum
btnametextcmp(PG_FUNCTION_ARGS)2958 btnametextcmp(PG_FUNCTION_ARGS)
2959 {
2960 Name arg1 = PG_GETARG_NAME(0);
2961 text *arg2 = PG_GETARG_TEXT_PP(1);
2962 int32 result;
2963
2964 result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2965 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2966 PG_GET_COLLATION());
2967
2968 PG_FREE_IF_COPY(arg2, 1);
2969
2970 PG_RETURN_INT32(result);
2971 }
2972
2973 Datum
bttextnamecmp(PG_FUNCTION_ARGS)2974 bttextnamecmp(PG_FUNCTION_ARGS)
2975 {
2976 text *arg1 = PG_GETARG_TEXT_PP(0);
2977 Name arg2 = PG_GETARG_NAME(1);
2978 int32 result;
2979
2980 result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2981 NameStr(*arg2), strlen(NameStr(*arg2)),
2982 PG_GET_COLLATION());
2983
2984 PG_FREE_IF_COPY(arg1, 0);
2985
2986 PG_RETURN_INT32(result);
2987 }
2988
2989 #define CmpCall(cmpfunc) \
2990 DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2991 PG_GET_COLLATION(), \
2992 PG_GETARG_DATUM(0), \
2993 PG_GETARG_DATUM(1)))
2994
2995 Datum
namelttext(PG_FUNCTION_ARGS)2996 namelttext(PG_FUNCTION_ARGS)
2997 {
2998 PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2999 }
3000
3001 Datum
nameletext(PG_FUNCTION_ARGS)3002 nameletext(PG_FUNCTION_ARGS)
3003 {
3004 PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3005 }
3006
3007 Datum
namegttext(PG_FUNCTION_ARGS)3008 namegttext(PG_FUNCTION_ARGS)
3009 {
3010 PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3011 }
3012
3013 Datum
namegetext(PG_FUNCTION_ARGS)3014 namegetext(PG_FUNCTION_ARGS)
3015 {
3016 PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3017 }
3018
3019 Datum
textltname(PG_FUNCTION_ARGS)3020 textltname(PG_FUNCTION_ARGS)
3021 {
3022 PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3023 }
3024
3025 Datum
textlename(PG_FUNCTION_ARGS)3026 textlename(PG_FUNCTION_ARGS)
3027 {
3028 PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3029 }
3030
3031 Datum
textgtname(PG_FUNCTION_ARGS)3032 textgtname(PG_FUNCTION_ARGS)
3033 {
3034 PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3035 }
3036
3037 Datum
textgename(PG_FUNCTION_ARGS)3038 textgename(PG_FUNCTION_ARGS)
3039 {
3040 PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3041 }
3042
3043 #undef CmpCall
3044
3045
3046 /*
3047 * The following operators support character-by-character comparison
3048 * of text datums, to allow building indexes suitable for LIKE clauses.
3049 * Note that the regular texteq/textne comparison operators, and regular
3050 * support functions 1 and 2 with "C" collation are assumed to be
3051 * compatible with these!
3052 */
3053
3054 static int
internal_text_pattern_compare(text * arg1,text * arg2)3055 internal_text_pattern_compare(text *arg1, text *arg2)
3056 {
3057 int result;
3058 int len1,
3059 len2;
3060
3061 len1 = VARSIZE_ANY_EXHDR(arg1);
3062 len2 = VARSIZE_ANY_EXHDR(arg2);
3063
3064 result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3065 if (result != 0)
3066 return result;
3067 else if (len1 < len2)
3068 return -1;
3069 else if (len1 > len2)
3070 return 1;
3071 else
3072 return 0;
3073 }
3074
3075
3076 Datum
text_pattern_lt(PG_FUNCTION_ARGS)3077 text_pattern_lt(PG_FUNCTION_ARGS)
3078 {
3079 text *arg1 = PG_GETARG_TEXT_PP(0);
3080 text *arg2 = PG_GETARG_TEXT_PP(1);
3081 int result;
3082
3083 result = internal_text_pattern_compare(arg1, arg2);
3084
3085 PG_FREE_IF_COPY(arg1, 0);
3086 PG_FREE_IF_COPY(arg2, 1);
3087
3088 PG_RETURN_BOOL(result < 0);
3089 }
3090
3091
3092 Datum
text_pattern_le(PG_FUNCTION_ARGS)3093 text_pattern_le(PG_FUNCTION_ARGS)
3094 {
3095 text *arg1 = PG_GETARG_TEXT_PP(0);
3096 text *arg2 = PG_GETARG_TEXT_PP(1);
3097 int result;
3098
3099 result = internal_text_pattern_compare(arg1, arg2);
3100
3101 PG_FREE_IF_COPY(arg1, 0);
3102 PG_FREE_IF_COPY(arg2, 1);
3103
3104 PG_RETURN_BOOL(result <= 0);
3105 }
3106
3107
3108 Datum
text_pattern_ge(PG_FUNCTION_ARGS)3109 text_pattern_ge(PG_FUNCTION_ARGS)
3110 {
3111 text *arg1 = PG_GETARG_TEXT_PP(0);
3112 text *arg2 = PG_GETARG_TEXT_PP(1);
3113 int result;
3114
3115 result = internal_text_pattern_compare(arg1, arg2);
3116
3117 PG_FREE_IF_COPY(arg1, 0);
3118 PG_FREE_IF_COPY(arg2, 1);
3119
3120 PG_RETURN_BOOL(result >= 0);
3121 }
3122
3123
3124 Datum
text_pattern_gt(PG_FUNCTION_ARGS)3125 text_pattern_gt(PG_FUNCTION_ARGS)
3126 {
3127 text *arg1 = PG_GETARG_TEXT_PP(0);
3128 text *arg2 = PG_GETARG_TEXT_PP(1);
3129 int result;
3130
3131 result = internal_text_pattern_compare(arg1, arg2);
3132
3133 PG_FREE_IF_COPY(arg1, 0);
3134 PG_FREE_IF_COPY(arg2, 1);
3135
3136 PG_RETURN_BOOL(result > 0);
3137 }
3138
3139
3140 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)3141 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3142 {
3143 text *arg1 = PG_GETARG_TEXT_PP(0);
3144 text *arg2 = PG_GETARG_TEXT_PP(1);
3145 int result;
3146
3147 result = internal_text_pattern_compare(arg1, arg2);
3148
3149 PG_FREE_IF_COPY(arg1, 0);
3150 PG_FREE_IF_COPY(arg2, 1);
3151
3152 PG_RETURN_INT32(result);
3153 }
3154
3155
3156 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)3157 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3158 {
3159 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3160 MemoryContext oldcontext;
3161
3162 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3163
3164 /* Use generic string SortSupport, forcing "C" collation */
3165 varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3166
3167 MemoryContextSwitchTo(oldcontext);
3168
3169 PG_RETURN_VOID();
3170 }
3171
3172
3173 /*-------------------------------------------------------------
3174 * byteaoctetlen
3175 *
3176 * get the number of bytes contained in an instance of type 'bytea'
3177 *-------------------------------------------------------------
3178 */
3179 Datum
byteaoctetlen(PG_FUNCTION_ARGS)3180 byteaoctetlen(PG_FUNCTION_ARGS)
3181 {
3182 Datum str = PG_GETARG_DATUM(0);
3183
3184 /* We need not detoast the input at all */
3185 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3186 }
3187
3188 /*
3189 * byteacat -
3190 * takes two bytea* and returns a bytea* that is the concatenation of
3191 * the two.
3192 *
3193 * Cloned from textcat and modified as required.
3194 */
3195 Datum
byteacat(PG_FUNCTION_ARGS)3196 byteacat(PG_FUNCTION_ARGS)
3197 {
3198 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3199 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3200
3201 PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3202 }
3203
3204 /*
3205 * bytea_catenate
3206 * Guts of byteacat(), broken out so it can be used by other functions
3207 *
3208 * Arguments can be in short-header form, but not compressed or out-of-line
3209 */
3210 static bytea *
bytea_catenate(bytea * t1,bytea * t2)3211 bytea_catenate(bytea *t1, bytea *t2)
3212 {
3213 bytea *result;
3214 int len1,
3215 len2,
3216 len;
3217 char *ptr;
3218
3219 len1 = VARSIZE_ANY_EXHDR(t1);
3220 len2 = VARSIZE_ANY_EXHDR(t2);
3221
3222 /* paranoia ... probably should throw error instead? */
3223 if (len1 < 0)
3224 len1 = 0;
3225 if (len2 < 0)
3226 len2 = 0;
3227
3228 len = len1 + len2 + VARHDRSZ;
3229 result = (bytea *) palloc(len);
3230
3231 /* Set size of result string... */
3232 SET_VARSIZE(result, len);
3233
3234 /* Fill data field of result string... */
3235 ptr = VARDATA(result);
3236 if (len1 > 0)
3237 memcpy(ptr, VARDATA_ANY(t1), len1);
3238 if (len2 > 0)
3239 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3240
3241 return result;
3242 }
3243
/*
 * PG_STR_GET_BYTEA
 *		Convert a C string to a bytea pointer by calling byteain() on it.
 */
#define PG_STR_GET_BYTEA(cstr_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, \
										CStringGetDatum(cstr_)))
3246
3247 /*
3248 * bytea_substr()
3249 * Return a substring starting at the specified position.
3250 * Cloned from text_substr and modified as required.
3251 *
3252 * Input:
3253 * - string
3254 * - starting position (is one-based)
3255 * - string length (optional)
3256 *
3257 * If the starting position is zero or less, then return from the start of the string
3258 * adjusting the length to be consistent with the "negative start" per SQL.
3259 * If the length is less than zero, an ERROR is thrown. If no third argument
3260 * (length) is provided, the length to the end of the string is assumed.
3261 */
3262 Datum
bytea_substr(PG_FUNCTION_ARGS)3263 bytea_substr(PG_FUNCTION_ARGS)
3264 {
3265 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3266 PG_GETARG_INT32(1),
3267 PG_GETARG_INT32(2),
3268 false));
3269 }
3270
3271 /*
3272 * bytea_substr_no_len -
3273 * Wrapper to avoid opr_sanity failure due to
3274 * one function accepting a different number of args.
3275 */
3276 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)3277 bytea_substr_no_len(PG_FUNCTION_ARGS)
3278 {
3279 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3280 PG_GETARG_INT32(1),
3281 -1,
3282 true));
3283 }
3284
3285 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)3286 bytea_substring(Datum str,
3287 int S,
3288 int L,
3289 bool length_not_specified)
3290 {
3291 int32 S1; /* adjusted start position */
3292 int32 L1; /* adjusted substring length */
3293 int32 E; /* end position */
3294
3295 /*
3296 * The logic here should generally match text_substring().
3297 */
3298 S1 = Max(S, 1);
3299
3300 if (length_not_specified)
3301 {
3302 /*
3303 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3304 * end of the string if we pass it a negative value for length.
3305 */
3306 L1 = -1;
3307 }
3308 else if (L < 0)
3309 {
3310 /* SQL99 says to throw an error for E < S, i.e., negative length */
3311 ereport(ERROR,
3312 (errcode(ERRCODE_SUBSTRING_ERROR),
3313 errmsg("negative substring length not allowed")));
3314 L1 = -1; /* silence stupider compilers */
3315 }
3316 else if (pg_add_s32_overflow(S, L, &E))
3317 {
3318 /*
3319 * L could be large enough for S + L to overflow, in which case the
3320 * substring must run to end of string.
3321 */
3322 L1 = -1;
3323 }
3324 else
3325 {
3326 /*
3327 * A zero or negative value for the end position can happen if the
3328 * start was negative or one. SQL99 says to return a zero-length
3329 * string.
3330 */
3331 if (E < 1)
3332 return PG_STR_GET_BYTEA("");
3333
3334 L1 = E - S1;
3335 }
3336
3337 /*
3338 * If the start position is past the end of the string, SQL99 says to
3339 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3340 * us. We need only convert S1 to zero-based starting position.
3341 */
3342 return DatumGetByteaPSlice(str, S1 - 1, L1);
3343 }
3344
3345 /*
3346 * byteaoverlay
3347 * Replace specified substring of first string with second
3348 *
3349 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3350 * This code is a direct implementation of what the standard says.
3351 */
3352 Datum
byteaoverlay(PG_FUNCTION_ARGS)3353 byteaoverlay(PG_FUNCTION_ARGS)
3354 {
3355 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3356 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3357 int sp = PG_GETARG_INT32(2); /* substring start position */
3358 int sl = PG_GETARG_INT32(3); /* substring length */
3359
3360 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3361 }
3362
3363 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)3364 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3365 {
3366 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3367 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3368 int sp = PG_GETARG_INT32(2); /* substring start position */
3369 int sl;
3370
3371 sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3372 PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3373 }
3374
3375 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)3376 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3377 {
3378 bytea *result;
3379 bytea *s1;
3380 bytea *s2;
3381 int sp_pl_sl;
3382
3383 /*
3384 * Check for possible integer-overflow cases. For negative sp, throw a
3385 * "substring length" error because that's what should be expected
3386 * according to the spec's definition of OVERLAY().
3387 */
3388 if (sp <= 0)
3389 ereport(ERROR,
3390 (errcode(ERRCODE_SUBSTRING_ERROR),
3391 errmsg("negative substring length not allowed")));
3392 if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3393 ereport(ERROR,
3394 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3395 errmsg("integer out of range")));
3396
3397 s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3398 s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3399 result = bytea_catenate(s1, t2);
3400 result = bytea_catenate(result, s2);
3401
3402 return result;
3403 }
3404
3405 /*
3406 * byteapos -
3407 * Return the position of the specified substring.
3408 * Implements the SQL POSITION() function.
3409 * Cloned from textpos and modified as required.
3410 */
3411 Datum
byteapos(PG_FUNCTION_ARGS)3412 byteapos(PG_FUNCTION_ARGS)
3413 {
3414 bytea *t1 = PG_GETARG_BYTEA_PP(0);
3415 bytea *t2 = PG_GETARG_BYTEA_PP(1);
3416 int pos;
3417 int px,
3418 p;
3419 int len1,
3420 len2;
3421 char *p1,
3422 *p2;
3423
3424 len1 = VARSIZE_ANY_EXHDR(t1);
3425 len2 = VARSIZE_ANY_EXHDR(t2);
3426
3427 if (len2 <= 0)
3428 PG_RETURN_INT32(1); /* result for empty pattern */
3429
3430 p1 = VARDATA_ANY(t1);
3431 p2 = VARDATA_ANY(t2);
3432
3433 pos = 0;
3434 px = (len1 - len2);
3435 for (p = 0; p <= px; p++)
3436 {
3437 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3438 {
3439 pos = p + 1;
3440 break;
3441 };
3442 p1++;
3443 };
3444
3445 PG_RETURN_INT32(pos);
3446 }
3447
3448 /*-------------------------------------------------------------
3449 * byteaGetByte
3450 *
3451 * this routine treats "bytea" as an array of bytes.
3452 * It returns the Nth byte (a number between 0 and 255).
3453 *-------------------------------------------------------------
3454 */
3455 Datum
byteaGetByte(PG_FUNCTION_ARGS)3456 byteaGetByte(PG_FUNCTION_ARGS)
3457 {
3458 bytea *v = PG_GETARG_BYTEA_PP(0);
3459 int32 n = PG_GETARG_INT32(1);
3460 int len;
3461 int byte;
3462
3463 len = VARSIZE_ANY_EXHDR(v);
3464
3465 if (n < 0 || n >= len)
3466 ereport(ERROR,
3467 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3468 errmsg("index %d out of valid range, 0..%d",
3469 n, len - 1)));
3470
3471 byte = ((unsigned char *) VARDATA_ANY(v))[n];
3472
3473 PG_RETURN_INT32(byte);
3474 }
3475
3476 /*-------------------------------------------------------------
3477 * byteaGetBit
3478 *
3479 * This routine treats a "bytea" type like an array of bits.
3480 * It returns the value of the Nth bit (0 or 1).
3481 *
3482 *-------------------------------------------------------------
3483 */
3484 Datum
byteaGetBit(PG_FUNCTION_ARGS)3485 byteaGetBit(PG_FUNCTION_ARGS)
3486 {
3487 bytea *v = PG_GETARG_BYTEA_PP(0);
3488 int64 n = PG_GETARG_INT64(1);
3489 int byteNo,
3490 bitNo;
3491 int len;
3492 int byte;
3493
3494 len = VARSIZE_ANY_EXHDR(v);
3495
3496 if (n < 0 || n >= (int64) len * 8)
3497 ereport(ERROR,
3498 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3499 errmsg("index %lld out of valid range, 0..%lld",
3500 (long long) n, (long long) len * 8 - 1)));
3501
3502 /* n/8 is now known < len, so safe to cast to int */
3503 byteNo = (int) (n / 8);
3504 bitNo = (int) (n % 8);
3505
3506 byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3507
3508 if (byte & (1 << bitNo))
3509 PG_RETURN_INT32(1);
3510 else
3511 PG_RETURN_INT32(0);
3512 }
3513
3514 /*-------------------------------------------------------------
3515 * byteaSetByte
3516 *
3517 * Given an instance of type 'bytea' creates a new one with
3518 * the Nth byte set to the given value.
3519 *
3520 *-------------------------------------------------------------
3521 */
3522 Datum
byteaSetByte(PG_FUNCTION_ARGS)3523 byteaSetByte(PG_FUNCTION_ARGS)
3524 {
3525 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3526 int32 n = PG_GETARG_INT32(1);
3527 int32 newByte = PG_GETARG_INT32(2);
3528 int len;
3529
3530 len = VARSIZE(res) - VARHDRSZ;
3531
3532 if (n < 0 || n >= len)
3533 ereport(ERROR,
3534 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3535 errmsg("index %d out of valid range, 0..%d",
3536 n, len - 1)));
3537
3538 /*
3539 * Now set the byte.
3540 */
3541 ((unsigned char *) VARDATA(res))[n] = newByte;
3542
3543 PG_RETURN_BYTEA_P(res);
3544 }
3545
3546 /*-------------------------------------------------------------
3547 * byteaSetBit
3548 *
3549 * Given an instance of type 'bytea' creates a new one with
3550 * the Nth bit set to the given value.
3551 *
3552 *-------------------------------------------------------------
3553 */
3554 Datum
byteaSetBit(PG_FUNCTION_ARGS)3555 byteaSetBit(PG_FUNCTION_ARGS)
3556 {
3557 bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3558 int64 n = PG_GETARG_INT64(1);
3559 int32 newBit = PG_GETARG_INT32(2);
3560 int len;
3561 int oldByte,
3562 newByte;
3563 int byteNo,
3564 bitNo;
3565
3566 len = VARSIZE(res) - VARHDRSZ;
3567
3568 if (n < 0 || n >= (int64) len * 8)
3569 ereport(ERROR,
3570 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3571 errmsg("index %lld out of valid range, 0..%lld",
3572 (long long) n, (long long) len * 8 - 1)));
3573
3574 /* n/8 is now known < len, so safe to cast to int */
3575 byteNo = (int) (n / 8);
3576 bitNo = (int) (n % 8);
3577
3578 /*
3579 * sanity check!
3580 */
3581 if (newBit != 0 && newBit != 1)
3582 ereport(ERROR,
3583 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3584 errmsg("new bit must be 0 or 1")));
3585
3586 /*
3587 * Update the byte.
3588 */
3589 oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3590
3591 if (newBit == 0)
3592 newByte = oldByte & (~(1 << bitNo));
3593 else
3594 newByte = oldByte | (1 << bitNo);
3595
3596 ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3597
3598 PG_RETURN_BYTEA_P(res);
3599 }
3600
3601
3602 /* text_name()
3603 * Converts a text type to a Name type.
3604 */
3605 Datum
text_name(PG_FUNCTION_ARGS)3606 text_name(PG_FUNCTION_ARGS)
3607 {
3608 text *s = PG_GETARG_TEXT_PP(0);
3609 Name result;
3610 int len;
3611
3612 len = VARSIZE_ANY_EXHDR(s);
3613
3614 /* Truncate oversize input */
3615 if (len >= NAMEDATALEN)
3616 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3617
3618 /* We use palloc0 here to ensure result is zero-padded */
3619 result = (Name) palloc0(NAMEDATALEN);
3620 memcpy(NameStr(*result), VARDATA_ANY(s), len);
3621
3622 PG_RETURN_NAME(result);
3623 }
3624
3625 /* name_text()
3626 * Converts a Name type to a text type.
3627 */
3628 Datum
name_text(PG_FUNCTION_ARGS)3629 name_text(PG_FUNCTION_ARGS)
3630 {
3631 Name s = PG_GETARG_NAME(0);
3632
3633 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3634 }
3635
3636
3637 /*
3638 * textToQualifiedNameList - convert a text object to list of names
3639 *
3640 * This implements the input parsing needed by nextval() and other
3641 * functions that take a text parameter representing a qualified name.
3642 * We split the name at dots, downcase if not double-quoted, and
3643 * truncate names if they're too long.
3644 */
3645 List *
textToQualifiedNameList(text * textval)3646 textToQualifiedNameList(text *textval)
3647 {
3648 char *rawname;
3649 List *result = NIL;
3650 List *namelist;
3651 ListCell *l;
3652
3653 /* Convert to C string (handles possible detoasting). */
3654 /* Note we rely on being able to modify rawname below. */
3655 rawname = text_to_cstring(textval);
3656
3657 if (!SplitIdentifierString(rawname, '.', &namelist))
3658 ereport(ERROR,
3659 (errcode(ERRCODE_INVALID_NAME),
3660 errmsg("invalid name syntax")));
3661
3662 if (namelist == NIL)
3663 ereport(ERROR,
3664 (errcode(ERRCODE_INVALID_NAME),
3665 errmsg("invalid name syntax")));
3666
3667 foreach(l, namelist)
3668 {
3669 char *curname = (char *) lfirst(l);
3670
3671 result = lappend(result, makeString(pstrdup(curname)));
3672 }
3673
3674 pfree(rawname);
3675 list_free(namelist);
3676
3677 return result;
3678 }
3679
3680 /*
3681 * SplitIdentifierString --- parse a string containing identifiers
3682 *
3683 * This is the guts of textToQualifiedNameList, and is exported for use in
3684 * other situations such as parsing GUC variables. In the GUC case, it's
3685 * important to avoid memory leaks, so the API is designed to minimize the
3686 * amount of stuff that needs to be allocated and freed.
3687 *
3688 * Inputs:
3689 * rawstring: the input string; must be overwritable! On return, it's
3690 * been modified to contain the separated identifiers.
3691 * separator: the separator punctuation expected between identifiers
3692 * (typically '.' or ','). Whitespace may also appear around
3693 * identifiers.
3694 * Outputs:
3695 * namelist: filled with a palloc'd list of pointers to identifiers within
3696 * rawstring. Caller should list_free() this even on error return.
3697 *
3698 * Returns true if okay, false if there is a syntax error in the string.
3699 *
3700 * Note that an empty string is considered okay here, though not in
3701 * textToQualifiedNameList.
3702 */
3703 bool
SplitIdentifierString(char * rawstring,char separator,List ** namelist)3704 SplitIdentifierString(char *rawstring, char separator,
3705 List **namelist)
3706 {
3707 char *nextp = rawstring;
3708 bool done = false;
3709
3710 *namelist = NIL;
3711
3712 while (scanner_isspace(*nextp))
3713 nextp++; /* skip leading whitespace */
3714
3715 if (*nextp == '\0')
3716 return true; /* allow empty string */
3717
3718 /* At the top of the loop, we are at start of a new identifier. */
3719 do
3720 {
3721 char *curname;
3722 char *endp;
3723
3724 if (*nextp == '"')
3725 {
3726 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3727 curname = nextp + 1;
3728 for (;;)
3729 {
3730 endp = strchr(nextp + 1, '"');
3731 if (endp == NULL)
3732 return false; /* mismatched quotes */
3733 if (endp[1] != '"')
3734 break; /* found end of quoted name */
3735 /* Collapse adjacent quotes into one quote, and look again */
3736 memmove(endp, endp + 1, strlen(endp));
3737 nextp = endp;
3738 }
3739 /* endp now points at the terminating quote */
3740 nextp = endp + 1;
3741 }
3742 else
3743 {
3744 /* Unquoted name --- extends to separator or whitespace */
3745 char *downname;
3746 int len;
3747
3748 curname = nextp;
3749 while (*nextp && *nextp != separator &&
3750 !scanner_isspace(*nextp))
3751 nextp++;
3752 endp = nextp;
3753 if (curname == nextp)
3754 return false; /* empty unquoted name not allowed */
3755
3756 /*
3757 * Downcase the identifier, using same code as main lexer does.
3758 *
3759 * XXX because we want to overwrite the input in-place, we cannot
3760 * support a downcasing transformation that increases the string
3761 * length. This is not a problem given the current implementation
3762 * of downcase_truncate_identifier, but we'll probably have to do
3763 * something about this someday.
3764 */
3765 len = endp - curname;
3766 downname = downcase_truncate_identifier(curname, len, false);
3767 Assert(strlen(downname) <= len);
3768 strncpy(curname, downname, len); /* strncpy is required here */
3769 pfree(downname);
3770 }
3771
3772 while (scanner_isspace(*nextp))
3773 nextp++; /* skip trailing whitespace */
3774
3775 if (*nextp == separator)
3776 {
3777 nextp++;
3778 while (scanner_isspace(*nextp))
3779 nextp++; /* skip leading whitespace for next */
3780 /* we expect another name, so done remains false */
3781 }
3782 else if (*nextp == '\0')
3783 done = true;
3784 else
3785 return false; /* invalid syntax */
3786
3787 /* Now safe to overwrite separator with a null */
3788 *endp = '\0';
3789
3790 /* Truncate name if it's overlength */
3791 truncate_identifier(curname, strlen(curname), false);
3792
3793 /*
3794 * Finished isolating current name --- add it to list
3795 */
3796 *namelist = lappend(*namelist, curname);
3797
3798 /* Loop back if we didn't reach end of string */
3799 } while (!done);
3800
3801 return true;
3802 }
3803
3804
3805 /*
3806 * SplitDirectoriesString --- parse a string containing file/directory names
3807 *
3808 * This works fine on file names too; the function name is historical.
3809 *
3810 * This is similar to SplitIdentifierString, except that the parsing
3811 * rules are meant to handle pathnames instead of identifiers: there is
3812 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3813 * and we apply canonicalize_path() to each extracted string. Because of the
3814 * last, the returned strings are separately palloc'd rather than being
3815 * pointers into rawstring --- but we still scribble on rawstring.
3816 *
3817 * Inputs:
3818 * rawstring: the input string; must be modifiable!
3819 * separator: the separator punctuation expected between directories
3820 * (typically ',' or ';'). Whitespace may also appear around
3821 * directories.
3822 * Outputs:
3823 * namelist: filled with a palloc'd list of directory names.
3824 * Caller should list_free_deep() this even on error return.
3825 *
3826 * Returns true if okay, false if there is a syntax error in the string.
3827 *
3828 * Note that an empty string is considered okay here.
3829 */
3830 bool
SplitDirectoriesString(char * rawstring,char separator,List ** namelist)3831 SplitDirectoriesString(char *rawstring, char separator,
3832 List **namelist)
3833 {
3834 char *nextp = rawstring;
3835 bool done = false;
3836
3837 *namelist = NIL;
3838
3839 while (scanner_isspace(*nextp))
3840 nextp++; /* skip leading whitespace */
3841
3842 if (*nextp == '\0')
3843 return true; /* allow empty string */
3844
3845 /* At the top of the loop, we are at start of a new directory. */
3846 do
3847 {
3848 char *curname;
3849 char *endp;
3850
3851 if (*nextp == '"')
3852 {
3853 /* Quoted name --- collapse quote-quote pairs */
3854 curname = nextp + 1;
3855 for (;;)
3856 {
3857 endp = strchr(nextp + 1, '"');
3858 if (endp == NULL)
3859 return false; /* mismatched quotes */
3860 if (endp[1] != '"')
3861 break; /* found end of quoted name */
3862 /* Collapse adjacent quotes into one quote, and look again */
3863 memmove(endp, endp + 1, strlen(endp));
3864 nextp = endp;
3865 }
3866 /* endp now points at the terminating quote */
3867 nextp = endp + 1;
3868 }
3869 else
3870 {
3871 /* Unquoted name --- extends to separator or end of string */
3872 curname = endp = nextp;
3873 while (*nextp && *nextp != separator)
3874 {
3875 /* trailing whitespace should not be included in name */
3876 if (!scanner_isspace(*nextp))
3877 endp = nextp + 1;
3878 nextp++;
3879 }
3880 if (curname == endp)
3881 return false; /* empty unquoted name not allowed */
3882 }
3883
3884 while (scanner_isspace(*nextp))
3885 nextp++; /* skip trailing whitespace */
3886
3887 if (*nextp == separator)
3888 {
3889 nextp++;
3890 while (scanner_isspace(*nextp))
3891 nextp++; /* skip leading whitespace for next */
3892 /* we expect another name, so done remains false */
3893 }
3894 else if (*nextp == '\0')
3895 done = true;
3896 else
3897 return false; /* invalid syntax */
3898
3899 /* Now safe to overwrite separator with a null */
3900 *endp = '\0';
3901
3902 /* Truncate path if it's overlength */
3903 if (strlen(curname) >= MAXPGPATH)
3904 curname[MAXPGPATH - 1] = '\0';
3905
3906 /*
3907 * Finished isolating current name --- add it to list
3908 */
3909 curname = pstrdup(curname);
3910 canonicalize_path(curname);
3911 *namelist = lappend(*namelist, curname);
3912
3913 /* Loop back if we didn't reach end of string */
3914 } while (!done);
3915
3916 return true;
3917 }
3918
3919
3920 /*
3921 * SplitGUCList --- parse a string containing identifiers or file names
3922 *
3923 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3924 * presuming whether the elements will be taken as identifiers or file names.
3925 * We assume the input has already been through flatten_set_variable_args(),
3926 * so that we need never downcase (if appropriate, that was done already).
3927 * Nor do we ever truncate, since we don't know the correct max length.
3928 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3929 * because any embedded whitespace should have led to double-quoting).
3930 * Otherwise the API is identical to SplitIdentifierString.
3931 *
3932 * XXX it's annoying to have so many copies of this string-splitting logic.
3933 * However, it's not clear that having one function with a bunch of option
3934 * flags would be much better.
3935 *
3936 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3937 * Be sure to update that if you have to change this.
3938 *
3939 * Inputs:
3940 * rawstring: the input string; must be overwritable! On return, it's
3941 * been modified to contain the separated identifiers.
3942 * separator: the separator punctuation expected between identifiers
3943 * (typically '.' or ','). Whitespace may also appear around
3944 * identifiers.
3945 * Outputs:
3946 * namelist: filled with a palloc'd list of pointers to identifiers within
3947 * rawstring. Caller should list_free() this even on error return.
3948 *
3949 * Returns true if okay, false if there is a syntax error in the string.
3950 */
3951 bool
SplitGUCList(char * rawstring,char separator,List ** namelist)3952 SplitGUCList(char *rawstring, char separator,
3953 List **namelist)
3954 {
3955 char *nextp = rawstring;
3956 bool done = false;
3957
3958 *namelist = NIL;
3959
3960 while (scanner_isspace(*nextp))
3961 nextp++; /* skip leading whitespace */
3962
3963 if (*nextp == '\0')
3964 return true; /* allow empty string */
3965
3966 /* At the top of the loop, we are at start of a new identifier. */
3967 do
3968 {
3969 char *curname;
3970 char *endp;
3971
3972 if (*nextp == '"')
3973 {
3974 /* Quoted name --- collapse quote-quote pairs */
3975 curname = nextp + 1;
3976 for (;;)
3977 {
3978 endp = strchr(nextp + 1, '"');
3979 if (endp == NULL)
3980 return false; /* mismatched quotes */
3981 if (endp[1] != '"')
3982 break; /* found end of quoted name */
3983 /* Collapse adjacent quotes into one quote, and look again */
3984 memmove(endp, endp + 1, strlen(endp));
3985 nextp = endp;
3986 }
3987 /* endp now points at the terminating quote */
3988 nextp = endp + 1;
3989 }
3990 else
3991 {
3992 /* Unquoted name --- extends to separator or whitespace */
3993 curname = nextp;
3994 while (*nextp && *nextp != separator &&
3995 !scanner_isspace(*nextp))
3996 nextp++;
3997 endp = nextp;
3998 if (curname == nextp)
3999 return false; /* empty unquoted name not allowed */
4000 }
4001
4002 while (scanner_isspace(*nextp))
4003 nextp++; /* skip trailing whitespace */
4004
4005 if (*nextp == separator)
4006 {
4007 nextp++;
4008 while (scanner_isspace(*nextp))
4009 nextp++; /* skip leading whitespace for next */
4010 /* we expect another name, so done remains false */
4011 }
4012 else if (*nextp == '\0')
4013 done = true;
4014 else
4015 return false; /* invalid syntax */
4016
4017 /* Now safe to overwrite separator with a null */
4018 *endp = '\0';
4019
4020 /*
4021 * Finished isolating current name --- add it to list
4022 */
4023 *namelist = lappend(*namelist, curname);
4024
4025 /* Loop back if we didn't reach end of string */
4026 } while (!done);
4027
4028 return true;
4029 }
4030
4031
4032 /*****************************************************************************
4033 * Comparison Functions used for bytea
4034 *
4035 * Note: btree indexes need these routines not to leak memory; therefore,
4036 * be careful to free working copies of toasted datums. Most places don't
4037 * need to be so careful.
4038 *****************************************************************************/
4039
4040 Datum
byteaeq(PG_FUNCTION_ARGS)4041 byteaeq(PG_FUNCTION_ARGS)
4042 {
4043 Datum arg1 = PG_GETARG_DATUM(0);
4044 Datum arg2 = PG_GETARG_DATUM(1);
4045 bool result;
4046 Size len1,
4047 len2;
4048
4049 /*
4050 * We can use a fast path for unequal lengths, which might save us from
4051 * having to detoast one or both values.
4052 */
4053 len1 = toast_raw_datum_size(arg1);
4054 len2 = toast_raw_datum_size(arg2);
4055 if (len1 != len2)
4056 result = false;
4057 else
4058 {
4059 bytea *barg1 = DatumGetByteaPP(arg1);
4060 bytea *barg2 = DatumGetByteaPP(arg2);
4061
4062 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4063 len1 - VARHDRSZ) == 0);
4064
4065 PG_FREE_IF_COPY(barg1, 0);
4066 PG_FREE_IF_COPY(barg2, 1);
4067 }
4068
4069 PG_RETURN_BOOL(result);
4070 }
4071
4072 Datum
byteane(PG_FUNCTION_ARGS)4073 byteane(PG_FUNCTION_ARGS)
4074 {
4075 Datum arg1 = PG_GETARG_DATUM(0);
4076 Datum arg2 = PG_GETARG_DATUM(1);
4077 bool result;
4078 Size len1,
4079 len2;
4080
4081 /*
4082 * We can use a fast path for unequal lengths, which might save us from
4083 * having to detoast one or both values.
4084 */
4085 len1 = toast_raw_datum_size(arg1);
4086 len2 = toast_raw_datum_size(arg2);
4087 if (len1 != len2)
4088 result = true;
4089 else
4090 {
4091 bytea *barg1 = DatumGetByteaPP(arg1);
4092 bytea *barg2 = DatumGetByteaPP(arg2);
4093
4094 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4095 len1 - VARHDRSZ) != 0);
4096
4097 PG_FREE_IF_COPY(barg1, 0);
4098 PG_FREE_IF_COPY(barg2, 1);
4099 }
4100
4101 PG_RETURN_BOOL(result);
4102 }
4103
4104 Datum
bytealt(PG_FUNCTION_ARGS)4105 bytealt(PG_FUNCTION_ARGS)
4106 {
4107 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4108 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4109 int len1,
4110 len2;
4111 int cmp;
4112
4113 len1 = VARSIZE_ANY_EXHDR(arg1);
4114 len2 = VARSIZE_ANY_EXHDR(arg2);
4115
4116 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4117
4118 PG_FREE_IF_COPY(arg1, 0);
4119 PG_FREE_IF_COPY(arg2, 1);
4120
4121 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4122 }
4123
4124 Datum
byteale(PG_FUNCTION_ARGS)4125 byteale(PG_FUNCTION_ARGS)
4126 {
4127 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4128 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4129 int len1,
4130 len2;
4131 int cmp;
4132
4133 len1 = VARSIZE_ANY_EXHDR(arg1);
4134 len2 = VARSIZE_ANY_EXHDR(arg2);
4135
4136 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4137
4138 PG_FREE_IF_COPY(arg1, 0);
4139 PG_FREE_IF_COPY(arg2, 1);
4140
4141 PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4142 }
4143
4144 Datum
byteagt(PG_FUNCTION_ARGS)4145 byteagt(PG_FUNCTION_ARGS)
4146 {
4147 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4148 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4149 int len1,
4150 len2;
4151 int cmp;
4152
4153 len1 = VARSIZE_ANY_EXHDR(arg1);
4154 len2 = VARSIZE_ANY_EXHDR(arg2);
4155
4156 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4157
4158 PG_FREE_IF_COPY(arg1, 0);
4159 PG_FREE_IF_COPY(arg2, 1);
4160
4161 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4162 }
4163
4164 Datum
byteage(PG_FUNCTION_ARGS)4165 byteage(PG_FUNCTION_ARGS)
4166 {
4167 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4168 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4169 int len1,
4170 len2;
4171 int cmp;
4172
4173 len1 = VARSIZE_ANY_EXHDR(arg1);
4174 len2 = VARSIZE_ANY_EXHDR(arg2);
4175
4176 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4177
4178 PG_FREE_IF_COPY(arg1, 0);
4179 PG_FREE_IF_COPY(arg2, 1);
4180
4181 PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4182 }
4183
4184 Datum
byteacmp(PG_FUNCTION_ARGS)4185 byteacmp(PG_FUNCTION_ARGS)
4186 {
4187 bytea *arg1 = PG_GETARG_BYTEA_PP(0);
4188 bytea *arg2 = PG_GETARG_BYTEA_PP(1);
4189 int len1,
4190 len2;
4191 int cmp;
4192
4193 len1 = VARSIZE_ANY_EXHDR(arg1);
4194 len2 = VARSIZE_ANY_EXHDR(arg2);
4195
4196 cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4197 if ((cmp == 0) && (len1 != len2))
4198 cmp = (len1 < len2) ? -1 : 1;
4199
4200 PG_FREE_IF_COPY(arg1, 0);
4201 PG_FREE_IF_COPY(arg2, 1);
4202
4203 PG_RETURN_INT32(cmp);
4204 }
4205
4206 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)4207 bytea_sortsupport(PG_FUNCTION_ARGS)
4208 {
4209 SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4210 MemoryContext oldcontext;
4211
4212 oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4213
4214 /* Use generic string SortSupport, forcing "C" collation */
4215 varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4216
4217 MemoryContextSwitchTo(oldcontext);
4218
4219 PG_RETURN_VOID();
4220 }
4221
4222 /*
4223 * appendStringInfoText
4224 *
4225 * Append a text to str.
4226 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4227 */
4228 static void
appendStringInfoText(StringInfo str,const text * t)4229 appendStringInfoText(StringInfo str, const text *t)
4230 {
4231 appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4232 }
4233
4234 /*
4235 * replace_text
4236 * replace all occurrences of 'old_sub_str' in 'orig_str'
4237 * with 'new_sub_str' to form 'new_str'
4238 *
4239 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4240 * otherwise returns 'new_str'
4241 */
4242 Datum
replace_text(PG_FUNCTION_ARGS)4243 replace_text(PG_FUNCTION_ARGS)
4244 {
4245 text *src_text = PG_GETARG_TEXT_PP(0);
4246 text *from_sub_text = PG_GETARG_TEXT_PP(1);
4247 text *to_sub_text = PG_GETARG_TEXT_PP(2);
4248 int src_text_len;
4249 int from_sub_text_len;
4250 TextPositionState state;
4251 text *ret_text;
4252 int chunk_len;
4253 char *curr_ptr;
4254 char *start_ptr;
4255 StringInfoData str;
4256 bool found;
4257
4258 src_text_len = VARSIZE_ANY_EXHDR(src_text);
4259 from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4260
4261 /* Return unmodified source string if empty source or pattern */
4262 if (src_text_len < 1 || from_sub_text_len < 1)
4263 {
4264 PG_RETURN_TEXT_P(src_text);
4265 }
4266
4267 text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4268
4269 found = text_position_next(&state);
4270
4271 /* When the from_sub_text is not found, there is nothing to do. */
4272 if (!found)
4273 {
4274 text_position_cleanup(&state);
4275 PG_RETURN_TEXT_P(src_text);
4276 }
4277 curr_ptr = text_position_get_match_ptr(&state);
4278 start_ptr = VARDATA_ANY(src_text);
4279
4280 initStringInfo(&str);
4281
4282 do
4283 {
4284 CHECK_FOR_INTERRUPTS();
4285
4286 /* copy the data skipped over by last text_position_next() */
4287 chunk_len = curr_ptr - start_ptr;
4288 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4289
4290 appendStringInfoText(&str, to_sub_text);
4291
4292 start_ptr = curr_ptr + from_sub_text_len;
4293
4294 found = text_position_next(&state);
4295 if (found)
4296 curr_ptr = text_position_get_match_ptr(&state);
4297 }
4298 while (found);
4299
4300 /* copy trailing data */
4301 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4302 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4303
4304 text_position_cleanup(&state);
4305
4306 ret_text = cstring_to_text_with_len(str.data, str.len);
4307 pfree(str.data);
4308
4309 PG_RETURN_TEXT_P(ret_text);
4310 }
4311
4312 /*
4313 * check_replace_text_has_escape_char
4314 *
4315 * check whether replace_text contains escape char.
4316 */
4317 static bool
check_replace_text_has_escape_char(const text * replace_text)4318 check_replace_text_has_escape_char(const text *replace_text)
4319 {
4320 const char *p = VARDATA_ANY(replace_text);
4321 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4322
4323 if (pg_database_encoding_max_length() == 1)
4324 {
4325 for (; p < p_end; p++)
4326 {
4327 if (*p == '\\')
4328 return true;
4329 }
4330 }
4331 else
4332 {
4333 for (; p < p_end; p += pg_mblen(p))
4334 {
4335 if (*p == '\\')
4336 return true;
4337 }
4338 }
4339
4340 return false;
4341 }
4342
4343 /*
4344 * appendStringInfoRegexpSubstr
4345 *
4346 * Append replace_text to str, substituting regexp back references for
4347 * \n escapes. start_ptr is the start of the match in the source string,
4348 * at logical character position data_pos.
4349 */
4350 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)4351 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4352 regmatch_t *pmatch,
4353 char *start_ptr, int data_pos)
4354 {
4355 const char *p = VARDATA_ANY(replace_text);
4356 const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4357 int eml = pg_database_encoding_max_length();
4358
4359 for (;;)
4360 {
4361 const char *chunk_start = p;
4362 int so;
4363 int eo;
4364
4365 /* Find next escape char. */
4366 if (eml == 1)
4367 {
4368 for (; p < p_end && *p != '\\'; p++)
4369 /* nothing */ ;
4370 }
4371 else
4372 {
4373 for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4374 /* nothing */ ;
4375 }
4376
4377 /* Copy the text we just scanned over, if any. */
4378 if (p > chunk_start)
4379 appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4380
4381 /* Done if at end of string, else advance over escape char. */
4382 if (p >= p_end)
4383 break;
4384 p++;
4385
4386 if (p >= p_end)
4387 {
4388 /* Escape at very end of input. Treat same as unexpected char */
4389 appendStringInfoChar(str, '\\');
4390 break;
4391 }
4392
4393 if (*p >= '1' && *p <= '9')
4394 {
4395 /* Use the back reference of regexp. */
4396 int idx = *p - '0';
4397
4398 so = pmatch[idx].rm_so;
4399 eo = pmatch[idx].rm_eo;
4400 p++;
4401 }
4402 else if (*p == '&')
4403 {
4404 /* Use the entire matched string. */
4405 so = pmatch[0].rm_so;
4406 eo = pmatch[0].rm_eo;
4407 p++;
4408 }
4409 else if (*p == '\\')
4410 {
4411 /* \\ means transfer one \ to output. */
4412 appendStringInfoChar(str, '\\');
4413 p++;
4414 continue;
4415 }
4416 else
4417 {
4418 /*
4419 * If escape char is not followed by any expected char, just treat
4420 * it as ordinary data to copy. (XXX would it be better to throw
4421 * an error?)
4422 */
4423 appendStringInfoChar(str, '\\');
4424 continue;
4425 }
4426
4427 if (so != -1 && eo != -1)
4428 {
4429 /*
4430 * Copy the text that is back reference of regexp. Note so and eo
4431 * are counted in characters not bytes.
4432 */
4433 char *chunk_start;
4434 int chunk_len;
4435
4436 Assert(so >= data_pos);
4437 chunk_start = start_ptr;
4438 chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4439 chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4440 appendBinaryStringInfo(str, chunk_start, chunk_len);
4441 }
4442 }
4443 }
4444
4445 #define REGEXP_REPLACE_BACKREF_CNT 10
4446
4447 /*
4448 * replace_text_regexp
4449 *
4450 * replace text that matches to regexp in src_text to replace_text.
4451 *
4452 * Note: to avoid having to include regex.h in builtins.h, we declare
4453 * the regexp argument as void *, but really it's regex_t *.
4454 */
4455 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)4456 replace_text_regexp(text *src_text, void *regexp,
4457 text *replace_text, bool glob)
4458 {
4459 text *ret_text;
4460 regex_t *re = (regex_t *) regexp;
4461 int src_text_len = VARSIZE_ANY_EXHDR(src_text);
4462 StringInfoData buf;
4463 regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
4464 pg_wchar *data;
4465 size_t data_len;
4466 int search_start;
4467 int data_pos;
4468 char *start_ptr;
4469 bool have_escape;
4470
4471 initStringInfo(&buf);
4472
4473 /* Convert data string to wide characters. */
4474 data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4475 data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4476
4477 /* Check whether replace_text has escape char. */
4478 have_escape = check_replace_text_has_escape_char(replace_text);
4479
4480 /* start_ptr points to the data_pos'th character of src_text */
4481 start_ptr = (char *) VARDATA_ANY(src_text);
4482 data_pos = 0;
4483
4484 search_start = 0;
4485 while (search_start <= data_len)
4486 {
4487 int regexec_result;
4488
4489 CHECK_FOR_INTERRUPTS();
4490
4491 regexec_result = pg_regexec(re,
4492 data,
4493 data_len,
4494 search_start,
4495 NULL, /* no details */
4496 REGEXP_REPLACE_BACKREF_CNT,
4497 pmatch,
4498 0);
4499
4500 if (regexec_result == REG_NOMATCH)
4501 break;
4502
4503 if (regexec_result != REG_OKAY)
4504 {
4505 char errMsg[100];
4506
4507 CHECK_FOR_INTERRUPTS();
4508 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4509 ereport(ERROR,
4510 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4511 errmsg("regular expression failed: %s", errMsg)));
4512 }
4513
4514 /*
4515 * Copy the text to the left of the match position. Note we are given
4516 * character not byte indexes.
4517 */
4518 if (pmatch[0].rm_so - data_pos > 0)
4519 {
4520 int chunk_len;
4521
4522 chunk_len = charlen_to_bytelen(start_ptr,
4523 pmatch[0].rm_so - data_pos);
4524 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4525
4526 /*
4527 * Advance start_ptr over that text, to avoid multiple rescans of
4528 * it if the replace_text contains multiple back-references.
4529 */
4530 start_ptr += chunk_len;
4531 data_pos = pmatch[0].rm_so;
4532 }
4533
4534 /*
4535 * Copy the replace_text. Process back references when the
4536 * replace_text has escape characters.
4537 */
4538 if (have_escape)
4539 appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4540 start_ptr, data_pos);
4541 else
4542 appendStringInfoText(&buf, replace_text);
4543
4544 /* Advance start_ptr and data_pos over the matched text. */
4545 start_ptr += charlen_to_bytelen(start_ptr,
4546 pmatch[0].rm_eo - data_pos);
4547 data_pos = pmatch[0].rm_eo;
4548
4549 /*
4550 * When global option is off, replace the first instance only.
4551 */
4552 if (!glob)
4553 break;
4554
4555 /*
4556 * Advance search position. Normally we start the next search at the
4557 * end of the previous match; but if the match was of zero length, we
4558 * have to advance by one character, or we'd just find the same match
4559 * again.
4560 */
4561 search_start = data_pos;
4562 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4563 search_start++;
4564 }
4565
4566 /*
4567 * Copy the text to the right of the last match.
4568 */
4569 if (data_pos < data_len)
4570 {
4571 int chunk_len;
4572
4573 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4574 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4575 }
4576
4577 ret_text = cstring_to_text_with_len(buf.data, buf.len);
4578 pfree(buf.data);
4579 pfree(data);
4580
4581 return ret_text;
4582 }
4583
4584 /*
4585 * split_text
4586 * parse input string
4587 * return ord item (1 based)
4588 * based on provided field separator
4589 */
4590 Datum
split_text(PG_FUNCTION_ARGS)4591 split_text(PG_FUNCTION_ARGS)
4592 {
4593 text *inputstring = PG_GETARG_TEXT_PP(0);
4594 text *fldsep = PG_GETARG_TEXT_PP(1);
4595 int fldnum = PG_GETARG_INT32(2);
4596 int inputstring_len;
4597 int fldsep_len;
4598 TextPositionState state;
4599 char *start_ptr;
4600 char *end_ptr;
4601 text *result_text;
4602 bool found;
4603
4604 /* field number is 1 based */
4605 if (fldnum < 1)
4606 ereport(ERROR,
4607 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4608 errmsg("field position must be greater than zero")));
4609
4610 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4611 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4612
4613 /* return empty string for empty input string */
4614 if (inputstring_len < 1)
4615 PG_RETURN_TEXT_P(cstring_to_text(""));
4616
4617 /* empty field separator */
4618 if (fldsep_len < 1)
4619 {
4620 text_position_cleanup(&state);
4621 /* if first field, return input string, else empty string */
4622 if (fldnum == 1)
4623 PG_RETURN_TEXT_P(inputstring);
4624 else
4625 PG_RETURN_TEXT_P(cstring_to_text(""));
4626 }
4627
4628 text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4629
4630 /* identify bounds of first field */
4631 start_ptr = VARDATA_ANY(inputstring);
4632 found = text_position_next(&state);
4633
4634 /* special case if fldsep not found at all */
4635 if (!found)
4636 {
4637 text_position_cleanup(&state);
4638 /* if field 1 requested, return input string, else empty string */
4639 if (fldnum == 1)
4640 PG_RETURN_TEXT_P(inputstring);
4641 else
4642 PG_RETURN_TEXT_P(cstring_to_text(""));
4643 }
4644 end_ptr = text_position_get_match_ptr(&state);
4645
4646 while (found && --fldnum > 0)
4647 {
4648 /* identify bounds of next field */
4649 start_ptr = end_ptr + fldsep_len;
4650 found = text_position_next(&state);
4651 if (found)
4652 end_ptr = text_position_get_match_ptr(&state);
4653 }
4654
4655 text_position_cleanup(&state);
4656
4657 if (fldnum > 0)
4658 {
4659 /* N'th field separator not found */
4660 /* if last field requested, return it, else empty string */
4661 if (fldnum == 1)
4662 {
4663 int last_len = start_ptr - VARDATA_ANY(inputstring);
4664
4665 result_text = cstring_to_text_with_len(start_ptr,
4666 inputstring_len - last_len);
4667 }
4668 else
4669 result_text = cstring_to_text("");
4670 }
4671 else
4672 {
4673 /* non-last field requested */
4674 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4675 }
4676
4677 PG_RETURN_TEXT_P(result_text);
4678 }
4679
4680 /*
4681 * Convenience function to return true when two text params are equal.
4682 */
4683 static bool
text_isequal(text * txt1,text * txt2,Oid collid)4684 text_isequal(text *txt1, text *txt2, Oid collid)
4685 {
4686 return DatumGetBool(DirectFunctionCall2Coll(texteq,
4687 collid,
4688 PointerGetDatum(txt1),
4689 PointerGetDatum(txt2)));
4690 }
4691
4692 /*
4693 * text_to_array
4694 * parse input string and return text array of elements,
4695 * based on provided field separator
4696 */
4697 Datum
text_to_array(PG_FUNCTION_ARGS)4698 text_to_array(PG_FUNCTION_ARGS)
4699 {
4700 return text_to_array_internal(fcinfo);
4701 }
4702
4703 /*
4704 * text_to_array_null
4705 * parse input string and return text array of elements,
4706 * based on provided field separator and null string
4707 *
4708 * This is a separate entry point only to prevent the regression tests from
4709 * complaining about different argument sets for the same internal function.
4710 */
4711 Datum
text_to_array_null(PG_FUNCTION_ARGS)4712 text_to_array_null(PG_FUNCTION_ARGS)
4713 {
4714 return text_to_array_internal(fcinfo);
4715 }
4716
4717 /*
4718 * common code for text_to_array and text_to_array_null functions
4719 *
4720 * These are not strict so we have to test for null inputs explicitly.
4721 */
4722 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4723 text_to_array_internal(PG_FUNCTION_ARGS)
4724 {
4725 text *inputstring;
4726 text *fldsep;
4727 text *null_string;
4728 int inputstring_len;
4729 int fldsep_len;
4730 char *start_ptr;
4731 text *result_text;
4732 bool is_null;
4733 ArrayBuildState *astate = NULL;
4734
4735 /* when input string is NULL, then result is NULL too */
4736 if (PG_ARGISNULL(0))
4737 PG_RETURN_NULL();
4738
4739 inputstring = PG_GETARG_TEXT_PP(0);
4740
4741 /* fldsep can be NULL */
4742 if (!PG_ARGISNULL(1))
4743 fldsep = PG_GETARG_TEXT_PP(1);
4744 else
4745 fldsep = NULL;
4746
4747 /* null_string can be NULL or omitted */
4748 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4749 null_string = PG_GETARG_TEXT_PP(2);
4750 else
4751 null_string = NULL;
4752
4753 if (fldsep != NULL)
4754 {
4755 /*
4756 * Normal case with non-null fldsep. Use the text_position machinery
4757 * to search for occurrences of fldsep.
4758 */
4759 TextPositionState state;
4760
4761 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4762 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4763
4764 /* return empty array for empty input string */
4765 if (inputstring_len < 1)
4766 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4767
4768 /*
4769 * empty field separator: return the input string as a one-element
4770 * array
4771 */
4772 if (fldsep_len < 1)
4773 {
4774 Datum elems[1];
4775 bool nulls[1];
4776 int dims[1];
4777 int lbs[1];
4778
4779 /* single element can be a NULL too */
4780 is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
4781
4782 elems[0] = PointerGetDatum(inputstring);
4783 nulls[0] = is_null;
4784 dims[0] = 1;
4785 lbs[0] = 1;
4786 /* XXX: this hardcodes assumptions about the text type */
4787 PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
4788 1, dims, lbs,
4789 TEXTOID, -1, false, TYPALIGN_INT));
4790 }
4791
4792 text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4793
4794 start_ptr = VARDATA_ANY(inputstring);
4795
4796 for (;;)
4797 {
4798 bool found;
4799 char *end_ptr;
4800 int chunk_len;
4801
4802 CHECK_FOR_INTERRUPTS();
4803
4804 found = text_position_next(&state);
4805 if (!found)
4806 {
4807 /* fetch last field */
4808 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4809 end_ptr = NULL; /* not used, but some compilers complain */
4810 }
4811 else
4812 {
4813 /* fetch non-last field */
4814 end_ptr = text_position_get_match_ptr(&state);
4815 chunk_len = end_ptr - start_ptr;
4816 }
4817
4818 /* must build a temp text datum to pass to accumArrayResult */
4819 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4820 is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4821
4822 /* stash away this field */
4823 astate = accumArrayResult(astate,
4824 PointerGetDatum(result_text),
4825 is_null,
4826 TEXTOID,
4827 CurrentMemoryContext);
4828
4829 pfree(result_text);
4830
4831 if (!found)
4832 break;
4833
4834 start_ptr = end_ptr + fldsep_len;
4835 }
4836
4837 text_position_cleanup(&state);
4838 }
4839 else
4840 {
4841 /*
4842 * When fldsep is NULL, each character in the inputstring becomes an
4843 * element in the result array. The separator is effectively the
4844 * space between characters.
4845 */
4846 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4847
4848 /* return empty array for empty input string */
4849 if (inputstring_len < 1)
4850 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4851
4852 start_ptr = VARDATA_ANY(inputstring);
4853
4854 while (inputstring_len > 0)
4855 {
4856 int chunk_len = pg_mblen(start_ptr);
4857
4858 CHECK_FOR_INTERRUPTS();
4859
4860 /* must build a temp text datum to pass to accumArrayResult */
4861 result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4862 is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4863
4864 /* stash away this field */
4865 astate = accumArrayResult(astate,
4866 PointerGetDatum(result_text),
4867 is_null,
4868 TEXTOID,
4869 CurrentMemoryContext);
4870
4871 pfree(result_text);
4872
4873 start_ptr += chunk_len;
4874 inputstring_len -= chunk_len;
4875 }
4876 }
4877
4878 PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4879 CurrentMemoryContext));
4880 }
4881
4882 /*
4883 * array_to_text
4884 * concatenate Cstring representation of input array elements
4885 * using provided field separator
4886 */
4887 Datum
array_to_text(PG_FUNCTION_ARGS)4888 array_to_text(PG_FUNCTION_ARGS)
4889 {
4890 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4891 char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4892
4893 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4894 }
4895
4896 /*
4897 * array_to_text_null
4898 * concatenate Cstring representation of input array elements
4899 * using provided field separator and null string
4900 *
4901 * This version is not strict so we have to test for null inputs explicitly.
4902 */
4903 Datum
array_to_text_null(PG_FUNCTION_ARGS)4904 array_to_text_null(PG_FUNCTION_ARGS)
4905 {
4906 ArrayType *v;
4907 char *fldsep;
4908 char *null_string;
4909
4910 /* returns NULL when first or second parameter is NULL */
4911 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4912 PG_RETURN_NULL();
4913
4914 v = PG_GETARG_ARRAYTYPE_P(0);
4915 fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4916
4917 /* NULL null string is passed through as a null pointer */
4918 if (!PG_ARGISNULL(2))
4919 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4920 else
4921 null_string = NULL;
4922
4923 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4924 }
4925
4926 /*
4927 * common code for array_to_text and array_to_text_null functions
4928 */
4929 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4930 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4931 const char *fldsep, const char *null_string)
4932 {
4933 text *result;
4934 int nitems,
4935 *dims,
4936 ndims;
4937 Oid element_type;
4938 int typlen;
4939 bool typbyval;
4940 char typalign;
4941 StringInfoData buf;
4942 bool printed = false;
4943 char *p;
4944 bits8 *bitmap;
4945 int bitmask;
4946 int i;
4947 ArrayMetaState *my_extra;
4948
4949 ndims = ARR_NDIM(v);
4950 dims = ARR_DIMS(v);
4951 nitems = ArrayGetNItems(ndims, dims);
4952
4953 /* if there are no elements, return an empty string */
4954 if (nitems == 0)
4955 return cstring_to_text_with_len("", 0);
4956
4957 element_type = ARR_ELEMTYPE(v);
4958 initStringInfo(&buf);
4959
4960 /*
4961 * We arrange to look up info about element type, including its output
4962 * conversion proc, only once per series of calls, assuming the element
4963 * type doesn't change underneath us.
4964 */
4965 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4966 if (my_extra == NULL)
4967 {
4968 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4969 sizeof(ArrayMetaState));
4970 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4971 my_extra->element_type = ~element_type;
4972 }
4973
4974 if (my_extra->element_type != element_type)
4975 {
4976 /*
4977 * Get info about element type, including its output conversion proc
4978 */
4979 get_type_io_data(element_type, IOFunc_output,
4980 &my_extra->typlen, &my_extra->typbyval,
4981 &my_extra->typalign, &my_extra->typdelim,
4982 &my_extra->typioparam, &my_extra->typiofunc);
4983 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4984 fcinfo->flinfo->fn_mcxt);
4985 my_extra->element_type = element_type;
4986 }
4987 typlen = my_extra->typlen;
4988 typbyval = my_extra->typbyval;
4989 typalign = my_extra->typalign;
4990
4991 p = ARR_DATA_PTR(v);
4992 bitmap = ARR_NULLBITMAP(v);
4993 bitmask = 1;
4994
4995 for (i = 0; i < nitems; i++)
4996 {
4997 Datum itemvalue;
4998 char *value;
4999
5000 /* Get source element, checking for NULL */
5001 if (bitmap && (*bitmap & bitmask) == 0)
5002 {
5003 /* if null_string is NULL, we just ignore null elements */
5004 if (null_string != NULL)
5005 {
5006 if (printed)
5007 appendStringInfo(&buf, "%s%s", fldsep, null_string);
5008 else
5009 appendStringInfoString(&buf, null_string);
5010 printed = true;
5011 }
5012 }
5013 else
5014 {
5015 itemvalue = fetch_att(p, typbyval, typlen);
5016
5017 value = OutputFunctionCall(&my_extra->proc, itemvalue);
5018
5019 if (printed)
5020 appendStringInfo(&buf, "%s%s", fldsep, value);
5021 else
5022 appendStringInfoString(&buf, value);
5023 printed = true;
5024
5025 p = att_addlength_pointer(p, typlen, p);
5026 p = (char *) att_align_nominal(p, typalign);
5027 }
5028
5029 /* advance bitmap pointer if any */
5030 if (bitmap)
5031 {
5032 bitmask <<= 1;
5033 if (bitmask == 0x100)
5034 {
5035 bitmap++;
5036 bitmask = 1;
5037 }
5038 }
5039 }
5040
5041 result = cstring_to_text_with_len(buf.data, buf.len);
5042 pfree(buf.data);
5043
5044 return result;
5045 }
5046
5047 #define HEXBASE 16
5048 /*
5049 * Convert an int32 to a string containing a base 16 (hex) representation of
5050 * the number.
5051 */
5052 Datum
to_hex32(PG_FUNCTION_ARGS)5053 to_hex32(PG_FUNCTION_ARGS)
5054 {
5055 uint32 value = (uint32) PG_GETARG_INT32(0);
5056 char *ptr;
5057 const char *digits = "0123456789abcdef";
5058 char buf[32]; /* bigger than needed, but reasonable */
5059
5060 ptr = buf + sizeof(buf) - 1;
5061 *ptr = '\0';
5062
5063 do
5064 {
5065 *--ptr = digits[value % HEXBASE];
5066 value /= HEXBASE;
5067 } while (ptr > buf && value);
5068
5069 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5070 }
5071
5072 /*
5073 * Convert an int64 to a string containing a base 16 (hex) representation of
5074 * the number.
5075 */
5076 Datum
to_hex64(PG_FUNCTION_ARGS)5077 to_hex64(PG_FUNCTION_ARGS)
5078 {
5079 uint64 value = (uint64) PG_GETARG_INT64(0);
5080 char *ptr;
5081 const char *digits = "0123456789abcdef";
5082 char buf[32]; /* bigger than needed, but reasonable */
5083
5084 ptr = buf + sizeof(buf) - 1;
5085 *ptr = '\0';
5086
5087 do
5088 {
5089 *--ptr = digits[value % HEXBASE];
5090 value /= HEXBASE;
5091 } while (ptr > buf && value);
5092
5093 PG_RETURN_TEXT_P(cstring_to_text(ptr));
5094 }
5095
5096 /*
5097 * Return the size of a datum, possibly compressed
5098 *
5099 * Works on any data type
5100 */
5101 Datum
pg_column_size(PG_FUNCTION_ARGS)5102 pg_column_size(PG_FUNCTION_ARGS)
5103 {
5104 Datum value = PG_GETARG_DATUM(0);
5105 int32 result;
5106 int typlen;
5107
5108 /* On first call, get the input type's typlen, and save at *fn_extra */
5109 if (fcinfo->flinfo->fn_extra == NULL)
5110 {
5111 /* Lookup the datatype of the supplied argument */
5112 Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5113
5114 typlen = get_typlen(argtypeid);
5115 if (typlen == 0) /* should not happen */
5116 elog(ERROR, "cache lookup failed for type %u", argtypeid);
5117
5118 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5119 sizeof(int));
5120 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5121 }
5122 else
5123 typlen = *((int *) fcinfo->flinfo->fn_extra);
5124
5125 if (typlen == -1)
5126 {
5127 /* varlena type, possibly toasted */
5128 result = toast_datum_size(value);
5129 }
5130 else if (typlen == -2)
5131 {
5132 /* cstring */
5133 result = strlen(DatumGetCString(value)) + 1;
5134 }
5135 else
5136 {
5137 /* ordinary fixed-width type */
5138 result = typlen;
5139 }
5140
5141 PG_RETURN_INT32(result);
5142 }
5143
5144 /*
5145 * string_agg - Concatenates values and returns string.
5146 *
5147 * Syntax: string_agg(value text, delimiter text) RETURNS text
5148 *
5149 * Note: Any NULL values are ignored. The first-call delimiter isn't
5150 * actually used at all, and on subsequent calls the delimiter precedes
5151 * the associated value.
5152 */
5153
5154 /* subroutine to initialize state */
5155 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)5156 makeStringAggState(FunctionCallInfo fcinfo)
5157 {
5158 StringInfo state;
5159 MemoryContext aggcontext;
5160 MemoryContext oldcontext;
5161
5162 if (!AggCheckCallContext(fcinfo, &aggcontext))
5163 {
5164 /* cannot be called directly because of internal-type argument */
5165 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5166 }
5167
5168 /*
5169 * Create state in aggregate context. It'll stay there across subsequent
5170 * calls.
5171 */
5172 oldcontext = MemoryContextSwitchTo(aggcontext);
5173 state = makeStringInfo();
5174 MemoryContextSwitchTo(oldcontext);
5175
5176 return state;
5177 }
5178
5179 Datum
string_agg_transfn(PG_FUNCTION_ARGS)5180 string_agg_transfn(PG_FUNCTION_ARGS)
5181 {
5182 StringInfo state;
5183
5184 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5185
5186 /* Append the value unless null. */
5187 if (!PG_ARGISNULL(1))
5188 {
5189 /* On the first time through, we ignore the delimiter. */
5190 if (state == NULL)
5191 state = makeStringAggState(fcinfo);
5192 else if (!PG_ARGISNULL(2))
5193 appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
5194
5195 appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
5196 }
5197
5198 /*
5199 * The transition type for string_agg() is declared to be "internal",
5200 * which is a pass-by-value type the same size as a pointer.
5201 */
5202 PG_RETURN_POINTER(state);
5203 }
5204
5205 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)5206 string_agg_finalfn(PG_FUNCTION_ARGS)
5207 {
5208 StringInfo state;
5209
5210 /* cannot be called directly because of internal-type argument */
5211 Assert(AggCheckCallContext(fcinfo, NULL));
5212
5213 state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5214
5215 if (state != NULL)
5216 PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5217 else
5218 PG_RETURN_NULL();
5219 }
5220
5221 /*
5222 * Prepare cache with fmgr info for the output functions of the datatypes of
5223 * the arguments of a concat-like function, beginning with argument "argidx".
5224 * (Arguments before that will have corresponding slots in the resulting
5225 * FmgrInfo array, but we don't fill those slots.)
5226 */
5227 static FmgrInfo *
build_concat_foutcache(FunctionCallInfo fcinfo,int argidx)5228 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5229 {
5230 FmgrInfo *foutcache;
5231 int i;
5232
5233 /* We keep the info in fn_mcxt so it survives across calls */
5234 foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5235 PG_NARGS() * sizeof(FmgrInfo));
5236
5237 for (i = argidx; i < PG_NARGS(); i++)
5238 {
5239 Oid valtype;
5240 Oid typOutput;
5241 bool typIsVarlena;
5242
5243 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5244 if (!OidIsValid(valtype))
5245 elog(ERROR, "could not determine data type of concat() input");
5246
5247 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5248 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5249 }
5250
5251 fcinfo->flinfo->fn_extra = foutcache;
5252
5253 return foutcache;
5254 }
5255
5256 /*
5257 * Implementation of both concat() and concat_ws().
5258 *
5259 * sepstr is the separator string to place between values.
5260 * argidx identifies the first argument to concatenate (counting from zero);
5261 * note that this must be constant across any one series of calls.
5262 *
5263 * Returns NULL if result should be NULL, else text value.
5264 */
5265 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)5266 concat_internal(const char *sepstr, int argidx,
5267 FunctionCallInfo fcinfo)
5268 {
5269 text *result;
5270 StringInfoData str;
5271 FmgrInfo *foutcache;
5272 bool first_arg = true;
5273 int i;
5274
5275 /*
5276 * concat(VARIADIC some-array) is essentially equivalent to
5277 * array_to_text(), ie concat the array elements with the given separator.
5278 * So we just pass the case off to that code.
5279 */
5280 if (get_fn_expr_variadic(fcinfo->flinfo))
5281 {
5282 ArrayType *arr;
5283
5284 /* Should have just the one argument */
5285 Assert(argidx == PG_NARGS() - 1);
5286
5287 /* concat(VARIADIC NULL) is defined as NULL */
5288 if (PG_ARGISNULL(argidx))
5289 return NULL;
5290
5291 /*
5292 * Non-null argument had better be an array. We assume that any call
5293 * context that could let get_fn_expr_variadic return true will have
5294 * checked that a VARIADIC-labeled parameter actually is an array. So
5295 * it should be okay to just Assert that it's an array rather than
5296 * doing a full-fledged error check.
5297 */
5298 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5299
5300 /* OK, safe to fetch the array value */
5301 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5302
5303 /*
5304 * And serialize the array. We tell array_to_text to ignore null
5305 * elements, which matches the behavior of the loop below.
5306 */
5307 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5308 }
5309
5310 /* Normal case without explicit VARIADIC marker */
5311 initStringInfo(&str);
5312
5313 /* Get output function info, building it if first time through */
5314 foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5315 if (foutcache == NULL)
5316 foutcache = build_concat_foutcache(fcinfo, argidx);
5317
5318 for (i = argidx; i < PG_NARGS(); i++)
5319 {
5320 if (!PG_ARGISNULL(i))
5321 {
5322 Datum value = PG_GETARG_DATUM(i);
5323
5324 /* add separator if appropriate */
5325 if (first_arg)
5326 first_arg = false;
5327 else
5328 appendStringInfoString(&str, sepstr);
5329
5330 /* call the appropriate type output function, append the result */
5331 appendStringInfoString(&str,
5332 OutputFunctionCall(&foutcache[i], value));
5333 }
5334 }
5335
5336 result = cstring_to_text_with_len(str.data, str.len);
5337 pfree(str.data);
5338
5339 return result;
5340 }
5341
5342 /*
5343 * Concatenate all arguments. NULL arguments are ignored.
5344 */
5345 Datum
text_concat(PG_FUNCTION_ARGS)5346 text_concat(PG_FUNCTION_ARGS)
5347 {
5348 text *result;
5349
5350 result = concat_internal("", 0, fcinfo);
5351 if (result == NULL)
5352 PG_RETURN_NULL();
5353 PG_RETURN_TEXT_P(result);
5354 }
5355
5356 /*
5357 * Concatenate all but first argument value with separators. The first
5358 * parameter is used as the separator. NULL arguments are ignored.
5359 */
5360 Datum
text_concat_ws(PG_FUNCTION_ARGS)5361 text_concat_ws(PG_FUNCTION_ARGS)
5362 {
5363 char *sep;
5364 text *result;
5365
5366 /* return NULL when separator is NULL */
5367 if (PG_ARGISNULL(0))
5368 PG_RETURN_NULL();
5369 sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5370
5371 result = concat_internal(sep, 1, fcinfo);
5372 if (result == NULL)
5373 PG_RETURN_NULL();
5374 PG_RETURN_TEXT_P(result);
5375 }
5376
5377 /*
5378 * Return first n characters in the string. When n is negative,
5379 * return all but last |n| characters.
5380 */
5381 Datum
text_left(PG_FUNCTION_ARGS)5382 text_left(PG_FUNCTION_ARGS)
5383 {
5384 int n = PG_GETARG_INT32(1);
5385
5386 if (n < 0)
5387 {
5388 text *str = PG_GETARG_TEXT_PP(0);
5389 const char *p = VARDATA_ANY(str);
5390 int len = VARSIZE_ANY_EXHDR(str);
5391 int rlen;
5392
5393 n = pg_mbstrlen_with_len(p, len) + n;
5394 rlen = pg_mbcharcliplen(p, len, n);
5395 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5396 }
5397 else
5398 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5399 }
5400
5401 /*
5402 * Return last n characters in the string. When n is negative,
5403 * return all but first |n| characters.
5404 */
5405 Datum
text_right(PG_FUNCTION_ARGS)5406 text_right(PG_FUNCTION_ARGS)
5407 {
5408 text *str = PG_GETARG_TEXT_PP(0);
5409 const char *p = VARDATA_ANY(str);
5410 int len = VARSIZE_ANY_EXHDR(str);
5411 int n = PG_GETARG_INT32(1);
5412 int off;
5413
5414 if (n < 0)
5415 n = -n;
5416 else
5417 n = pg_mbstrlen_with_len(p, len) - n;
5418 off = pg_mbcharcliplen(p, len, n);
5419
5420 PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5421 }
5422
5423 /*
5424 * Return reversed string
5425 */
5426 Datum
text_reverse(PG_FUNCTION_ARGS)5427 text_reverse(PG_FUNCTION_ARGS)
5428 {
5429 text *str = PG_GETARG_TEXT_PP(0);
5430 const char *p = VARDATA_ANY(str);
5431 int len = VARSIZE_ANY_EXHDR(str);
5432 const char *endp = p + len;
5433 text *result;
5434 char *dst;
5435
5436 result = palloc(len + VARHDRSZ);
5437 dst = (char *) VARDATA(result) + len;
5438 SET_VARSIZE(result, len + VARHDRSZ);
5439
5440 if (pg_database_encoding_max_length() > 1)
5441 {
5442 /* multibyte version */
5443 while (p < endp)
5444 {
5445 int sz;
5446
5447 sz = pg_mblen(p);
5448 dst -= sz;
5449 memcpy(dst, p, sz);
5450 p += sz;
5451 }
5452 }
5453 else
5454 {
5455 /* single byte version */
5456 while (p < endp)
5457 *(--dst) = *p++;
5458 }
5459
5460 PG_RETURN_TEXT_P(result);
5461 }
5462
5463
/*
 * Support macros for text_format()
 */

/* Bit set in a specifier's "flags" when a '-' flag was seen in the format */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Advance "ptr" by one byte while parsing a format specifier.  If that
 * brings it to (or past) "end_ptr", the specifier was cut off by the end of
 * the format string, so raise an error.  "ptr" is evaluated exactly once and
 * must be a modifiable lvalue.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5477
5478 /*
5479 * Returns a formatted string
5480 */
5481 Datum
text_format(PG_FUNCTION_ARGS)5482 text_format(PG_FUNCTION_ARGS)
5483 {
5484 text *fmt;
5485 StringInfoData str;
5486 const char *cp;
5487 const char *start_ptr;
5488 const char *end_ptr;
5489 text *result;
5490 int arg;
5491 bool funcvariadic;
5492 int nargs;
5493 Datum *elements = NULL;
5494 bool *nulls = NULL;
5495 Oid element_type = InvalidOid;
5496 Oid prev_type = InvalidOid;
5497 Oid prev_width_type = InvalidOid;
5498 FmgrInfo typoutputfinfo;
5499 FmgrInfo typoutputinfo_width;
5500
5501 /* When format string is null, immediately return null */
5502 if (PG_ARGISNULL(0))
5503 PG_RETURN_NULL();
5504
5505 /* If argument is marked VARIADIC, expand array into elements */
5506 if (get_fn_expr_variadic(fcinfo->flinfo))
5507 {
5508 ArrayType *arr;
5509 int16 elmlen;
5510 bool elmbyval;
5511 char elmalign;
5512 int nitems;
5513
5514 /* Should have just the one argument */
5515 Assert(PG_NARGS() == 2);
5516
5517 /* If argument is NULL, we treat it as zero-length array */
5518 if (PG_ARGISNULL(1))
5519 nitems = 0;
5520 else
5521 {
5522 /*
5523 * Non-null argument had better be an array. We assume that any
5524 * call context that could let get_fn_expr_variadic return true
5525 * will have checked that a VARIADIC-labeled parameter actually is
5526 * an array. So it should be okay to just Assert that it's an
5527 * array rather than doing a full-fledged error check.
5528 */
5529 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5530
5531 /* OK, safe to fetch the array value */
5532 arr = PG_GETARG_ARRAYTYPE_P(1);
5533
5534 /* Get info about array element type */
5535 element_type = ARR_ELEMTYPE(arr);
5536 get_typlenbyvalalign(element_type,
5537 &elmlen, &elmbyval, &elmalign);
5538
5539 /* Extract all array elements */
5540 deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5541 &elements, &nulls, &nitems);
5542 }
5543
5544 nargs = nitems + 1;
5545 funcvariadic = true;
5546 }
5547 else
5548 {
5549 /* Non-variadic case, we'll process the arguments individually */
5550 nargs = PG_NARGS();
5551 funcvariadic = false;
5552 }
5553
5554 /* Setup for main loop. */
5555 fmt = PG_GETARG_TEXT_PP(0);
5556 start_ptr = VARDATA_ANY(fmt);
5557 end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5558 initStringInfo(&str);
5559 arg = 1; /* next argument position to print */
5560
5561 /* Scan format string, looking for conversion specifiers. */
5562 for (cp = start_ptr; cp < end_ptr; cp++)
5563 {
5564 int argpos;
5565 int widthpos;
5566 int flags;
5567 int width;
5568 Datum value;
5569 bool isNull;
5570 Oid typid;
5571
5572 /*
5573 * If it's not the start of a conversion specifier, just copy it to
5574 * the output buffer.
5575 */
5576 if (*cp != '%')
5577 {
5578 appendStringInfoCharMacro(&str, *cp);
5579 continue;
5580 }
5581
5582 ADVANCE_PARSE_POINTER(cp, end_ptr);
5583
5584 /* Easy case: %% outputs a single % */
5585 if (*cp == '%')
5586 {
5587 appendStringInfoCharMacro(&str, *cp);
5588 continue;
5589 }
5590
5591 /* Parse the optional portions of the format specifier */
5592 cp = text_format_parse_format(cp, end_ptr,
5593 &argpos, &widthpos,
5594 &flags, &width);
5595
5596 /*
5597 * Next we should see the main conversion specifier. Whether or not
5598 * an argument position was present, it's known that at least one
5599 * character remains in the string at this point. Experience suggests
5600 * that it's worth checking that that character is one of the expected
5601 * ones before we try to fetch arguments, so as to produce the least
5602 * confusing response to a mis-formatted specifier.
5603 */
5604 if (strchr("sIL", *cp) == NULL)
5605 ereport(ERROR,
5606 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5607 errmsg("unrecognized format() type specifier \"%c\"",
5608 *cp),
5609 errhint("For a single \"%%\" use \"%%%%\".")));
5610
5611 /* If indirect width was specified, get its value */
5612 if (widthpos >= 0)
5613 {
5614 /* Collect the specified or next argument position */
5615 if (widthpos > 0)
5616 arg = widthpos;
5617 if (arg >= nargs)
5618 ereport(ERROR,
5619 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5620 errmsg("too few arguments for format()")));
5621
5622 /* Get the value and type of the selected argument */
5623 if (!funcvariadic)
5624 {
5625 value = PG_GETARG_DATUM(arg);
5626 isNull = PG_ARGISNULL(arg);
5627 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5628 }
5629 else
5630 {
5631 value = elements[arg - 1];
5632 isNull = nulls[arg - 1];
5633 typid = element_type;
5634 }
5635 if (!OidIsValid(typid))
5636 elog(ERROR, "could not determine data type of format() input");
5637
5638 arg++;
5639
5640 /* We can treat NULL width the same as zero */
5641 if (isNull)
5642 width = 0;
5643 else if (typid == INT4OID)
5644 width = DatumGetInt32(value);
5645 else if (typid == INT2OID)
5646 width = DatumGetInt16(value);
5647 else
5648 {
5649 /* For less-usual datatypes, convert to text then to int */
5650 char *str;
5651
5652 if (typid != prev_width_type)
5653 {
5654 Oid typoutputfunc;
5655 bool typIsVarlena;
5656
5657 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5658 fmgr_info(typoutputfunc, &typoutputinfo_width);
5659 prev_width_type = typid;
5660 }
5661
5662 str = OutputFunctionCall(&typoutputinfo_width, value);
5663
5664 /* pg_strtoint32 will complain about bad data or overflow */
5665 width = pg_strtoint32(str);
5666
5667 pfree(str);
5668 }
5669 }
5670
5671 /* Collect the specified or next argument position */
5672 if (argpos > 0)
5673 arg = argpos;
5674 if (arg >= nargs)
5675 ereport(ERROR,
5676 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5677 errmsg("too few arguments for format()")));
5678
5679 /* Get the value and type of the selected argument */
5680 if (!funcvariadic)
5681 {
5682 value = PG_GETARG_DATUM(arg);
5683 isNull = PG_ARGISNULL(arg);
5684 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5685 }
5686 else
5687 {
5688 value = elements[arg - 1];
5689 isNull = nulls[arg - 1];
5690 typid = element_type;
5691 }
5692 if (!OidIsValid(typid))
5693 elog(ERROR, "could not determine data type of format() input");
5694
5695 arg++;
5696
5697 /*
5698 * Get the appropriate typOutput function, reusing previous one if
5699 * same type as previous argument. That's particularly useful in the
5700 * variadic-array case, but often saves work even for ordinary calls.
5701 */
5702 if (typid != prev_type)
5703 {
5704 Oid typoutputfunc;
5705 bool typIsVarlena;
5706
5707 getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5708 fmgr_info(typoutputfunc, &typoutputfinfo);
5709 prev_type = typid;
5710 }
5711
5712 /*
5713 * And now we can format the value.
5714 */
5715 switch (*cp)
5716 {
5717 case 's':
5718 case 'I':
5719 case 'L':
5720 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5721 value, isNull,
5722 flags, width);
5723 break;
5724 default:
5725 /* should not get here, because of previous check */
5726 ereport(ERROR,
5727 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5728 errmsg("unrecognized format() type specifier \"%c\"",
5729 *cp),
5730 errhint("For a single \"%%\" use \"%%%%\".")));
5731 break;
5732 }
5733 }
5734
5735 /* Don't need deconstruct_array results anymore. */
5736 if (elements != NULL)
5737 pfree(elements);
5738 if (nulls != NULL)
5739 pfree(nulls);
5740
5741 /* Generate results. */
5742 result = cstring_to_text_with_len(str.data, str.len);
5743 pfree(str.data);
5744
5745 PG_RETURN_TEXT_P(result);
5746 }
5747
5748 /*
5749 * Parse contiguous digits as a decimal number.
5750 *
5751 * Returns true if some digits could be parsed.
5752 * The value is returned into *value, and *ptr is advanced to the next
5753 * character to be parsed.
5754 *
5755 * Note parsing invariant: at least one character is known available before
5756 * string end (end_ptr) at entry, and this is still true at exit.
5757 */
5758 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5759 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5760 {
5761 bool found = false;
5762 const char *cp = *ptr;
5763 int val = 0;
5764
5765 while (*cp >= '0' && *cp <= '9')
5766 {
5767 int8 digit = (*cp - '0');
5768
5769 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5770 unlikely(pg_add_s32_overflow(val, digit, &val)))
5771 ereport(ERROR,
5772 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5773 errmsg("number is out of range")));
5774 ADVANCE_PARSE_POINTER(cp, end_ptr);
5775 found = true;
5776 }
5777
5778 *ptr = cp;
5779 *value = val;
5780
5781 return found;
5782 }
5783
5784 /*
5785 * Parse a format specifier (generally following the SUS printf spec).
5786 *
5787 * We have already advanced over the initial '%', and we are looking for
5788 * [argpos][flags][width]type (but the type character is not consumed here).
5789 *
5790 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5791 * Output parameters:
5792 * argpos: argument position for value to be printed. -1 means unspecified.
5793 * widthpos: argument position for width. Zero means the argument position
5794 * was unspecified (ie, take the next arg) and -1 means no width
5795 * argument (width was omitted or specified as a constant).
5796 * flags: bitmask of flags.
5797 * width: directly-specified width value. Zero means the width was omitted
5798 * (note it's not necessary to distinguish this case from an explicit
5799 * zero width value).
5800 *
5801 * The function result is the next character position to be parsed, ie, the
5802 * location where the type character is/should be.
5803 *
5804 * Note parsing invariant: at least one character is known available before
5805 * string end (end_ptr) at entry, and this is still true at exit.
5806 */
5807 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5808 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5809 int *argpos, int *widthpos,
5810 int *flags, int *width)
5811 {
5812 const char *cp = start_ptr;
5813 int n;
5814
5815 /* set defaults for output parameters */
5816 *argpos = -1;
5817 *widthpos = -1;
5818 *flags = 0;
5819 *width = 0;
5820
5821 /* try to identify first number */
5822 if (text_format_parse_digits(&cp, end_ptr, &n))
5823 {
5824 if (*cp != '$')
5825 {
5826 /* Must be just a width and a type, so we're done */
5827 *width = n;
5828 return cp;
5829 }
5830 /* The number was argument position */
5831 *argpos = n;
5832 /* Explicit 0 for argument index is immediately refused */
5833 if (n == 0)
5834 ereport(ERROR,
5835 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5836 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5837 ADVANCE_PARSE_POINTER(cp, end_ptr);
5838 }
5839
5840 /* Handle flags (only minus is supported now) */
5841 while (*cp == '-')
5842 {
5843 *flags |= TEXT_FORMAT_FLAG_MINUS;
5844 ADVANCE_PARSE_POINTER(cp, end_ptr);
5845 }
5846
5847 if (*cp == '*')
5848 {
5849 /* Handle indirect width */
5850 ADVANCE_PARSE_POINTER(cp, end_ptr);
5851 if (text_format_parse_digits(&cp, end_ptr, &n))
5852 {
5853 /* number in this position must be closed by $ */
5854 if (*cp != '$')
5855 ereport(ERROR,
5856 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5857 errmsg("width argument position must be ended by \"$\"")));
5858 /* The number was width argument position */
5859 *widthpos = n;
5860 /* Explicit 0 for argument index is immediately refused */
5861 if (n == 0)
5862 ereport(ERROR,
5863 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5864 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5865 ADVANCE_PARSE_POINTER(cp, end_ptr);
5866 }
5867 else
5868 *widthpos = 0; /* width's argument position is unspecified */
5869 }
5870 else
5871 {
5872 /* Check for direct width specification */
5873 if (text_format_parse_digits(&cp, end_ptr, &n))
5874 *width = n;
5875 }
5876
5877 /* cp should now be pointing at type character */
5878 return cp;
5879 }
5880
5881 /*
5882 * Format a %s, %I, or %L conversion
5883 */
5884 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5885 text_format_string_conversion(StringInfo buf, char conversion,
5886 FmgrInfo *typOutputInfo,
5887 Datum value, bool isNull,
5888 int flags, int width)
5889 {
5890 char *str;
5891
5892 /* Handle NULL arguments before trying to stringify the value. */
5893 if (isNull)
5894 {
5895 if (conversion == 's')
5896 text_format_append_string(buf, "", flags, width);
5897 else if (conversion == 'L')
5898 text_format_append_string(buf, "NULL", flags, width);
5899 else if (conversion == 'I')
5900 ereport(ERROR,
5901 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5902 errmsg("null values cannot be formatted as an SQL identifier")));
5903 return;
5904 }
5905
5906 /* Stringify. */
5907 str = OutputFunctionCall(typOutputInfo, value);
5908
5909 /* Escape. */
5910 if (conversion == 'I')
5911 {
5912 /* quote_identifier may or may not allocate a new string. */
5913 text_format_append_string(buf, quote_identifier(str), flags, width);
5914 }
5915 else if (conversion == 'L')
5916 {
5917 char *qstr = quote_literal_cstr(str);
5918
5919 text_format_append_string(buf, qstr, flags, width);
5920 /* quote_literal_cstr() always allocates a new string */
5921 pfree(qstr);
5922 }
5923 else
5924 text_format_append_string(buf, str, flags, width);
5925
5926 /* Cleanup. */
5927 pfree(str);
5928 }
5929
5930 /*
5931 * Append str to buf, padding as directed by flags/width
5932 */
5933 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5934 text_format_append_string(StringInfo buf, const char *str,
5935 int flags, int width)
5936 {
5937 bool align_to_left = false;
5938 int len;
5939
5940 /* fast path for typical easy case */
5941 if (width == 0)
5942 {
5943 appendStringInfoString(buf, str);
5944 return;
5945 }
5946
5947 if (width < 0)
5948 {
5949 /* Negative width: implicit '-' flag, then take absolute value */
5950 align_to_left = true;
5951 /* -INT_MIN is undefined */
5952 if (width <= INT_MIN)
5953 ereport(ERROR,
5954 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5955 errmsg("number is out of range")));
5956 width = -width;
5957 }
5958 else if (flags & TEXT_FORMAT_FLAG_MINUS)
5959 align_to_left = true;
5960
5961 len = pg_mbstrlen(str);
5962 if (align_to_left)
5963 {
5964 /* left justify */
5965 appendStringInfoString(buf, str);
5966 if (len < width)
5967 appendStringInfoSpaces(buf, width - len);
5968 }
5969 else
5970 {
5971 /* right justify */
5972 if (len < width)
5973 appendStringInfoSpaces(buf, width - len);
5974 appendStringInfoString(buf, str);
5975 }
5976 }
5977
5978 /*
5979 * text_format_nv - nonvariadic wrapper for text_format function.
5980 *
5981 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5982 * which checks that all built-in functions that share the implementing C
5983 * function take the same number of arguments.
5984 */
5985 Datum
text_format_nv(PG_FUNCTION_ARGS)5986 text_format_nv(PG_FUNCTION_ARGS)
5987 {
5988 return text_format(fcinfo);
5989 }
5990
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first len bytes of s1 and s2 are equal, comparing from the last byte
 * backwards.  Faster than memcmp(), for this use case.  A len of zero or
 * less compares as equal.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6006
/*
 * Expand each Levenshtein distance variant.
 *
 * levenshtein.c is textually included twice, the second time with
 * LEVENSHTEIN_LESS_EQUAL defined.  NOTE(review): presumably the included
 * file tests that symbol to generate a second, differently named variant --
 * confirm in levenshtein.c.
 */
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"
6011
6012
6013 /*
6014 * Unicode support
6015 */
6016
6017 static UnicodeNormalizationForm
unicode_norm_form_from_string(const char * formstr)6018 unicode_norm_form_from_string(const char *formstr)
6019 {
6020 UnicodeNormalizationForm form = -1;
6021
6022 /*
6023 * Might as well check this while we're here.
6024 */
6025 if (GetDatabaseEncoding() != PG_UTF8)
6026 ereport(ERROR,
6027 (errcode(ERRCODE_SYNTAX_ERROR),
6028 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6029
6030 if (pg_strcasecmp(formstr, "NFC") == 0)
6031 form = UNICODE_NFC;
6032 else if (pg_strcasecmp(formstr, "NFD") == 0)
6033 form = UNICODE_NFD;
6034 else if (pg_strcasecmp(formstr, "NFKC") == 0)
6035 form = UNICODE_NFKC;
6036 else if (pg_strcasecmp(formstr, "NFKD") == 0)
6037 form = UNICODE_NFKD;
6038 else
6039 ereport(ERROR,
6040 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6041 errmsg("invalid normalization form: %s", formstr)));
6042
6043 return form;
6044 }
6045
6046 Datum
unicode_normalize_func(PG_FUNCTION_ARGS)6047 unicode_normalize_func(PG_FUNCTION_ARGS)
6048 {
6049 text *input = PG_GETARG_TEXT_PP(0);
6050 char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6051 UnicodeNormalizationForm form;
6052 int size;
6053 pg_wchar *input_chars;
6054 pg_wchar *output_chars;
6055 unsigned char *p;
6056 text *result;
6057 int i;
6058
6059 form = unicode_norm_form_from_string(formstr);
6060
6061 /* convert to pg_wchar */
6062 size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6063 input_chars = palloc((size + 1) * sizeof(pg_wchar));
6064 p = (unsigned char *) VARDATA_ANY(input);
6065 for (i = 0; i < size; i++)
6066 {
6067 input_chars[i] = utf8_to_unicode(p);
6068 p += pg_utf_mblen(p);
6069 }
6070 input_chars[i] = (pg_wchar) '\0';
6071 Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6072
6073 /* action */
6074 output_chars = unicode_normalize(form, input_chars);
6075
6076 /* convert back to UTF-8 string */
6077 size = 0;
6078 for (pg_wchar *wp = output_chars; *wp; wp++)
6079 {
6080 unsigned char buf[4];
6081
6082 unicode_to_utf8(*wp, buf);
6083 size += pg_utf_mblen(buf);
6084 }
6085
6086 result = palloc(size + VARHDRSZ);
6087 SET_VARSIZE(result, size + VARHDRSZ);
6088
6089 p = (unsigned char *) VARDATA_ANY(result);
6090 for (pg_wchar *wp = output_chars; *wp; wp++)
6091 {
6092 unicode_to_utf8(*wp, p);
6093 p += pg_utf_mblen(p);
6094 }
6095 Assert((char *) p == (char *) result + size + VARHDRSZ);
6096
6097 PG_RETURN_TEXT_P(result);
6098 }
6099
6100 /*
6101 * Check whether the string is in the specified Unicode normalization form.
6102 *
6103 * This is done by convering the string to the specified normal form and then
6104 * comparing that to the original string. To speed that up, we also apply the
6105 * "quick check" algorithm specified in UAX #15, which can give a yes or no
6106 * answer for many strings by just scanning the string once.
6107 *
6108 * This function should generally be optimized for the case where the string
6109 * is in fact normalized. In that case, we'll end up looking at the entire
6110 * string, so it's probably not worth doing any incremental conversion etc.
6111 */
6112 Datum
unicode_is_normalized(PG_FUNCTION_ARGS)6113 unicode_is_normalized(PG_FUNCTION_ARGS)
6114 {
6115 text *input = PG_GETARG_TEXT_PP(0);
6116 char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6117 UnicodeNormalizationForm form;
6118 int size;
6119 pg_wchar *input_chars;
6120 pg_wchar *output_chars;
6121 unsigned char *p;
6122 int i;
6123 UnicodeNormalizationQC quickcheck;
6124 int output_size;
6125 bool result;
6126
6127 form = unicode_norm_form_from_string(formstr);
6128
6129 /* convert to pg_wchar */
6130 size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6131 input_chars = palloc((size + 1) * sizeof(pg_wchar));
6132 p = (unsigned char *) VARDATA_ANY(input);
6133 for (i = 0; i < size; i++)
6134 {
6135 input_chars[i] = utf8_to_unicode(p);
6136 p += pg_utf_mblen(p);
6137 }
6138 input_chars[i] = (pg_wchar) '\0';
6139 Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6140
6141 /* quick check (see UAX #15) */
6142 quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6143 if (quickcheck == UNICODE_NORM_QC_YES)
6144 PG_RETURN_BOOL(true);
6145 else if (quickcheck == UNICODE_NORM_QC_NO)
6146 PG_RETURN_BOOL(false);
6147
6148 /* normalize and compare with original */
6149 output_chars = unicode_normalize(form, input_chars);
6150
6151 output_size = 0;
6152 for (pg_wchar *wp = output_chars; *wp; wp++)
6153 output_size++;
6154
6155 result = (size == output_size) &&
6156 (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6157
6158 PG_RETURN_BOOL(result);
6159 }
6160