1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  *	  Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
21 #include "access/toast_compression.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_norm.h"
27 #include "lib/hyperloglog.h"
28 #include "libpq/pqformat.h"
29 #include "miscadmin.h"
30 #include "nodes/execnodes.h"
31 #include "parser/scansup.h"
32 #include "port/pg_bswap.h"
33 #include "regex/regex.h"
34 #include "utils/builtins.h"
35 #include "utils/bytea.h"
36 #include "utils/lsyscache.h"
37 #include "utils/memutils.h"
38 #include "utils/pg_locale.h"
39 #include "utils/sortsupport.h"
40 #include "utils/varlena.h"
41 
42 
/*
 * GUC variable: selects the text format produced by byteaout().  byteaout()
 * understands BYTEA_OUTPUT_HEX and BYTEA_OUTPUT_ESCAPE and raises an error
 * for any other value; hex is the initial setting.
 */
int			bytea_output = BYTEA_OUTPUT_HEX;

/* Both names below are plain aliases for struct varlena. */
typedef struct varlena unknown;
typedef struct varlena VarString;
48 
/*
 * State for text_position_* functions.
 *
 * A pointer to this struct is the "state" argument of text_position_setup(),
 * text_position_next(), text_position_next_internal(),
 * text_position_get_match_ptr(), text_position_get_match_pos() and
 * text_position_cleanup().
 */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */
	bool		is_multibyte_char_in_char;	/* need to check char boundaries? */

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
77 
78 typedef struct
79 {
80 	char	   *buf1;			/* 1st string, or abbreviation original string
81 								 * buf */
wc_HmacUpdate(Hmac * hmac,const byte * in,word32 sz)82 	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
83 	int			buflen1;
84 	int			buflen2;
85 	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
86 	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
87 	int			last_returned;	/* Last comparison result (cache) */
88 	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
89 	bool		collate_c;
wc_HmacFinal(Hmac * hmac,byte * out)90 	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
91 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
92 	hyperLogLogState full_card; /* Full key cardinality state */
93 	double		prop_card;		/* Required cardinality proportion */
94 	pg_locale_t locale;
95 } VarStringSortSupport;
96 
/*
 * Output data for split_text(): we output either to an array or a table.
 * tupstore and tupdesc must be set up in advance to output to a table.
 */
typedef struct
{
	ArrayBuildState *astate;	/* accumulator used for array output */
	Tuplestorestate *tupstore;	/* destination used for table output */
	TupleDesc	tupdesc;		/* row descriptor used for table output */
} SplitTextOutputData;
107 
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Datum accessors for the file-local "unknown" alias of struct varlena.
 * The plain forms go through PG_DETOAST_DATUM(), the "Copy" forms through
 * PG_DETOAST_DATUM_COPY(); the PG_GETARG_ forms apply them to fmgr
 * argument number n.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Datum accessors for VarString: the "P" form uses PG_DETOAST_DATUM(), the
 * "PP" form uses PG_DETOAST_DATUM_PACKED().
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
122 
123 static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
wc_HKDF(int type,const byte * inKey,word32 inKeySz,const byte * salt,word32 saltSz,const byte * info,word32 infoSz,byte * out,word32 outSz)124 static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
125 static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
126 static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
127 static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
128 static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
129 static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
130 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
131 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
132 static int32 text_length(Datum str);
133 static text *text_catenate(text *t1, text *t2);
134 static text *text_substring(Datum str,
135 							int32 start,
136 							int32 length,
wc_HmacSizeByType(int type)137 							bool length_not_specified);
138 static text *text_overlay(text *t1, text *t2, int sp, int sl);
139 static int	text_position(text *t1, text *t2, Oid collid);
140 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
141 static bool text_position_next(TextPositionState *state);
142 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
143 static char *text_position_get_match_ptr(TextPositionState *state);
144 static int	text_position_get_match_pos(TextPositionState *state);
145 static void text_position_cleanup(TextPositionState *state);
146 static void check_collation_set(Oid collid);
147 static int	text_cmp(text *arg1, text *arg2, Oid collid);
148 static bytea *bytea_catenate(bytea *t1, bytea *t2);
149 static bytea *bytea_substring(Datum str,
150 							  int S,
151 							  int L,
152 							  bool length_not_specified);
153 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
154 static void appendStringInfoText(StringInfo str, const text *t);
155 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
156 static void split_text_accum_result(SplitTextOutputData *tstate,
157 									text *field_value,
158 									text *null_string,
159 									Oid collation);
160 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
161 									const char *fldsep, const char *null_string);
162 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
163 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
164 									 int *value);
165 static const char *text_format_parse_format(const char *start_ptr,
166 											const char *end_ptr,
167 											int *argpos, int *widthpos,
168 											int *flags, int *width);
169 static void text_format_string_conversion(StringInfo buf, char conversion,
170 										  FmgrInfo *typOutputInfo,
171 										  Datum value, bool isNull,
172 										  int flags, int width);
173 static void text_format_append_string(StringInfo buf, const char *str,
174 									  int flags, int width);
175 
176 
177 /*****************************************************************************
178  *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
179  *****************************************************************************/
180 
181 /*
182  * cstring_to_text
183  *
184  * Create a text value from a null-terminated C string.
185  *
186  * The new text value is freshly palloc'd with a full-size VARHDR.
187  */
188 text *
189 cstring_to_text(const char *s)
190 {
191 	return cstring_to_text_with_len(s, strlen(s));
192 }
193 
194 /*
195  * cstring_to_text_with_len
196  *
197  * Same as cstring_to_text except the caller specifies the string length;
198  * the string need not be null_terminated.
199  */
200 text *
201 cstring_to_text_with_len(const char *s, int len)
202 {
203 	text	   *result = (text *) palloc(len + VARHDRSZ);
204 
205 	SET_VARSIZE(result, len + VARHDRSZ);
206 	memcpy(VARDATA(result), s, len);
207 
208 	return result;
209 }
210 
211 /*
_InitHmac(Hmac * hmac,int type,void * heap)212  * text_to_cstring
213  *
214  * Create a palloc'd, null-terminated C string from a text value.
215  *
216  * We support being passed a compressed or toasted text value.
217  * This is a bit bogus since such values shouldn't really be referred to as
218  * "text *", but it seems useful for robustness.  If we didn't handle that
219  * case here, we'd need another routine that did, anyway.
220  */
221 char *
222 text_to_cstring(const text *t)
223 {
224 	/* must cast away the const, unfortunately */
225 	text	   *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
226 	int			len = VARSIZE_ANY_EXHDR(tunpacked);
227 	char	   *result;
228 
229 	result = (char *) palloc(len + 1);
230 	memcpy(result, VARDATA_ANY(tunpacked), len);
231 	result[len] = '\0';
232 
233 	if (tunpacked != t)
234 		pfree(tunpacked);
235 
236 	return result;
237 }
238 
239 /*
240  * text_to_cstring_buffer
241  *
242  * Copy a text value into a caller-supplied buffer of size dst_len.
243  *
244  * The text string is truncated if necessary to fit.  The result is
245  * guaranteed null-terminated (unless dst_len == 0).
246  *
247  * We support being passed a compressed or toasted text value.
248  * This is a bit bogus since such values shouldn't really be referred to as
249  * "text *", but it seems useful for robustness.  If we didn't handle that
250  * case here, we'd need another routine that did, anyway.
251  */
252 void
253 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
254 {
255 	/* must cast away the const, unfortunately */
256 	text	   *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
257 	size_t		src_len = VARSIZE_ANY_EXHDR(srcunpacked);
258 
259 	if (dst_len > 0)
260 	{
261 		dst_len--;
262 		if (dst_len >= src_len)
263 			dst_len = src_len;
264 		else					/* ensure truncation is encoding-safe */
265 			dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
266 		memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
267 		dst[dst_len] = '\0';
268 	}
269 
270 	if (srcunpacked != src)
271 		pfree(srcunpacked);
272 }
273 
274 
275 /*****************************************************************************
276  *	 USER I/O ROUTINES														 *
277  *****************************************************************************/
278 
279 
/* VAL: numeric value of digit character CH (used for octal digits in byteain) */
#define VAL(CH)			((CH) - '0')
/* DIG: digit character for a small numeric value (used for octal digits in byteaout) */
#define DIG(VAL)		((VAL) + '0')
282 
283 /*
284  *		byteain			- converts from printable representation of byte array
285  *
286  *		Non-printable characters must be passed as '\nnn' (octal) and are
287  *		converted to internal form.  '\' must be passed as '\\'.
288  *		ereport(ERROR, ...) if bad form.
289  *
290  *		BUGS:
291  *				The input is scanned twice.
292  *				The error checking of input is minimal.
293  */
294 Datum
wc_HmacSetKey(Hmac * hmac,int type,const byte * key,word32 length)295 byteain(PG_FUNCTION_ARGS)
296 {
297 	char	   *inputText = PG_GETARG_CSTRING(0);
298 	char	   *tp;
299 	char	   *rp;
300 	int			bc;
301 	bytea	   *result;
302 
303 	/* Recognize hex input */
304 	if (inputText[0] == '\\' && inputText[1] == 'x')
305 	{
306 		size_t		len = strlen(inputText);
307 
308 		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
309 		result = palloc(bc);
310 		bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
311 		SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
312 
313 		PG_RETURN_BYTEA_P(result);
314 	}
315 
316 	/* Else, it's the traditional escaped style */
317 	for (bc = 0, tp = inputText; *tp != '\0'; bc++)
318 	{
319 		if (tp[0] != '\\')
320 			tp++;
321 		else if ((tp[0] == '\\') &&
322 				 (tp[1] >= '0' && tp[1] <= '3') &&
323 				 (tp[2] >= '0' && tp[2] <= '7') &&
324 				 (tp[3] >= '0' && tp[3] <= '7'))
325 			tp += 4;
326 		else if ((tp[0] == '\\') &&
327 				 (tp[1] == '\\'))
328 			tp += 2;
329 		else
330 		{
331 			/*
332 			 * one backslash, not followed by another or ### valid octal
333 			 */
334 			ereport(ERROR,
335 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
336 					 errmsg("invalid input syntax for type %s", "bytea")));
337 		}
338 	}
339 
340 	bc += VARHDRSZ;
341 
342 	result = (bytea *) palloc(bc);
343 	SET_VARSIZE(result, bc);
344 
345 	tp = inputText;
346 	rp = VARDATA(result);
347 	while (*tp != '\0')
348 	{
349 		if (tp[0] != '\\')
350 			*rp++ = *tp++;
351 		else if ((tp[0] == '\\') &&
352 				 (tp[1] >= '0' && tp[1] <= '3') &&
353 				 (tp[2] >= '0' && tp[2] <= '7') &&
354 				 (tp[3] >= '0' && tp[3] <= '7'))
355 		{
356 			bc = VAL(tp[1]);
357 			bc <<= 3;
358 			bc += VAL(tp[2]);
359 			bc <<= 3;
360 			*rp++ = bc + VAL(tp[3]);
361 
362 			tp += 4;
363 		}
364 		else if ((tp[0] == '\\') &&
365 				 (tp[1] == '\\'))
366 		{
367 			*rp++ = '\\';
368 			tp += 2;
369 		}
370 		else
371 		{
372 			/*
373 			 * We should never get here. The first pass should not allow it.
374 			 */
375 			ereport(ERROR,
376 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
377 					 errmsg("invalid input syntax for type %s", "bytea")));
378 		}
379 	}
380 
381 	PG_RETURN_BYTEA_P(result);
382 }
383 
384 /*
385  *		byteaout		- converts to printable representation of byte array
386  *
387  *		In the traditional escaped format, non-printable characters are
388  *		printed as '\nnn' (octal) and '\' as '\\'.
389  */
390 Datum
391 byteaout(PG_FUNCTION_ARGS)
392 {
393 	bytea	   *vlena = PG_GETARG_BYTEA_PP(0);
394 	char	   *result;
395 	char	   *rp;
396 
397 	if (bytea_output == BYTEA_OUTPUT_HEX)
398 	{
399 		/* Print hex format */
400 		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
401 		*rp++ = '\\';
402 		*rp++ = 'x';
403 		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
404 	}
405 	else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
406 	{
407 		/* Print traditional escaped format */
408 		char	   *vp;
409 		uint64		len;
410 		int			i;
411 
412 		len = 1;				/* empty string has 1 char */
413 		vp = VARDATA_ANY(vlena);
414 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
415 		{
416 			if (*vp == '\\')
417 				len += 2;
418 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
419 				len += 4;
420 			else
421 				len++;
422 		}
423 
424 		/*
425 		 * In principle len can't overflow uint32 if the input fit in 1GB, but
426 		 * for safety let's check rather than relying on palloc's internal
427 		 * check.
428 		 */
429 		if (len > MaxAllocSize)
430 			ereport(ERROR,
431 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
432 					 errmsg_internal("result of bytea output conversion is too large")));
433 		rp = result = (char *) palloc(len);
434 
435 		vp = VARDATA_ANY(vlena);
436 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
437 		{
438 			if (*vp == '\\')
439 			{
440 				*rp++ = '\\';
441 				*rp++ = '\\';
442 			}
443 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
444 			{
445 				int			val;	/* holds unprintable chars */
446 
447 				val = *vp;
448 				rp[0] = '\\';
449 				rp[3] = DIG(val & 07);
450 				val >>= 3;
451 				rp[2] = DIG(val & 07);
452 				val >>= 3;
453 				rp[1] = DIG(val & 03);
454 				rp += 4;
455 			}
456 			else
457 				*rp++ = *vp;
458 		}
459 	}
460 	else
461 	{
462 		elog(ERROR, "unrecognized bytea_output setting: %d",
463 			 bytea_output);
464 		rp = result = NULL;		/* keep compiler quiet */
465 	}
466 	*rp = '\0';
467 	PG_RETURN_CSTRING(result);
468 }
469 
470 /*
471  *		bytearecv			- converts external binary format to bytea
472  */
473 Datum
474 bytearecv(PG_FUNCTION_ARGS)
475 {
476 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
477 	bytea	   *result;
478 	int			nbytes;
479 
480 	nbytes = buf->len - buf->cursor;
481 	result = (bytea *) palloc(nbytes + VARHDRSZ);
482 	SET_VARSIZE(result, nbytes + VARHDRSZ);
483 	pq_copymsgbytes(buf, VARDATA(result), nbytes);
484 	PG_RETURN_BYTEA_P(result);
485 }
486 
487 /*
488  *		byteasend			- converts bytea to binary format
489  *
490  * This is a special case: just copy the input...
491  */
492 Datum
493 byteasend(PG_FUNCTION_ARGS)
494 {
495 	bytea	   *vlena = PG_GETARG_BYTEA_P_COPY(0);
496 
497 	PG_RETURN_BYTEA_P(vlena);
498 }
499 
500 Datum
501 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
502 {
503 	StringInfo	state;
504 
505 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
506 
507 	/* Append the value unless null. */
508 	if (!PG_ARGISNULL(1))
509 	{
510 		bytea	   *value = PG_GETARG_BYTEA_PP(1);
511 
512 		/* On the first time through, we ignore the delimiter. */
513 		if (state == NULL)
514 			state = makeStringAggState(fcinfo);
515 		else if (!PG_ARGISNULL(2))
516 		{
517 			bytea	   *delim = PG_GETARG_BYTEA_PP(2);
518 
519 			appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
520 		}
521 
522 		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
523 	}
524 
525 	/*
526 	 * The transition type for string_agg() is declared to be "internal",
527 	 * which is a pass-by-value type the same size as a pointer.
528 	 */
529 	PG_RETURN_POINTER(state);
530 }
531 
532 Datum
533 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
534 {
535 	StringInfo	state;
536 
537 	/* cannot be called directly because of internal-type argument */
538 	Assert(AggCheckCallContext(fcinfo, NULL));
539 
540 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
541 
542 	if (state != NULL)
543 	{
544 		bytea	   *result;
545 
546 		result = (bytea *) palloc(state->len + VARHDRSZ);
547 		SET_VARSIZE(result, state->len + VARHDRSZ);
548 		memcpy(VARDATA(result), state->data, state->len);
549 		PG_RETURN_BYTEA_P(result);
550 	}
551 	else
552 		PG_RETURN_NULL();
553 }
554 
555 /*
556  *		textin			- converts "..." to internal representation
557  */
558 Datum
559 textin(PG_FUNCTION_ARGS)
560 {
561 	char	   *inputText = PG_GETARG_CSTRING(0);
562 
563 	PG_RETURN_TEXT_P(cstring_to_text(inputText));
564 }
565 
566 /*
567  *		textout			- converts internal representation to "..."
568  */
569 Datum
570 textout(PG_FUNCTION_ARGS)
571 {
572 	Datum		txt = PG_GETARG_DATUM(0);
573 
574 	PG_RETURN_CSTRING(TextDatumGetCString(txt));
575 }
576 
577 /*
578  *		textrecv			- converts external binary format to text
579  */
580 Datum
581 textrecv(PG_FUNCTION_ARGS)
582 {
583 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
584 	text	   *result;
585 	char	   *str;
586 	int			nbytes;
HmacKeyInnerHash(Hmac * hmac)587 
588 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
589 
590 	result = cstring_to_text_with_len(str, nbytes);
591 	pfree(str);
592 	PG_RETURN_TEXT_P(result);
593 }
594 
595 /*
596  *		textsend			- converts text to binary format
597  */
598 Datum
599 textsend(PG_FUNCTION_ARGS)
600 {
601 	text	   *t = PG_GETARG_TEXT_PP(0);
602 	StringInfoData buf;
603 
604 	pq_begintypsend(&buf);
605 	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
606 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
607 }
608 
609 
610 /*
611  *		unknownin			- converts "..." to internal representation
612  */
613 Datum
614 unknownin(PG_FUNCTION_ARGS)
615 {
616 	char	   *str = PG_GETARG_CSTRING(0);
617 
618 	/* representation is same as cstring */
619 	PG_RETURN_CSTRING(pstrdup(str));
620 }
621 
622 /*
623  *		unknownout			- converts internal representation to "..."
624  */
625 Datum
626 unknownout(PG_FUNCTION_ARGS)
627 {
628 	/* representation is same as cstring */
629 	char	   *str = PG_GETARG_CSTRING(0);
630 
631 	PG_RETURN_CSTRING(pstrdup(str));
632 }
633 
634 /*
635  *		unknownrecv			- converts external binary format to unknown
636  */
637 Datum
638 unknownrecv(PG_FUNCTION_ARGS)
639 {
640 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
641 	char	   *str;
642 	int			nbytes;
643 
644 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
645 	/* representation is same as cstring */
646 	PG_RETURN_CSTRING(str);
647 }
648 
649 /*
650  *		unknownsend			- converts unknown to binary format
651  */
652 Datum
653 unknownsend(PG_FUNCTION_ARGS)
654 {
655 	/* representation is same as cstring */
656 	char	   *str = PG_GETARG_CSTRING(0);
657 	StringInfoData buf;
658 
659 	pq_begintypsend(&buf);
660 	pq_sendtext(&buf, str, strlen(str));
661 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
662 }
663 
664 
665 /* ========== PUBLIC ROUTINES ========== */
666 
667 /*
668  * textlen -
669  *	  returns the logical length of a text*
wc_HmacUpdate(Hmac * hmac,const byte * msg,word32 length)670  *	   (which is less than the VARSIZE of the text*)
671  */
672 Datum
673 textlen(PG_FUNCTION_ARGS)
674 {
675 	Datum		str = PG_GETARG_DATUM(0);
676 
677 	/* try to avoid decompressing argument */
678 	PG_RETURN_INT32(text_length(str));
679 }
680 
681 /*
682  * text_length -
683  *	Does the real work for textlen()
684  *
685  *	This is broken out so it can be called directly by other string processing
686  *	functions.  Note that the argument is passed as a Datum, to indicate that
687  *	it may still be in compressed form.  We can avoid decompressing it at all
688  *	in some cases.
689  */
690 static int32
691 text_length(Datum str)
692 {
693 	/* fastpath when max encoding length is one */
694 	if (pg_database_encoding_max_length() == 1)
695 		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
696 	else
697 	{
698 		text	   *t = DatumGetTextPP(str);
699 
700 		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
701 											 VARSIZE_ANY_EXHDR(t)));
702 	}
703 }
704 
705 /*
706  * textoctetlen -
707  *	  returns the physical length of a text*
708  *	   (which is less than the VARSIZE of the text*)
709  */
710 Datum
711 textoctetlen(PG_FUNCTION_ARGS)
712 {
713 	Datum		str = PG_GETARG_DATUM(0);
714 
715 	/* We need not detoast the input at all */
716 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
717 }
718 
719 /*
720  * textcat -
721  *	  takes two text* and returns a text* that is the concatenation of
722  *	  the two.
723  *
724  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
725  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
726  * Allocate space for output in all cases.
727  * XXX - thomas 1997-07-10
728  */
729 Datum
730 textcat(PG_FUNCTION_ARGS)
731 {
732 	text	   *t1 = PG_GETARG_TEXT_PP(0);
733 	text	   *t2 = PG_GETARG_TEXT_PP(1);
734 
735 	PG_RETURN_TEXT_P(text_catenate(t1, t2));
736 }
737 
738 /*
739  * text_catenate
740  *	Guts of textcat(), broken out so it can be used by other functions
741  *
742  * Arguments can be in short-header form, but not compressed or out-of-line
743  */
744 static text *
745 text_catenate(text *t1, text *t2)
746 {
747 	text	   *result;
748 	int			len1,
749 				len2,
750 				len;
751 	char	   *ptr;
752 
753 	len1 = VARSIZE_ANY_EXHDR(t1);
754 	len2 = VARSIZE_ANY_EXHDR(t2);
755 
756 	/* paranoia ... probably should throw error instead? */
757 	if (len1 < 0)
758 		len1 = 0;
759 	if (len2 < 0)
760 		len2 = 0;
761 
762 	len = len1 + len2 + VARHDRSZ;
763 	result = (text *) palloc(len);
764 
765 	/* Set size of result string... */
766 	SET_VARSIZE(result, len);
767 
768 	/* Fill data field of result string... */
769 	ptr = VARDATA(result);
770 	if (len1 > 0)
771 		memcpy(ptr, VARDATA_ANY(t1), len1);
772 	if (len2 > 0)
wc_HmacFinal(Hmac * hmac,byte * hash)773 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
774 
775 	return result;
776 }
777 
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must guarantee that n characters really are present; the
 * string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Optimization for single-byte encodings: one byte per character */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
803 
804 /*
805  * text_substr()
806  * Return a substring starting at the specified position.
807  * - thomas 1997-12-31
808  *
809  * Input:
810  *	- string
811  *	- starting position (is one-based)
812  *	- string length
813  *
814  * If the starting position is zero or less, then return from the start of the string
815  *	adjusting the length to be consistent with the "negative start" per SQL.
816  * If the length is less than zero, return the remaining string.
817  *
818  * Added multibyte support.
819  * - Tatsuo Ishii 1998-4-21
820  * Changed behavior if starting position is less than one to conform to SQL behavior.
821  * Formerly returned the entire string; now returns a portion.
822  * - Thomas Lockhart 1998-12-10
823  * Now uses faster TOAST-slicing interface
824  * - John Gray 2002-02-22
825  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
826  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
827  * error; if E < 1, return '', not entire string). Fixed MB related bug when
828  * S > LC and < LC + 4 sometimes garbage characters are returned.
829  * - Joe Conway 2002-08-10
830  */
831 Datum
832 text_substr(PG_FUNCTION_ARGS)
833 {
834 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
835 									PG_GETARG_INT32(1),
836 									PG_GETARG_INT32(2),
837 									false));
838 }
839 
840 /*
841  * text_substr_no_len -
842  *	  Wrapper to avoid opr_sanity failure due to
843  *	  one function accepting a different number of args.
844  */
845 Datum
846 text_substr_no_len(PG_FUNCTION_ARGS)
847 {
848 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
849 									PG_GETARG_INT32(1),
850 									-1, true));
851 }
852 
853 /*
854  * text_substring -
855  *	Does the real work for text_substr() and text_substr_no_len()
856  *
857  *	This is broken out so it can be called directly by other string processing
858  *	functions.  Note that the argument is passed as a Datum, to indicate that
859  *	it may still be in compressed/toasted form.  We can avoid detoasting all
860  *	of it in some cases.
861  *
862  *	The result is always a freshly palloc'd datum.
863  */
864 static text *
865 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
866 {
867 	int32		eml = pg_database_encoding_max_length();
868 	int32		S = start;		/* start position */
869 	int32		S1;				/* adjusted start position */
870 	int32		L1;				/* adjusted substring length */
871 	int32		E;				/* end position */
872 
873 	/*
874 	 * SQL99 says S can be zero or negative, but we still must fetch from the
875 	 * start of the string.
876 	 */
877 	S1 = Max(S, 1);
878 
879 	/* life is easy if the encoding max length is 1 */
880 	if (eml == 1)
881 	{
882 		if (length_not_specified)	/* special case - get length to end of
883 									 * string */
884 			L1 = -1;
885 		else if (length < 0)
886 		{
887 			/* SQL99 says to throw an error for E < S, i.e., negative length */
888 			ereport(ERROR,
889 					(errcode(ERRCODE_SUBSTRING_ERROR),
890 					 errmsg("negative substring length not allowed")));
891 			L1 = -1;			/* silence stupider compilers */
892 		}
893 		else if (pg_add_s32_overflow(S, length, &E))
894 		{
895 			/*
896 			 * L could be large enough for S + L to overflow, in which case
897 			 * the substring must run to end of string.
898 			 */
899 			L1 = -1;
900 		}
901 		else
902 		{
903 			/*
904 			 * A zero or negative value for the end position can happen if the
905 			 * start was negative or one. SQL99 says to return a zero-length
906 			 * string.
907 			 */
908 			if (E < 1)
909 				return cstring_to_text("");
910 
911 			L1 = E - S1;
912 		}
913 
914 		/*
915 		 * If the start position is past the end of the string, SQL99 says to
916 		 * return a zero-length string -- DatumGetTextPSlice() will do that
917 		 * for us.  We need only convert S1 to zero-based starting position.
918 		 */
919 		return DatumGetTextPSlice(str, S1 - 1, L1);
920 	}
921 	else if (eml > 1)
922 	{
923 		/*
924 		 * When encoding max length is > 1, we can't get LC without
925 		 * detoasting, so we'll grab a conservatively large slice now and go
926 		 * back later to do the right thing
927 		 */
928 		int32		slice_start;
929 		int32		slice_size;
930 		int32		slice_strlen;
931 		text	   *slice;
932 		int32		E1;
933 		int32		i;
934 		char	   *p;
935 		char	   *s;
936 		text	   *ret;
937 
938 		/*
939 		 * We need to start at position zero because there is no way to know
940 		 * in advance which byte offset corresponds to the supplied start
941 		 * position.
942 		 */
943 		slice_start = 0;
944 
945 		if (length_not_specified)	/* special case - get length to end of
946 									 * string */
947 			slice_size = L1 = -1;
948 		else if (length < 0)
949 		{
950 			/* SQL99 says to throw an error for E < S, i.e., negative length */
951 			ereport(ERROR,
952 					(errcode(ERRCODE_SUBSTRING_ERROR),
953 					 errmsg("negative substring length not allowed")));
954 			slice_size = L1 = -1;	/* silence stupider compilers */
955 		}
956 		else if (pg_add_s32_overflow(S, length, &E))
957 		{
958 			/*
959 			 * L could be large enough for S + L to overflow, in which case
960 			 * the substring must run to end of string.
961 			 */
962 			slice_size = L1 = -1;
963 		}
964 		else
965 		{
966 			/*
967 			 * A zero or negative value for the end position can happen if the
968 			 * start was negative or one. SQL99 says to return a zero-length
969 			 * string.
970 			 */
971 			if (E < 1)
972 				return cstring_to_text("");
973 
974 			/*
975 			 * if E is past the end of the string, the tuple toaster will
976 			 * truncate the length for us
977 			 */
978 			L1 = E - S1;
979 
980 			/*
981 			 * Total slice size in bytes can't be any longer than the start
982 			 * position plus substring length times the encoding max length.
983 			 * If that overflows, we can just use -1.
984 			 */
985 			if (pg_mul_s32_overflow(E, eml, &slice_size))
986 				slice_size = -1;
987 		}
988 
989 		/*
990 		 * If we're working with an untoasted source, no need to do an extra
991 		 * copying step.
992 		 */
993 		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
994 			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
995 			slice = DatumGetTextPSlice(str, slice_start, slice_size);
996 		else
997 			slice = (text *) DatumGetPointer(str);
998 
wc_HmacInit(Hmac * hmac,void * heap,int devId)999 		/* see if we got back an empty string */
1000 		if (VARSIZE_ANY_EXHDR(slice) == 0)
1001 		{
1002 			if (slice != (text *) DatumGetPointer(str))
1003 				pfree(slice);
1004 			return cstring_to_text("");
1005 		}
1006 
1007 		/* Now we can get the actual length of the slice in MB characters */
1008 		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
1009 											VARSIZE_ANY_EXHDR(slice));
1010 
1011 		/*
1012 		 * Check that the start position wasn't > slice_strlen. If so, SQL99
1013 		 * says to return a zero-length string.
1014 		 */
1015 		if (S1 > slice_strlen)
1016 		{
1017 			if (slice != (text *) DatumGetPointer(str))
1018 				pfree(slice);
1019 			return cstring_to_text("");
1020 		}
1021 
1022 		/*
1023 		 * Adjust L1 and E1 now that we know the slice string length. Again
1024 		 * remember that S1 is one based, and slice_start is zero based.
wc_HmacInit_Id(Hmac * hmac,unsigned char * id,int len,void * heap,int devId)1025 		 */
1026 		if (L1 > -1)
1027 			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
1028 		else
1029 			E1 = slice_start + 1 + slice_strlen;
1030 
1031 		/*
1032 		 * Find the start position in the slice; remember S1 is not zero based
1033 		 */
1034 		p = VARDATA_ANY(slice);
1035 		for (i = 0; i < S1 - 1; i++)
1036 			p += pg_mblen(p);
1037 
1038 		/* hang onto a pointer to our start position */
1039 		s = p;
1040 
1041 		/*
1042 		 * Count the actual bytes used by the substring of the requested
1043 		 * length.
1044 		 */
wc_HmacInit_Label(Hmac * hmac,const char * label,void * heap,int devId)1045 		for (i = S1; i < E1; i++)
1046 			p += pg_mblen(p);
1047 
1048 		ret = (text *) palloc(VARHDRSZ + (p - s));
1049 		SET_VARSIZE(ret, VARHDRSZ + (p - s));
1050 		memcpy(VARDATA(ret), s, (p - s));
1051 
1052 		if (slice != (text *) DatumGetPointer(str))
1053 			pfree(slice);
1054 
1055 		return ret;
1056 	}
1057 	else
1058 		elog(ERROR, "invalid backend encoding: encoding max length < 1");
1059 
1060 	/* not reached: suppress compiler warning */
1061 	return NULL;
1062 }
1063 
1064 /*
1065  * textoverlay
1066  *	Replace specified substring of first string with second
1067  *
1068  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1069  * This code is a direct implementation of what the standard says.
wc_HmacFree(Hmac * hmac)1070  */
1071 Datum
1072 textoverlay(PG_FUNCTION_ARGS)
1073 {
1074 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1075 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1076 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1077 	int			sl = PG_GETARG_INT32(3);	/* substring length */
1078 
1079 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1080 }
1081 
1082 Datum
1083 textoverlay_no_len(PG_FUNCTION_ARGS)
1084 {
1085 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1086 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1087 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1088 	int			sl;
1089 
1090 	sl = text_length(PointerGetDatum(t2));	/* defaults to length(t2) */
1091 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1092 }
1093 
1094 static text *
1095 text_overlay(text *t1, text *t2, int sp, int sl)
1096 {
1097 	text	   *result;
1098 	text	   *s1;
1099 	text	   *s2;
1100 	int			sp_pl_sl;
1101 
1102 	/*
1103 	 * Check for possible integer-overflow cases.  For negative sp, throw a
1104 	 * "substring length" error because that's what should be expected
1105 	 * according to the spec's definition of OVERLAY().
1106 	 */
1107 	if (sp <= 0)
1108 		ereport(ERROR,
1109 				(errcode(ERRCODE_SUBSTRING_ERROR),
1110 				 errmsg("negative substring length not allowed")));
1111 	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1112 		ereport(ERROR,
1113 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1114 				 errmsg("integer out of range")));
1115 
1116 	s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1117 	s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1118 	result = text_catenate(s1, t2);
1119 	result = text_catenate(result, s2);
1120 
1121 	return result;
1122 }
1123 
1124 /*
1125  * textpos -
1126  *	  Return the position of the specified substring.
1127  *	  Implements the SQL POSITION() function.
1128  *	  Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1129  * - thomas 1997-07-27
1130  */
1131 Datum
1132 textpos(PG_FUNCTION_ARGS)
1133 {
1134 	text	   *str = PG_GETARG_TEXT_PP(0);
1135 	text	   *search_str = PG_GETARG_TEXT_PP(1);
1136 
1137 	PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1138 }
1139 
1140 /*
1141  * text_position -
1142  *	Does the real work for textpos()
1143  *
1144  * Inputs:
1145  *		t1 - string to be searched
1146  *		t2 - pattern to match within t1
1147  * Result:
1148  *		Character index of the first matched char, starting from 1,
1149  *		or 0 if no match.
1150  *
1151  *	This is broken out so it can be called directly by other string processing
1152  *	functions.
1153  */
wolfSSL_GetHmacMaxSize(void)1154 static int
1155 text_position(text *t1, text *t2, Oid collid)
1156 {
1157 	TextPositionState state;
1158 	int			result;
1159 
1160 	/* Empty needle always matches at position 1 */
1161 	if (VARSIZE_ANY_EXHDR(t2) < 1)
1162 		return 1;
1163 
1164 	/* Otherwise, can't match if haystack is shorter than needle */
1165 	if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1166 		return 0;
1167 
1168 	text_position_setup(t1, t2, collid, &state);
1169 	if (!text_position_next(&state))
1170 		result = 0;
1171 	else
1172 		result = text_position_get_match_pos(&state);
1173 	text_position_cleanup(&state);
1174 	return result;
1175 }
1176 
1177 
1178 /*
1179  * text_position_setup, text_position_next, text_position_cleanup -
1180  *	Component steps of text_position()
1181  *
1182  * These are broken out so that a string can be efficiently searched for
1183  * multiple occurrences of the same pattern.  text_position_next may be
1184  * called multiple times, and it advances to the next match on each call.
1185  * text_position_get_match_ptr() and text_position_get_match_pos() return
1186  * a pointer or 1-based character position of the last match, respectively.
1187  *
1188  * The "state" variable is normally just a local variable in the caller.
1189  *
1190  * NOTE: text_position_next skips over the matched portion.  For example,
1191  * searching for "xx" in "xxx" returns only one match, not two.
1192  */
1193 
1194 static void
1195 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1196 {
1197 	int			len1 = VARSIZE_ANY_EXHDR(t1);
1198 	int			len2 = VARSIZE_ANY_EXHDR(t2);
1199 	pg_locale_t mylocale = 0;
1200 
1201 	check_collation_set(collid);
1202 
1203 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1204 		mylocale = pg_newlocale_from_collation(collid);
1205 
1206 	if (mylocale && !mylocale->deterministic)
1207 		ereport(ERROR,
1208 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1209 				 errmsg("nondeterministic collations are not supported for substring searches")));
1210 
1211 	Assert(len1 > 0);
1212 	Assert(len2 > 0);
1213 
1214 	/*
1215 	 * Even with a multi-byte encoding, we perform the search using the raw
wc_HKDF_Expand(int type,const byte * inKey,word32 inKeySz,const byte * info,word32 infoSz,byte * out,word32 outSz)1216 	 * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1217 	 * because in UTF-8 the byte sequence of one character cannot contain
1218 	 * another character.  For other multi-byte encodings, we do the search
1219 	 * initially as a simple byte search, ignoring multibyte issues, but
1220 	 * verify afterwards that the match we found is at a character boundary,
1221 	 * and continue the search if it was a false match.
1222 	 */
1223 	if (pg_database_encoding_max_length() == 1)
1224 	{
1225 		state->is_multibyte = false;
1226 		state->is_multibyte_char_in_char = false;
1227 	}
1228 	else if (GetDatabaseEncoding() == PG_UTF8)
1229 	{
1230 		state->is_multibyte = true;
1231 		state->is_multibyte_char_in_char = false;
1232 	}
1233 	else
1234 	{
1235 		state->is_multibyte = true;
1236 		state->is_multibyte_char_in_char = true;
1237 	}
1238 
1239 	state->str1 = VARDATA_ANY(t1);
1240 	state->str2 = VARDATA_ANY(t2);
1241 	state->len1 = len1;
1242 	state->len2 = len2;
1243 	state->last_match = NULL;
1244 	state->refpoint = state->str1;
1245 	state->refpos = 0;
1246 
1247 	/*
1248 	 * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1249 	 * notes we use the terminology that the "haystack" is the string to be
1250 	 * searched (t1) and the "needle" is the pattern being sought (t2).
1251 	 *
1252 	 * If the needle is empty or bigger than the haystack then there is no
1253 	 * point in wasting cycles initializing the table.  We also choose not to
1254 	 * use B-M-H for needles of length 1, since the skip table can't possibly
1255 	 * save anything in that case.
1256 	 */
1257 	if (len1 >= len2 && len2 > 1)
1258 	{
1259 		int			searchlength = len1 - len2;
1260 		int			skiptablemask;
1261 		int			last;
1262 		int			i;
1263 		const char *str2 = state->str2;
1264 
1265 		/*
1266 		 * First we must determine how much of the skip table to use.  The
1267 		 * declaration of TextPositionState allows up to 256 elements, but for
1268 		 * short search problems we don't really want to have to initialize so
1269 		 * many elements --- it would take too long in comparison to the
1270 		 * actual search time.  So we choose a useful skip table size based on
1271 		 * the haystack length minus the needle length.  The closer the needle
1272 		 * length is to the haystack length the less useful skipping becomes.
1273 		 *
1274 		 * Note: since we use bit-masking to select table elements, the skip
1275 		 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1276 		 */
1277 		if (searchlength < 16)
1278 			skiptablemask = 3;
1279 		else if (searchlength < 64)
1280 			skiptablemask = 7;
1281 		else if (searchlength < 128)
wc_HKDF(int type,const byte * inKey,word32 inKeySz,const byte * salt,word32 saltSz,const byte * info,word32 infoSz,byte * out,word32 outSz)1282 			skiptablemask = 15;
1283 		else if (searchlength < 512)
1284 			skiptablemask = 31;
1285 		else if (searchlength < 2048)
1286 			skiptablemask = 63;
1287 		else if (searchlength < 4096)
1288 			skiptablemask = 127;
1289 		else
1290 			skiptablemask = 255;
1291 		state->skiptablemask = skiptablemask;
1292 
1293 		/*
1294 		 * Initialize the skip table.  We set all elements to the needle
1295 		 * length, since this is the correct skip distance for any character
1296 		 * not found in the needle.
1297 		 */
1298 		for (i = 0; i <= skiptablemask; i++)
1299 			state->skiptable[i] = len2;
1300 
1301 		/*
1302 		 * Now examine the needle.  For each character except the last one,
1303 		 * set the corresponding table element to the appropriate skip
1304 		 * distance.  Note that when two characters share the same skip table
1305 		 * entry, the one later in the needle must determine the skip
1306 		 * distance.
1307 		 */
1308 		last = len2 - 1;
1309 
1310 		for (i = 0; i < last; i++)
1311 			state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1312 	}
1313 }
1314 
1315 /*
1316  * Advance to the next match, starting from the end of the previous match
1317  * (or the beginning of the string, on first call).  Returns true if a match
1318  * is found.
1319  *
1320  * Note that this refuses to match an empty-string needle.  Most callers
1321  * will have handled that case specially and we'll never see it here.
1322  */
1323 static bool
1324 text_position_next(TextPositionState *state)
1325 {
1326 	int			needle_len = state->len2;
1327 	char	   *start_ptr;
1328 	char	   *matchptr;
1329 
1330 	if (needle_len <= 0)
1331 		return false;			/* result for empty pattern */
1332 
1333 	/* Start from the point right after the previous match. */
1334 	if (state->last_match)
1335 		start_ptr = state->last_match + needle_len;
1336 	else
1337 		start_ptr = state->str1;
1338 
1339 retry:
1340 	matchptr = text_position_next_internal(start_ptr, state);
1341 
1342 	if (!matchptr)
1343 		return false;
1344 
1345 	/*
1346 	 * Found a match for the byte sequence.  If this is a multibyte encoding,
1347 	 * where one character's byte sequence can appear inside a longer
1348 	 * multi-byte character, we need to verify that the match was at a
1349 	 * character boundary, not in the middle of a multi-byte character.
1350 	 */
1351 	if (state->is_multibyte_char_in_char)
1352 	{
1353 		/* Walk one character at a time, until we reach the match. */
1354 
1355 		/* the search should never move backwards. */
1356 		Assert(state->refpoint <= matchptr);
1357 
1358 		while (state->refpoint < matchptr)
1359 		{
1360 			/* step to next character. */
1361 			state->refpoint += pg_mblen(state->refpoint);
1362 			state->refpos++;
1363 
1364 			/*
1365 			 * If we stepped over the match's start position, then it was a
1366 			 * false positive, where the byte sequence appeared in the middle
1367 			 * of a multi-byte character.  Skip it, and continue the search at
1368 			 * the next character boundary.
1369 			 */
1370 			if (state->refpoint > matchptr)
1371 			{
1372 				start_ptr = state->refpoint;
1373 				goto retry;
1374 			}
1375 		}
1376 	}
1377 
1378 	state->last_match = matchptr;
1379 	return true;
1380 }
1381 
1382 /*
1383  * Subroutine of text_position_next().  This searches for the raw byte
1384  * sequence, ignoring any multi-byte encoding issues.  Returns the first
1385  * match starting at 'start_ptr', or NULL if no match is found.
1386  */
1387 static char *
1388 text_position_next_internal(char *start_ptr, TextPositionState *state)
1389 {
1390 	int			haystack_len = state->len1;
1391 	int			needle_len = state->len2;
1392 	int			skiptablemask = state->skiptablemask;
1393 	const char *haystack = state->str1;
1394 	const char *needle = state->str2;
1395 	const char *haystack_end = &haystack[haystack_len];
1396 	const char *hptr;
1397 
1398 	Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1399 
1400 	if (needle_len == 1)
1401 	{
1402 		/* No point in using B-M-H for a one-character needle */
1403 		char		nchar = *needle;
1404 
1405 		hptr = start_ptr;
1406 		while (hptr < haystack_end)
1407 		{
1408 			if (*hptr == nchar)
1409 				return (char *) hptr;
1410 			hptr++;
1411 		}
1412 	}
1413 	else
1414 	{
1415 		const char *needle_last = &needle[needle_len - 1];
1416 
1417 		/* Start at startpos plus the length of the needle */
1418 		hptr = start_ptr + needle_len - 1;
1419 		while (hptr < haystack_end)
1420 		{
1421 			/* Match the needle scanning *backward* */
1422 			const char *nptr;
1423 			const char *p;
1424 
1425 			nptr = needle_last;
1426 			p = hptr;
1427 			while (*nptr == *p)
1428 			{
1429 				/* Matched it all?	If so, return 1-based position */
1430 				if (nptr == needle)
1431 					return (char *) p;
1432 				nptr--, p--;
1433 			}
1434 
1435 			/*
1436 			 * No match, so use the haystack char at hptr to decide how far to
1437 			 * advance.  If the needle had any occurrence of that character
1438 			 * (or more precisely, one sharing the same skiptable entry)
1439 			 * before its last character, then we advance far enough to align
1440 			 * the last such needle character with that haystack position.
1441 			 * Otherwise we can advance by the whole needle length.
1442 			 */
1443 			hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1444 		}
1445 	}
1446 
1447 	return 0;					/* not found */
1448 }
1449 
1450 /*
1451  * Return a pointer to the current match.
1452  *
1453  * The returned pointer points into the original haystack string.
1454  */
1455 static char *
1456 text_position_get_match_ptr(TextPositionState *state)
1457 {
1458 	return state->last_match;
1459 }
1460 
1461 /*
1462  * Return the offset of the current match.
1463  *
1464  * The offset is in characters, 1-based.
1465  */
1466 static int
1467 text_position_get_match_pos(TextPositionState *state)
1468 {
1469 	if (!state->is_multibyte)
1470 		return state->last_match - state->str1 + 1;
1471 	else
1472 	{
1473 		/* Convert the byte position to char position. */
1474 		while (state->refpoint < state->last_match)
1475 		{
1476 			state->refpoint += pg_mblen(state->refpoint);
1477 			state->refpos++;
1478 		}
1479 		Assert(state->refpoint == state->last_match);
1480 		return state->refpos + 1;
1481 	}
1482 }
1483 
1484 /*
1485  * Reset search state to the initial state installed by text_position_setup.
1486  *
1487  * The next call to text_position_next will search from the beginning
1488  * of the string.
1489  */
1490 static void
1491 text_position_reset(TextPositionState *state)
1492 {
1493 	state->last_match = NULL;
1494 	state->refpoint = state->str1;
1495 	state->refpos = 0;
1496 }
1497 
1498 static void
1499 text_position_cleanup(TextPositionState *state)
1500 {
1501 	/* no cleanup needed */
1502 }
1503 
1504 
1505 static void
1506 check_collation_set(Oid collid)
1507 {
1508 	if (!OidIsValid(collid))
1509 	{
1510 		/*
1511 		 * This typically means that the parser could not resolve a conflict
1512 		 * of implicit collations, so report it that way.
1513 		 */
1514 		ereport(ERROR,
1515 				(errcode(ERRCODE_INDETERMINATE_COLLATION),
1516 				 errmsg("could not determine which collation to use for string comparison"),
1517 				 errhint("Use the COLLATE clause to set the collation explicitly.")));
1518 	}
1519 }
1520 
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 *	to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * Parameters:
 *	arg1, len1 - first string and its length in bytes; need not be
 *				 NUL-terminated
 *	arg2, len2 - second string and its length in bytes; need not be
 *				 NUL-terminated
 *	collid - collation to compare under; an invalid OID is rejected by
 *			 check_collation_set()
 *
 * Under a C collation the comparison is a plain memcmp() with the shorter
 * string sorting first on a tie.  Otherwise the locale's comparison routine
 * decides, and for deterministic collations a bytewise comparison breaks
 * any tie it reports.
 *
 * Note: many functions that depend on this are marked leakproof; therefore,
 * avoid reporting the actual contents of the input when throwing errors.
 * All errors herein should be things that can't happen except on corrupt
 * data, anyway; otherwise we will have trouble with indexing strings that
 * would cause them.
 */
int
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
{
	int			result;

	check_collation_set(collid);

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		/* Bytewise order; on a common prefix the shorter string is smaller. */
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;
		pg_locale_t mylocale = 0;

		/*
		 * mylocale stays 0 for the default collation; the code below then
		 * uses the plain strcoll()/wcscoll() calls instead of the
		 * locale-specific ones.
		 */
		if (collid != DEFAULT_COLLATION_OID)
			mylocale = pg_newlocale_from_collation(collid);

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;
			int			a2len;
			int			r;

			/*
			 * Size each conversion buffer at two bytes per input byte plus
			 * two more for the terminator, using the on-stack buffer when
			 * the input is shorter than TEXTBUFLEN / 2 and palloc otherwise.
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* stupid Microsloth API does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* r is the number of wide chars written; terminate after them */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* clear errno so the %m in the error report below is meaningful */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/*
			 * Break tie if necessary.  This is done only for the default
			 * locale or a deterministic collation, using the original bytes
			 * (not the UTF-16 copies); a nondeterministic collation keeps
			 * the reported equality.
			 */
			if (result == 0 &&
				(!mylocale || mylocale->deterministic))
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/*
		 * Make NUL-terminated copies of both inputs, on the stack when they
		 * fit in TEXTBUFLEN and in palloc'd memory otherwise.  The copies
		 * feed strcoll()/strcoll_l() and the strcmp() tie-break below.
		 */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				/* In a UTF-8 database ICU can compare the inputs directly */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					/* Otherwise convert both inputs to UChar strings first */
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/*
		 * Break tie if necessary.  This is done only for the default locale
		 * or a deterministic collation, falling back to strcmp() on the
		 * NUL-terminated copies; a nondeterministic collation keeps the
		 * reported equality.
		 */
		if (result == 0 &&
			(!mylocale || mylocale->deterministic))
			result = strcmp(a1p, a2p);

		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1745 
1746 /* text_cmp()
1747  * Internal comparison function for text strings.
1748  * Returns -1, 0 or 1
1749  */
1750 static int
1751 text_cmp(text *arg1, text *arg2, Oid collid)
1752 {
1753 	char	   *a1p,
1754 			   *a2p;
1755 	int			len1,
1756 				len2;
1757 
1758 	a1p = VARDATA_ANY(arg1);
1759 	a2p = VARDATA_ANY(arg2);
1760 
1761 	len1 = VARSIZE_ANY_EXHDR(arg1);
1762 	len2 = VARSIZE_ANY_EXHDR(arg2);
1763 
1764 	return varstr_cmp(a1p, len1, a2p, len2, collid);
1765 }
1766 
1767 /*
1768  * Comparison functions for text strings.
1769  *
1770  * Note: btree indexes need these routines not to leak memory; therefore,
1771  * be careful to free working copies of toasted datums.  Most places don't
1772  * need to be so careful.
1773  */
1774 
1775 Datum
1776 texteq(PG_FUNCTION_ARGS)
1777 {
1778 	Oid			collid = PG_GET_COLLATION();
1779 	bool		result;
1780 
1781 	check_collation_set(collid);
1782 
1783 	if (lc_collate_is_c(collid) ||
1784 		collid == DEFAULT_COLLATION_OID ||
1785 		pg_newlocale_from_collation(collid)->deterministic)
1786 	{
1787 		Datum		arg1 = PG_GETARG_DATUM(0);
1788 		Datum		arg2 = PG_GETARG_DATUM(1);
1789 		Size		len1,
1790 					len2;
1791 
1792 		/*
1793 		 * Since we only care about equality or not-equality, we can avoid all
1794 		 * the expense of strcoll() here, and just do bitwise comparison.  In
1795 		 * fact, we don't even have to do a bitwise comparison if we can show
1796 		 * the lengths of the strings are unequal; which might save us from
1797 		 * having to detoast one or both values.
1798 		 */
1799 		len1 = toast_raw_datum_size(arg1);
1800 		len2 = toast_raw_datum_size(arg2);
1801 		if (len1 != len2)
1802 			result = false;
1803 		else
1804 		{
1805 			text	   *targ1 = DatumGetTextPP(arg1);
1806 			text	   *targ2 = DatumGetTextPP(arg2);
1807 
1808 			result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1809 							 len1 - VARHDRSZ) == 0);
1810 
1811 			PG_FREE_IF_COPY(targ1, 0);
1812 			PG_FREE_IF_COPY(targ2, 1);
1813 		}
1814 	}
1815 	else
1816 	{
1817 		text	   *arg1 = PG_GETARG_TEXT_PP(0);
1818 		text	   *arg2 = PG_GETARG_TEXT_PP(1);
1819 
1820 		result = (text_cmp(arg1, arg2, collid) == 0);
1821 
1822 		PG_FREE_IF_COPY(arg1, 0);
1823 		PG_FREE_IF_COPY(arg2, 1);
1824 	}
1825 
1826 	PG_RETURN_BOOL(result);
1827 }
1828 
1829 Datum
1830 textne(PG_FUNCTION_ARGS)
1831 {
1832 	Oid			collid = PG_GET_COLLATION();
1833 	bool		result;
1834 
1835 	check_collation_set(collid);
1836 
1837 	if (lc_collate_is_c(collid) ||
1838 		collid == DEFAULT_COLLATION_OID ||
1839 		pg_newlocale_from_collation(collid)->deterministic)
1840 	{
1841 		Datum		arg1 = PG_GETARG_DATUM(0);
1842 		Datum		arg2 = PG_GETARG_DATUM(1);
1843 		Size		len1,
1844 					len2;
1845 
1846 		/* See comment in texteq() */
1847 		len1 = toast_raw_datum_size(arg1);
1848 		len2 = toast_raw_datum_size(arg2);
1849 		if (len1 != len2)
1850 			result = true;
1851 		else
1852 		{
1853 			text	   *targ1 = DatumGetTextPP(arg1);
1854 			text	   *targ2 = DatumGetTextPP(arg2);
1855 
1856 			result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1857 							 len1 - VARHDRSZ) != 0);
1858 
1859 			PG_FREE_IF_COPY(targ1, 0);
1860 			PG_FREE_IF_COPY(targ2, 1);
1861 		}
1862 	}
1863 	else
1864 	{
1865 		text	   *arg1 = PG_GETARG_TEXT_PP(0);
1866 		text	   *arg2 = PG_GETARG_TEXT_PP(1);
1867 
1868 		result = (text_cmp(arg1, arg2, collid) != 0);
1869 
1870 		PG_FREE_IF_COPY(arg1, 0);
1871 		PG_FREE_IF_COPY(arg2, 1);
1872 	}
1873 
1874 	PG_RETURN_BOOL(result);
1875 }
1876 
1877 Datum
1878 text_lt(PG_FUNCTION_ARGS)
1879 {
1880 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1881 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1882 	bool		result;
1883 
1884 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1885 
1886 	PG_FREE_IF_COPY(arg1, 0);
1887 	PG_FREE_IF_COPY(arg2, 1);
1888 
1889 	PG_RETURN_BOOL(result);
1890 }
1891 
1892 Datum
1893 text_le(PG_FUNCTION_ARGS)
1894 {
1895 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1896 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1897 	bool		result;
1898 
1899 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1900 
1901 	PG_FREE_IF_COPY(arg1, 0);
1902 	PG_FREE_IF_COPY(arg2, 1);
1903 
1904 	PG_RETURN_BOOL(result);
1905 }
1906 
1907 Datum
1908 text_gt(PG_FUNCTION_ARGS)
1909 {
1910 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1911 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1912 	bool		result;
1913 
1914 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1915 
1916 	PG_FREE_IF_COPY(arg1, 0);
1917 	PG_FREE_IF_COPY(arg2, 1);
1918 
1919 	PG_RETURN_BOOL(result);
1920 }
1921 
1922 Datum
1923 text_ge(PG_FUNCTION_ARGS)
1924 {
1925 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1926 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1927 	bool		result;
1928 
1929 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1930 
1931 	PG_FREE_IF_COPY(arg1, 0);
1932 	PG_FREE_IF_COPY(arg2, 1);
1933 
1934 	PG_RETURN_BOOL(result);
1935 }
1936 
1937 Datum
1938 text_starts_with(PG_FUNCTION_ARGS)
1939 {
1940 	Datum		arg1 = PG_GETARG_DATUM(0);
1941 	Datum		arg2 = PG_GETARG_DATUM(1);
1942 	Oid			collid = PG_GET_COLLATION();
1943 	pg_locale_t mylocale = 0;
1944 	bool		result;
1945 	Size		len1,
1946 				len2;
1947 
1948 	check_collation_set(collid);
1949 
1950 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1951 		mylocale = pg_newlocale_from_collation(collid);
1952 
1953 	if (mylocale && !mylocale->deterministic)
1954 		ereport(ERROR,
1955 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1956 				 errmsg("nondeterministic collations are not supported for substring searches")));
1957 
1958 	len1 = toast_raw_datum_size(arg1);
1959 	len2 = toast_raw_datum_size(arg2);
1960 	if (len2 > len1)
1961 		result = false;
1962 	else
1963 	{
1964 		text	   *targ1 = text_substring(arg1, 1, len2, false);
1965 		text	   *targ2 = DatumGetTextPP(arg2);
1966 
1967 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1968 						 VARSIZE_ANY_EXHDR(targ2)) == 0);
1969 
1970 		PG_FREE_IF_COPY(targ1, 0);
1971 		PG_FREE_IF_COPY(targ2, 1);
1972 	}
1973 
1974 	PG_RETURN_BOOL(result);
1975 }
1976 
1977 Datum
1978 bttextcmp(PG_FUNCTION_ARGS)
1979 {
1980 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1981 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1982 	int32		result;
1983 
1984 	result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1985 
1986 	PG_FREE_IF_COPY(arg1, 0);
1987 	PG_FREE_IF_COPY(arg2, 1);
1988 
1989 	PG_RETURN_INT32(result);
1990 }
1991 
1992 Datum
1993 bttextsortsupport(PG_FUNCTION_ARGS)
1994 {
1995 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1996 	Oid			collid = ssup->ssup_collation;
1997 	MemoryContext oldcontext;
1998 
1999 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2000 
2001 	/* Use generic string SortSupport */
2002 	varstr_sortsupport(ssup, TEXTOID, collid);
2003 
2004 	MemoryContextSwitchTo(oldcontext);
2005 
2006 	PG_RETURN_VOID();
2007 }
2008 
2009 /*
2010  * Generic sortsupport interface for character type's operator classes.
2011  * Includes locale support, and support for BpChar semantics (i.e. removing
2012  * trailing spaces before comparison).
2013  *
2014  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
2015  * same representation.  Callers that always use the C collation (e.g.
2016  * non-collatable type callers like bytea) may have NUL bytes in their strings;
2017  * this will not work with any other collation, though.
2018  */
2019 void
2020 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
2021 {
2022 	bool		abbreviate = ssup->abbreviate;
2023 	bool		collate_c = false;
2024 	VarStringSortSupport *sss;
2025 	pg_locale_t locale = 0;
2026 
2027 	check_collation_set(collid);
2028 
2029 	/*
2030 	 * If possible, set ssup->comparator to a function which can be used to
2031 	 * directly compare two datums.  If we can do this, we'll avoid the
2032 	 * overhead of a trip through the fmgr layer for every comparison, which
2033 	 * can be substantial.
2034 	 *
2035 	 * Most typically, we'll set the comparator to varlenafastcmp_locale,
2036 	 * which uses strcoll() to perform comparisons.  We use that for the
2037 	 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2038 	 * LC_COLLATE = C, we can make things quite a bit faster with
2039 	 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2040 	 * memcmp() rather than strcoll().
2041 	 */
2042 	if (lc_collate_is_c(collid))
2043 	{
2044 		if (typid == BPCHAROID)
2045 			ssup->comparator = bpcharfastcmp_c;
2046 		else if (typid == NAMEOID)
2047 		{
2048 			ssup->comparator = namefastcmp_c;
2049 			/* Not supporting abbreviation with type NAME, for now */
2050 			abbreviate = false;
2051 		}
2052 		else
2053 			ssup->comparator = varstrfastcmp_c;
2054 
2055 		collate_c = true;
2056 	}
2057 	else
2058 	{
2059 		/*
2060 		 * We need a collation-sensitive comparison.  To make things faster,
2061 		 * we'll figure out the collation based on the locale id and cache the
2062 		 * result.
2063 		 */
2064 		if (collid != DEFAULT_COLLATION_OID)
2065 			locale = pg_newlocale_from_collation(collid);
2066 
2067 		/*
2068 		 * There is a further exception on Windows.  When the database
2069 		 * encoding is UTF-8 and we are not using the C collation, complex
2070 		 * hacks are required.  We don't currently have a comparator that
2071 		 * handles that case, so we fall back on the slow method of having the
2072 		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
2073 		 * trampoline.  ICU locales work just the same on Windows, however.
2074 		 */
2075 #ifdef WIN32
2076 		if (GetDatabaseEncoding() == PG_UTF8 &&
2077 			!(locale && locale->provider == COLLPROVIDER_ICU))
2078 			return;
2079 #endif
2080 
2081 		/*
2082 		 * We use varlenafastcmp_locale except for type NAME.
2083 		 */
2084 		if (typid == NAMEOID)
2085 		{
2086 			ssup->comparator = namefastcmp_locale;
2087 			/* Not supporting abbreviation with type NAME, for now */
2088 			abbreviate = false;
2089 		}
2090 		else
2091 			ssup->comparator = varlenafastcmp_locale;
2092 	}
2093 
2094 	/*
2095 	 * Unfortunately, it seems that abbreviation for non-C collations is
2096 	 * broken on many common platforms; testing of multiple versions of glibc
2097 	 * reveals that, for many locales, strcoll() and strxfrm() do not return
2098 	 * consistent results, which is fatal to this optimization.  While no
2099 	 * other libc other than Cygwin has so far been shown to have a problem,
2100 	 * we take the conservative course of action for right now and disable
2101 	 * this categorically.  (Users who are certain this isn't a problem on
2102 	 * their system can define TRUST_STRXFRM.)
2103 	 *
2104 	 * Even apart from the risk of broken locales, it's possible that there
2105 	 * are platforms where the use of abbreviated keys should be disabled at
2106 	 * compile time.  Having only 4 byte datums could make worst-case
2107 	 * performance drastically more likely, for example.  Moreover, macOS's
2108 	 * strxfrm() implementation is known to not effectively concentrate a
2109 	 * significant amount of entropy from the original string in earlier
2110 	 * transformed blobs.  It's possible that other supported platforms are
2111 	 * similarly encumbered.  So, if we ever get past disabling this
2112 	 * categorically, we may still want or need to disable it for particular
2113 	 * platforms.
2114 	 */
2115 #ifndef TRUST_STRXFRM
2116 	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2117 		abbreviate = false;
2118 #endif
2119 
2120 	/*
2121 	 * If we're using abbreviated keys, or if we're using a locale-aware
2122 	 * comparison, we need to initialize a VarStringSortSupport object. Both
2123 	 * cases will make use of the temporary buffers we initialize here for
2124 	 * scratch space (and to detect requirement for BpChar semantics from
2125 	 * caller), and the abbreviation case requires additional state.
2126 	 */
2127 	if (abbreviate || !collate_c)
2128 	{
2129 		sss = palloc(sizeof(VarStringSortSupport));
2130 		sss->buf1 = palloc(TEXTBUFLEN);
2131 		sss->buflen1 = TEXTBUFLEN;
2132 		sss->buf2 = palloc(TEXTBUFLEN);
2133 		sss->buflen2 = TEXTBUFLEN;
2134 		/* Start with invalid values */
2135 		sss->last_len1 = -1;
2136 		sss->last_len2 = -1;
2137 		/* Initialize */
2138 		sss->last_returned = 0;
2139 		sss->locale = locale;
2140 
2141 		/*
2142 		 * To avoid somehow confusing a strxfrm() blob and an original string,
2143 		 * constantly keep track of the variety of data that buf1 and buf2
2144 		 * currently contain.
2145 		 *
2146 		 * Comparisons may be interleaved with conversion calls.  Frequently,
2147 		 * conversions and comparisons are batched into two distinct phases,
2148 		 * but the correctness of caching cannot hinge upon this.  For
2149 		 * comparison caching, buffer state is only trusted if cache_blob is
2150 		 * found set to false, whereas strxfrm() caching only trusts the state
2151 		 * when cache_blob is found set to true.
2152 		 *
2153 		 * Arbitrarily initialize cache_blob to true.
2154 		 */
2155 		sss->cache_blob = true;
2156 		sss->collate_c = collate_c;
2157 		sss->typid = typid;
2158 		ssup->ssup_extra = sss;
2159 
2160 		/*
2161 		 * If possible, plan to use the abbreviated keys optimization.  The
2162 		 * core code may switch back to authoritative comparator should
2163 		 * abbreviation be aborted.
2164 		 */
2165 		if (abbreviate)
2166 		{
2167 			sss->prop_card = 0.20;
2168 			initHyperLogLog(&sss->abbr_card, 10);
2169 			initHyperLogLog(&sss->full_card, 10);
2170 			ssup->abbrev_full_comparator = ssup->comparator;
2171 			ssup->comparator = varstrcmp_abbrev;
2172 			ssup->abbrev_converter = varstr_abbrev_convert;
2173 			ssup->abbrev_abort = varstr_abbrev_abort;
2174 		}
2175 	}
2176 }
2177 
2178 /*
2179  * sortsupport comparison func (for C locale case)
2180  */
2181 static int
2182 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2183 {
2184 	VarString  *arg1 = DatumGetVarStringPP(x);
2185 	VarString  *arg2 = DatumGetVarStringPP(y);
2186 	char	   *a1p,
2187 			   *a2p;
2188 	int			len1,
2189 				len2,
2190 				result;
2191 
2192 	a1p = VARDATA_ANY(arg1);
2193 	a2p = VARDATA_ANY(arg2);
2194 
2195 	len1 = VARSIZE_ANY_EXHDR(arg1);
2196 	len2 = VARSIZE_ANY_EXHDR(arg2);
2197 
2198 	result = memcmp(a1p, a2p, Min(len1, len2));
2199 	if ((result == 0) && (len1 != len2))
2200 		result = (len1 < len2) ? -1 : 1;
2201 
2202 	/* We can't afford to leak memory here. */
2203 	if (PointerGetDatum(arg1) != x)
2204 		pfree(arg1);
2205 	if (PointerGetDatum(arg2) != y)
2206 		pfree(arg2);
2207 
2208 	return result;
2209 }
2210 
2211 /*
2212  * sortsupport comparison func (for BpChar C locale case)
2213  *
2214  * BpChar outsources its sortsupport to this module.  Specialization for the
2215  * varstr_sortsupport BpChar case, modeled on
2216  * internal_bpchar_pattern_compare().
2217  */
2218 static int
2219 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2220 {
2221 	BpChar	   *arg1 = DatumGetBpCharPP(x);
2222 	BpChar	   *arg2 = DatumGetBpCharPP(y);
2223 	char	   *a1p,
2224 			   *a2p;
2225 	int			len1,
2226 				len2,
2227 				result;
2228 
2229 	a1p = VARDATA_ANY(arg1);
2230 	a2p = VARDATA_ANY(arg2);
2231 
2232 	len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2233 	len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2234 
2235 	result = memcmp(a1p, a2p, Min(len1, len2));
2236 	if ((result == 0) && (len1 != len2))
2237 		result = (len1 < len2) ? -1 : 1;
2238 
2239 	/* We can't afford to leak memory here. */
2240 	if (PointerGetDatum(arg1) != x)
2241 		pfree(arg1);
2242 	if (PointerGetDatum(arg2) != y)
2243 		pfree(arg2);
2244 
2245 	return result;
2246 }
2247 
2248 /*
2249  * sortsupport comparison func (for NAME C locale case)
2250  */
2251 static int
2252 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2253 {
2254 	Name		arg1 = DatumGetName(x);
2255 	Name		arg2 = DatumGetName(y);
2256 
2257 	return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2258 }
2259 
2260 /*
2261  * sortsupport comparison func (for locale case with all varlena types)
2262  */
2263 static int
2264 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2265 {
2266 	VarString  *arg1 = DatumGetVarStringPP(x);
2267 	VarString  *arg2 = DatumGetVarStringPP(y);
2268 	char	   *a1p,
2269 			   *a2p;
2270 	int			len1,
2271 				len2,
2272 				result;
2273 
2274 	a1p = VARDATA_ANY(arg1);
2275 	a2p = VARDATA_ANY(arg2);
2276 
2277 	len1 = VARSIZE_ANY_EXHDR(arg1);
2278 	len2 = VARSIZE_ANY_EXHDR(arg2);
2279 
2280 	result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2281 
2282 	/* We can't afford to leak memory here. */
2283 	if (PointerGetDatum(arg1) != x)
2284 		pfree(arg1);
2285 	if (PointerGetDatum(arg2) != y)
2286 		pfree(arg2);
2287 
2288 	return result;
2289 }
2290 
2291 /*
2292  * sortsupport comparison func (for locale case with NAME type)
2293  */
2294 static int
2295 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2296 {
2297 	Name		arg1 = DatumGetName(x);
2298 	Name		arg2 = DatumGetName(y);
2299 
2300 	return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2301 								NameStr(*arg2), strlen(NameStr(*arg2)),
2302 								ssup);
2303 }
2304 
2305 /*
2306  * sortsupport comparison func for locale cases
2307  */
2308 static int
2309 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2310 {
2311 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2312 	int			result;
2313 	bool		arg1_match;
2314 
2315 	/* Fast pre-check for equality, as discussed in varstr_cmp() */
2316 	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2317 	{
2318 		/*
2319 		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2320 		 * last_len2.  Existing contents of buffers might still be used by
2321 		 * next call.
2322 		 *
2323 		 * It's fine to allow the comparison of BpChar padding bytes here,
2324 		 * even though that implies that the memcmp() will usually be
2325 		 * performed for BpChar callers (though multibyte characters could
2326 		 * still prevent that from occurring).  The memcmp() is still very
2327 		 * cheap, and BpChar's funny semantics have us remove trailing spaces
2328 		 * (not limited to padding), so we need make no distinction between
2329 		 * padding space characters and "real" space characters.
2330 		 */
2331 		return 0;
2332 	}
2333 
2334 	if (sss->typid == BPCHAROID)
2335 	{
2336 		/* Get true number of bytes, ignoring trailing spaces */
2337 		len1 = bpchartruelen(a1p, len1);
2338 		len2 = bpchartruelen(a2p, len2);
2339 	}
2340 
2341 	if (len1 >= sss->buflen1)
2342 	{
2343 		pfree(sss->buf1);
2344 		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2345 		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2346 	}
2347 	if (len2 >= sss->buflen2)
2348 	{
2349 		pfree(sss->buf2);
2350 		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2351 		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2352 	}
2353 
2354 	/*
2355 	 * We're likely to be asked to compare the same strings repeatedly, and
2356 	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2357 	 * comparisons, even though in general there is no reason to think that
2358 	 * that will work out (every string datum may be unique).  Caching does
2359 	 * not slow things down measurably when it doesn't work out, and can speed
2360 	 * things up by rather a lot when it does.  In part, this is because the
2361 	 * memcmp() compares data from cachelines that are needed in L1 cache even
2362 	 * when the last comparison's result cannot be reused.
2363 	 */
2364 	arg1_match = true;
2365 	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2366 	{
2367 		arg1_match = false;
2368 		memcpy(sss->buf1, a1p, len1);
2369 		sss->buf1[len1] = '\0';
2370 		sss->last_len1 = len1;
2371 	}
2372 
2373 	/*
2374 	 * If we're comparing the same two strings as last time, we can return the
2375 	 * same answer without calling strcoll() again.  This is more likely than
2376 	 * it seems (at least with moderate to low cardinality sets), because
2377 	 * quicksort compares the same pivot against many values.
2378 	 */
2379 	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2380 	{
2381 		memcpy(sss->buf2, a2p, len2);
2382 		sss->buf2[len2] = '\0';
2383 		sss->last_len2 = len2;
2384 	}
2385 	else if (arg1_match && !sss->cache_blob)
2386 	{
2387 		/* Use result cached following last actual strcoll() call */
2388 		return sss->last_returned;
2389 	}
2390 
2391 	if (sss->locale)
2392 	{
2393 		if (sss->locale->provider == COLLPROVIDER_ICU)
2394 		{
2395 #ifdef USE_ICU
2396 #ifdef HAVE_UCOL_STRCOLLUTF8
2397 			if (GetDatabaseEncoding() == PG_UTF8)
2398 			{
2399 				UErrorCode	status;
2400 
2401 				status = U_ZERO_ERROR;
2402 				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
2403 										  a1p, len1,
2404 										  a2p, len2,
2405 										  &status);
2406 				if (U_FAILURE(status))
2407 					ereport(ERROR,
2408 							(errmsg("collation failed: %s", u_errorName(status))));
2409 			}
2410 			else
2411 #endif
2412 			{
2413 				int32_t		ulen1,
2414 							ulen2;
2415 				UChar	   *uchar1,
2416 						   *uchar2;
2417 
2418 				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
2419 				ulen2 = icu_to_uchar(&uchar2, a2p, len2);
2420 
2421 				result = ucol_strcoll(sss->locale->info.icu.ucol,
2422 									  uchar1, ulen1,
2423 									  uchar2, ulen2);
2424 
2425 				pfree(uchar1);
2426 				pfree(uchar2);
2427 			}
2428 #else							/* not USE_ICU */
2429 			/* shouldn't happen */
2430 			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2431 #endif							/* not USE_ICU */
2432 		}
2433 		else
2434 		{
2435 #ifdef HAVE_LOCALE_T
2436 			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
2437 #else
2438 			/* shouldn't happen */
2439 			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
2440 #endif
2441 		}
2442 	}
2443 	else
2444 		result = strcoll(sss->buf1, sss->buf2);
2445 
2446 	/* Break tie if necessary. */
2447 	if (result == 0 &&
2448 		(!sss->locale || sss->locale->deterministic))
2449 		result = strcmp(sss->buf1, sss->buf2);
2450 
2451 	/* Cache result, perhaps saving an expensive strcoll() call next time */
2452 	sss->cache_blob = false;
2453 	sss->last_returned = result;
2454 	return result;
2455 }
2456 
2457 /*
2458  * Abbreviated key comparison func
2459  */
2460 static int
2461 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2462 {
2463 	/*
2464 	 * When 0 is returned, the core system will call varstrfastcmp_c()
2465 	 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale().  Even a
2466 	 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2467 	 * authoritatively, for the same reason that there is a strcoll()
2468 	 * tie-breaker call to strcmp() in varstr_cmp().
2469 	 */
2470 	if (x > y)
2471 		return 1;
2472 	else if (x == y)
2473 		return 0;
2474 	else
2475 		return -1;
2476 }
2477 
2478 /*
2479  * Conversion routine for sortsupport.  Converts original to abbreviated key
2480  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2481  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2482  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2483  * locale is used, or in case of bytea, just memcpy() from original instead.
2484  */
2485 static Datum
2486 varstr_abbrev_convert(Datum original, SortSupport ssup)
2487 {
2488 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2489 	VarString  *authoritative = DatumGetVarStringPP(original);
2490 	char	   *authoritative_data = VARDATA_ANY(authoritative);
2491 
2492 	/* working state */
2493 	Datum		res;
2494 	char	   *pres;
2495 	int			len;
2496 	uint32		hash;
2497 
2498 	pres = (char *) &res;
2499 	/* memset(), so any non-overwritten bytes are NUL */
2500 	memset(pres, 0, sizeof(Datum));
2501 	len = VARSIZE_ANY_EXHDR(authoritative);
2502 
2503 	/* Get number of bytes, ignoring trailing spaces */
2504 	if (sss->typid == BPCHAROID)
2505 		len = bpchartruelen(authoritative_data, len);
2506 
2507 	/*
2508 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2509 	 * abbreviate keys.  The full comparator for the C locale is always
2510 	 * memcmp().  It would be incorrect to allow bytea callers (callers that
2511 	 * always force the C collation -- bytea isn't a collatable type, but this
2512 	 * approach is convenient) to use strxfrm().  This is because bytea
2513 	 * strings may contain NUL bytes.  Besides, this should be faster, too.
2514 	 *
2515 	 * More generally, it's okay that bytea callers can have NUL bytes in
2516 	 * strings because varstrcmp_abbrev() need not make a distinction between
2517 	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2518 	 * authoritative representation.  Hopefully a comparison at or past one
2519 	 * abbreviated key's terminating NUL byte will resolve the comparison
2520 	 * without consulting the authoritative representation; specifically, some
2521 	 * later non-NUL byte in the longer string can resolve the comparison
2522 	 * against a subsequent terminating NUL in the shorter string.  There will
2523 	 * usually be what is effectively a "length-wise" resolution there and
2524 	 * then.
2525 	 *
2526 	 * If that doesn't work out -- if all bytes in the longer string
2527 	 * positioned at or past the offset of the smaller string's (first)
2528 	 * terminating NUL are actually representative of NUL bytes in the
2529 	 * authoritative binary string (perhaps with some *terminating* NUL bytes
2530 	 * towards the end of the longer string iff it happens to still be small)
2531 	 * -- then an authoritative tie-breaker will happen, and do the right
2532 	 * thing: explicitly consider string length.
2533 	 */
2534 	if (sss->collate_c)
2535 		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2536 	else
2537 	{
2538 		Size		bsize;
2539 #ifdef USE_ICU
2540 		int32_t		ulen = -1;
2541 		UChar	   *uchar = NULL;
2542 #endif
2543 
2544 		/*
2545 		 * We're not using the C collation, so fall back on strxfrm or ICU
2546 		 * analogs.
2547 		 */
2548 
2549 		/* By convention, we use buffer 1 to store and NUL-terminate */
2550 		if (len >= sss->buflen1)
2551 		{
2552 			pfree(sss->buf1);
2553 			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2554 			sss->buf1 = palloc(sss->buflen1);
2555 		}
2556 
2557 		/* Might be able to reuse strxfrm() blob from last call */
2558 		if (sss->last_len1 == len && sss->cache_blob &&
2559 			memcmp(sss->buf1, authoritative_data, len) == 0)
2560 		{
2561 			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2562 			/* No change affecting cardinality, so no hashing required */
2563 			goto done;
2564 		}
2565 
2566 		memcpy(sss->buf1, authoritative_data, len);
2567 
2568 		/*
2569 		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2570 		 * necessary for ICU, but doesn't hurt.
2571 		 */
2572 		sss->buf1[len] = '\0';
2573 		sss->last_len1 = len;
2574 
2575 #ifdef USE_ICU
2576 		/* When using ICU and not UTF8, convert string to UChar. */
2577 		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2578 			GetDatabaseEncoding() != PG_UTF8)
2579 			ulen = icu_to_uchar(&uchar, sss->buf1, len);
2580 #endif
2581 
2582 		/*
2583 		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2584 		 * and try again.  Both of these functions have the result buffer
2585 		 * content undefined if the result did not fit, so we need to retry
2586 		 * until everything fits, even though we only need the first few bytes
2587 		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
2588 		 * ask for as many bytes as we actually need.
2589 		 */
2590 		for (;;)
2591 		{
2592 #ifdef USE_ICU
2593 			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2594 			{
2595 				/*
2596 				 * When using UTF8, use the iteration interface so we only
2597 				 * need to produce as many bytes as we actually need.
2598 				 */
2599 				if (GetDatabaseEncoding() == PG_UTF8)
2600 				{
2601 					UCharIterator iter;
2602 					uint32_t	state[2];
2603 					UErrorCode	status;
2604 
2605 					uiter_setUTF8(&iter, sss->buf1, len);
2606 					state[0] = state[1] = 0;	/* won't need that again */
2607 					status = U_ZERO_ERROR;
2608 					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2609 												 &iter,
2610 												 state,
2611 												 (uint8_t *) sss->buf2,
2612 												 Min(sizeof(Datum), sss->buflen2),
2613 												 &status);
2614 					if (U_FAILURE(status))
2615 						ereport(ERROR,
2616 								(errmsg("sort key generation failed: %s",
2617 										u_errorName(status))));
2618 				}
2619 				else
2620 					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2621 											uchar, ulen,
2622 											(uint8_t *) sss->buf2, sss->buflen2);
2623 			}
2624 			else
2625 #endif
2626 #ifdef HAVE_LOCALE_T
2627 			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2628 				bsize = strxfrm_l(sss->buf2, sss->buf1,
2629 								  sss->buflen2, sss->locale->info.lt);
2630 			else
2631 #endif
2632 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2633 
2634 			sss->last_len2 = bsize;
2635 			if (bsize < sss->buflen2)
2636 				break;
2637 
2638 			/*
2639 			 * Grow buffer and retry.
2640 			 */
2641 			pfree(sss->buf2);
2642 			sss->buflen2 = Max(bsize + 1,
2643 							   Min(sss->buflen2 * 2, MaxAllocSize));
2644 			sss->buf2 = palloc(sss->buflen2);
2645 		}
2646 
2647 		/*
2648 		 * Every Datum byte is always compared.  This is safe because the
2649 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
2650 		 * misinterpreting any NUL bytes not intended to be interpreted as
2651 		 * logically representing termination.
2652 		 *
2653 		 * (Actually, even if there were NUL bytes in the blob it would be
2654 		 * okay.  See remarks on bytea case above.)
2655 		 */
2656 		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2657 
2658 #ifdef USE_ICU
2659 		if (uchar)
2660 			pfree(uchar);
2661 #endif
2662 	}
2663 
2664 	/*
2665 	 * Maintain approximate cardinality of both abbreviated keys and original,
2666 	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
2667 	 * the worst case, where we do many string transformations for no saving
2668 	 * in full strcoll()-based comparisons.  These statistics are used by
2669 	 * varstr_abbrev_abort().
2670 	 *
2671 	 * First, Hash key proper, or a significant fraction of it.  Mix in length
2672 	 * in order to compensate for cases where differences are past
2673 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2674 	 */
2675 	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2676 								   Min(len, PG_CACHE_LINE_SIZE)));
2677 
2678 	if (len > PG_CACHE_LINE_SIZE)
2679 		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2680 
2681 	addHyperLogLog(&sss->full_card, hash);
2682 
2683 	/* Hash abbreviated key */
2684 #if SIZEOF_DATUM == 8
2685 	{
2686 		uint32		lohalf,
2687 					hihalf;
2688 
2689 		lohalf = (uint32) res;
2690 		hihalf = (uint32) (res >> 32);
2691 		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2692 	}
2693 #else							/* SIZEOF_DATUM != 8 */
2694 	hash = DatumGetUInt32(hash_uint32((uint32) res));
2695 #endif
2696 
2697 	addHyperLogLog(&sss->abbr_card, hash);
2698 
2699 	/* Cache result, perhaps saving an expensive strxfrm() call next time */
2700 	sss->cache_blob = true;
2701 done:
2702 
2703 	/*
2704 	 * Byteswap on little-endian machines.
2705 	 *
2706 	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2707 	 * comparator) works correctly on all platforms.  If we didn't do this,
2708 	 * the comparator would have to call memcmp() with a pair of pointers to
2709 	 * the first byte of each abbreviated key, which is slower.
2710 	 */
2711 	res = DatumBigEndianToNative(res);
2712 
2713 	/* Don't leak memory here */
2714 	if (PointerGetDatum(authoritative) != original)
2715 		pfree(authoritative);
2716 
2717 	return res;
2718 }
2719 
2720 /*
2721  * Callback for estimating effectiveness of abbreviated key optimization, using
2722  * heuristic rules.  Returns value indicating if the abbreviation optimization
2723  * should be aborted, based on its projected effectiveness.
2724  */
2725 static bool
2726 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2727 {
2728 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2729 	double		abbrev_distinct,
2730 				key_distinct;
2731 
2732 	Assert(ssup->abbreviate);
2733 
2734 	/* Have a little patience */
2735 	if (memtupcount < 100)
2736 		return false;
2737 
2738 	abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2739 	key_distinct = estimateHyperLogLog(&sss->full_card);
2740 
2741 	/*
2742 	 * Clamp cardinality estimates to at least one distinct value.  While
2743 	 * NULLs are generally disregarded, if only NULL values were seen so far,
2744 	 * that might misrepresent costs if we failed to clamp.
2745 	 */
2746 	if (abbrev_distinct <= 1.0)
2747 		abbrev_distinct = 1.0;
2748 
2749 	if (key_distinct <= 1.0)
2750 		key_distinct = 1.0;
2751 
2752 	/*
2753 	 * In the worst case all abbreviated keys are identical, while at the same
2754 	 * time there are differences within full key strings not captured in
2755 	 * abbreviations.
2756 	 */
2757 #ifdef TRACE_SORT
2758 	if (trace_sort)
2759 	{
2760 		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2761 
2762 		elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2763 			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2764 			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2765 			 sss->prop_card);
2766 	}
2767 #endif
2768 
2769 	/*
2770 	 * If the number of distinct abbreviated keys approximately matches the
2771 	 * number of distinct authoritative original keys, that's reason enough to
2772 	 * proceed.  We can win even with a very low cardinality set if most
2773 	 * tie-breakers only memcmp().  This is by far the most important
2774 	 * consideration.
2775 	 *
2776 	 * While comparisons that are resolved at the abbreviated key level are
2777 	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2778 	 * those two outcomes are so much cheaper than a full strcoll() once
2779 	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2780 	 * cardinality against the overall size of the set in order to more
2781 	 * accurately model costs.  Assume that an abbreviated comparison, and an
2782 	 * abbreviated comparison with a cheap memcmp()-based authoritative
2783 	 * resolution are equivalent.
2784 	 */
2785 	if (abbrev_distinct > key_distinct * sss->prop_card)
2786 	{
2787 		/*
2788 		 * When we have exceeded 10,000 tuples, decay required cardinality
2789 		 * aggressively for next call.
2790 		 *
2791 		 * This is useful because the number of comparisons required on
2792 		 * average increases at a linearithmic rate, and at roughly 10,000
2793 		 * tuples that factor will start to dominate over the linear costs of
2794 		 * string transformation (this is a conservative estimate).  The decay
2795 		 * rate is chosen to be a little less aggressive than halving -- which
2796 		 * (since we're called at points at which memtupcount has doubled)
2797 		 * would never see the cost model actually abort past the first call
2798 		 * following a decay.  This decay rate is mostly a precaution against
2799 		 * a sudden, violent swing in how well abbreviated cardinality tracks
2800 		 * full key cardinality.  The decay also serves to prevent a marginal
2801 		 * case from being aborted too late, when too much has already been
2802 		 * invested in string transformation.
2803 		 *
2804 		 * It's possible for sets of several million distinct strings with
2805 		 * mere tens of thousands of distinct abbreviated keys to still
2806 		 * benefit very significantly.  This will generally occur provided
2807 		 * each abbreviated key is a proxy for a roughly uniform number of the
2808 		 * set's full keys. If it isn't so, we hope to catch that early and
2809 		 * abort.  If it isn't caught early, by the time the problem is
2810 		 * apparent it's probably not worth aborting.
2811 		 */
2812 		if (memtupcount > 10000)
2813 			sss->prop_card *= 0.65;
2814 
2815 		return false;
2816 	}
2817 
2818 	/*
2819 	 * Abort abbreviation strategy.
2820 	 *
2821 	 * The worst case, where all abbreviated keys are identical while all
2822 	 * original strings differ will typically only see a regression of about
2823 	 * 10% in execution time for small to medium sized lists of strings.
2824 	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2825 	 * often expect very large improvements, particularly with sets of strings
2826 	 * of moderately high to high abbreviated cardinality.  There is little to
2827 	 * lose but much to gain, which our strategy reflects.
2828 	 */
2829 #ifdef TRACE_SORT
2830 	if (trace_sort)
2831 		elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2832 			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2833 			 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2834 #endif
2835 
2836 	return true;
2837 }
2838 
2839 /*
2840  * Generic equalimage support function for character type's operator classes.
2841  * Disables the use of deduplication with nondeterministic collations.
2842  */
2843 Datum
2844 btvarstrequalimage(PG_FUNCTION_ARGS)
2845 {
2846 	/* Oid		opcintype = PG_GETARG_OID(0); */
2847 	Oid			collid = PG_GET_COLLATION();
2848 
2849 	check_collation_set(collid);
2850 
2851 	if (lc_collate_is_c(collid) ||
2852 		collid == DEFAULT_COLLATION_OID ||
2853 		get_collation_isdeterministic(collid))
2854 		PG_RETURN_BOOL(true);
2855 	else
2856 		PG_RETURN_BOOL(false);
2857 }
2858 
2859 Datum
2860 text_larger(PG_FUNCTION_ARGS)
2861 {
2862 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2863 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2864 	text	   *result;
2865 
2866 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2867 
2868 	PG_RETURN_TEXT_P(result);
2869 }
2870 
2871 Datum
2872 text_smaller(PG_FUNCTION_ARGS)
2873 {
2874 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2875 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2876 	text	   *result;
2877 
2878 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2879 
2880 	PG_RETURN_TEXT_P(result);
2881 }
2882 
2883 
2884 /*
2885  * Cross-type comparison functions for types text and name.
2886  */
2887 
2888 Datum
2889 nameeqtext(PG_FUNCTION_ARGS)
2890 {
2891 	Name		arg1 = PG_GETARG_NAME(0);
2892 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2893 	size_t		len1 = strlen(NameStr(*arg1));
2894 	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
2895 	Oid			collid = PG_GET_COLLATION();
2896 	bool		result;
2897 
2898 	check_collation_set(collid);
2899 
2900 	if (collid == C_COLLATION_OID)
2901 		result = (len1 == len2 &&
2902 				  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2903 	else
2904 		result = (varstr_cmp(NameStr(*arg1), len1,
2905 							 VARDATA_ANY(arg2), len2,
2906 							 collid) == 0);
2907 
2908 	PG_FREE_IF_COPY(arg2, 1);
2909 
2910 	PG_RETURN_BOOL(result);
2911 }
2912 
2913 Datum
2914 texteqname(PG_FUNCTION_ARGS)
2915 {
2916 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2917 	Name		arg2 = PG_GETARG_NAME(1);
2918 	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
2919 	size_t		len2 = strlen(NameStr(*arg2));
2920 	Oid			collid = PG_GET_COLLATION();
2921 	bool		result;
2922 
2923 	check_collation_set(collid);
2924 
2925 	if (collid == C_COLLATION_OID)
2926 		result = (len1 == len2 &&
2927 				  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2928 	else
2929 		result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2930 							 NameStr(*arg2), len2,
2931 							 collid) == 0);
2932 
2933 	PG_FREE_IF_COPY(arg1, 0);
2934 
2935 	PG_RETURN_BOOL(result);
2936 }
2937 
2938 Datum
2939 namenetext(PG_FUNCTION_ARGS)
2940 {
2941 	Name		arg1 = PG_GETARG_NAME(0);
2942 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2943 	size_t		len1 = strlen(NameStr(*arg1));
2944 	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
2945 	Oid			collid = PG_GET_COLLATION();
2946 	bool		result;
2947 
2948 	check_collation_set(collid);
2949 
2950 	if (collid == C_COLLATION_OID)
2951 		result = !(len1 == len2 &&
2952 				   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2953 	else
2954 		result = !(varstr_cmp(NameStr(*arg1), len1,
2955 							  VARDATA_ANY(arg2), len2,
2956 							  collid) == 0);
2957 
2958 	PG_FREE_IF_COPY(arg2, 1);
2959 
2960 	PG_RETURN_BOOL(result);
2961 }
2962 
2963 Datum
2964 textnename(PG_FUNCTION_ARGS)
2965 {
2966 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2967 	Name		arg2 = PG_GETARG_NAME(1);
2968 	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
2969 	size_t		len2 = strlen(NameStr(*arg2));
2970 	Oid			collid = PG_GET_COLLATION();
2971 	bool		result;
2972 
2973 	check_collation_set(collid);
2974 
2975 	if (collid == C_COLLATION_OID)
2976 		result = !(len1 == len2 &&
2977 				   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2978 	else
2979 		result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2980 							  NameStr(*arg2), len2,
2981 							  collid) == 0);
2982 
2983 	PG_FREE_IF_COPY(arg1, 0);
2984 
2985 	PG_RETURN_BOOL(result);
2986 }
2987 
2988 Datum
2989 btnametextcmp(PG_FUNCTION_ARGS)
2990 {
2991 	Name		arg1 = PG_GETARG_NAME(0);
2992 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2993 	int32		result;
2994 
2995 	result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2996 						VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2997 						PG_GET_COLLATION());
2998 
2999 	PG_FREE_IF_COPY(arg2, 1);
3000 
3001 	PG_RETURN_INT32(result);
3002 }
3003 
3004 Datum
3005 bttextnamecmp(PG_FUNCTION_ARGS)
3006 {
3007 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3008 	Name		arg2 = PG_GETARG_NAME(1);
3009 	int32		result;
3010 
3011 	result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
3012 						NameStr(*arg2), strlen(NameStr(*arg2)),
3013 						PG_GET_COLLATION());
3014 
3015 	PG_FREE_IF_COPY(arg1, 0);
3016 
3017 	PG_RETURN_INT32(result);
3018 }
3019 
/*
 * CmpCall(cmpfunc)
 *	  Invoke the given comparison function through DirectFunctionCall2Coll,
 *	  forwarding the current call's collation and its first two argument
 *	  datums unchanged, and convert the returned Datum to int32.
 *
 * It expands PG_GET_COLLATION() and PG_GETARG_DATUM(), so it is only usable
 * inside a function declared with PG_FUNCTION_ARGS.  The macro is local to
 * the operator wrappers that follow and is #undef'd after them.
 */
#define CmpCall(cmpfunc) \
	DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
3025 
3026 Datum
3027 namelttext(PG_FUNCTION_ARGS)
3028 {
3029 	PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
3030 }
3031 
3032 Datum
3033 nameletext(PG_FUNCTION_ARGS)
3034 {
3035 	PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3036 }
3037 
3038 Datum
3039 namegttext(PG_FUNCTION_ARGS)
3040 {
3041 	PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3042 }
3043 
3044 Datum
3045 namegetext(PG_FUNCTION_ARGS)
3046 {
3047 	PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3048 }
3049 
3050 Datum
3051 textltname(PG_FUNCTION_ARGS)
3052 {
3053 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3054 }
3055 
3056 Datum
3057 textlename(PG_FUNCTION_ARGS)
3058 {
3059 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3060 }
3061 
3062 Datum
3063 textgtname(PG_FUNCTION_ARGS)
3064 {
3065 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3066 }
3067 
3068 Datum
3069 textgename(PG_FUNCTION_ARGS)
3070 {
3071 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3072 }
3073 
3074 #undef CmpCall
3075 
3076 
3077 /*
3078  * The following operators support character-by-character comparison
3079  * of text datums, to allow building indexes suitable for LIKE clauses.
3080  * Note that the regular texteq/textne comparison operators, and regular
3081  * support functions 1 and 2 with "C" collation are assumed to be
3082  * compatible with these!
3083  */
3084 
3085 static int
3086 internal_text_pattern_compare(text *arg1, text *arg2)
3087 {
3088 	int			result;
3089 	int			len1,
3090 				len2;
3091 
3092 	len1 = VARSIZE_ANY_EXHDR(arg1);
3093 	len2 = VARSIZE_ANY_EXHDR(arg2);
3094 
3095 	result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3096 	if (result != 0)
3097 		return result;
3098 	else if (len1 < len2)
3099 		return -1;
3100 	else if (len1 > len2)
3101 		return 1;
3102 	else
3103 		return 0;
3104 }
3105 
3106 
3107 Datum
3108 text_pattern_lt(PG_FUNCTION_ARGS)
3109 {
3110 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3111 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3112 	int			result;
3113 
3114 	result = internal_text_pattern_compare(arg1, arg2);
3115 
3116 	PG_FREE_IF_COPY(arg1, 0);
3117 	PG_FREE_IF_COPY(arg2, 1);
3118 
3119 	PG_RETURN_BOOL(result < 0);
3120 }
3121 
3122 
3123 Datum
3124 text_pattern_le(PG_FUNCTION_ARGS)
3125 {
3126 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3127 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3128 	int			result;
3129 
3130 	result = internal_text_pattern_compare(arg1, arg2);
3131 
3132 	PG_FREE_IF_COPY(arg1, 0);
3133 	PG_FREE_IF_COPY(arg2, 1);
3134 
3135 	PG_RETURN_BOOL(result <= 0);
3136 }
3137 
3138 
3139 Datum
3140 text_pattern_ge(PG_FUNCTION_ARGS)
3141 {
3142 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3143 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3144 	int			result;
3145 
3146 	result = internal_text_pattern_compare(arg1, arg2);
3147 
3148 	PG_FREE_IF_COPY(arg1, 0);
3149 	PG_FREE_IF_COPY(arg2, 1);
3150 
3151 	PG_RETURN_BOOL(result >= 0);
3152 }
3153 
3154 
3155 Datum
3156 text_pattern_gt(PG_FUNCTION_ARGS)
3157 {
3158 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3159 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3160 	int			result;
3161 
3162 	result = internal_text_pattern_compare(arg1, arg2);
3163 
3164 	PG_FREE_IF_COPY(arg1, 0);
3165 	PG_FREE_IF_COPY(arg2, 1);
3166 
3167 	PG_RETURN_BOOL(result > 0);
3168 }
3169 
3170 
3171 Datum
3172 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3173 {
3174 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3175 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3176 	int			result;
3177 
3178 	result = internal_text_pattern_compare(arg1, arg2);
3179 
3180 	PG_FREE_IF_COPY(arg1, 0);
3181 	PG_FREE_IF_COPY(arg2, 1);
3182 
3183 	PG_RETURN_INT32(result);
3184 }
3185 
3186 
3187 Datum
3188 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3189 {
3190 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3191 	MemoryContext oldcontext;
3192 
3193 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3194 
3195 	/* Use generic string SortSupport, forcing "C" collation */
3196 	varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3197 
3198 	MemoryContextSwitchTo(oldcontext);
3199 
3200 	PG_RETURN_VOID();
3201 }
3202 
3203 
3204 /*-------------------------------------------------------------
3205  * byteaoctetlen
3206  *
3207  * get the number of bytes contained in an instance of type 'bytea'
3208  *-------------------------------------------------------------
3209  */
3210 Datum
3211 byteaoctetlen(PG_FUNCTION_ARGS)
3212 {
3213 	Datum		str = PG_GETARG_DATUM(0);
3214 
3215 	/* We need not detoast the input at all */
3216 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3217 }
3218 
3219 /*
3220  * byteacat -
3221  *	  takes two bytea* and returns a bytea* that is the concatenation of
3222  *	  the two.
3223  *
3224  * Cloned from textcat and modified as required.
3225  */
3226 Datum
3227 byteacat(PG_FUNCTION_ARGS)
3228 {
3229 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3230 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3231 
3232 	PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3233 }
3234 
3235 /*
3236  * bytea_catenate
3237  *	Guts of byteacat(), broken out so it can be used by other functions
3238  *
3239  * Arguments can be in short-header form, but not compressed or out-of-line
3240  */
3241 static bytea *
3242 bytea_catenate(bytea *t1, bytea *t2)
3243 {
3244 	bytea	   *result;
3245 	int			len1,
3246 				len2,
3247 				len;
3248 	char	   *ptr;
3249 
3250 	len1 = VARSIZE_ANY_EXHDR(t1);
3251 	len2 = VARSIZE_ANY_EXHDR(t2);
3252 
3253 	/* paranoia ... probably should throw error instead? */
3254 	if (len1 < 0)
3255 		len1 = 0;
3256 	if (len2 < 0)
3257 		len2 = 0;
3258 
3259 	len = len1 + len2 + VARHDRSZ;
3260 	result = (bytea *) palloc(len);
3261 
3262 	/* Set size of result string... */
3263 	SET_VARSIZE(result, len);
3264 
3265 	/* Fill data field of result string... */
3266 	ptr = VARDATA(result);
3267 	if (len1 > 0)
3268 		memcpy(ptr, VARDATA_ANY(t1), len1);
3269 	if (len2 > 0)
3270 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3271 
3272 	return result;
3273 }
3274 
/*
 * PG_STR_GET_BYTEA(str_)
 *	  Build a bytea from a C string by running it through byteain() via
 *	  DirectFunctionCall1 and converting the returned Datum with
 *	  DatumGetByteaPP.
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3277 
3278 /*
3279  * bytea_substr()
3280  * Return a substring starting at the specified position.
3281  * Cloned from text_substr and modified as required.
3282  *
3283  * Input:
3284  *	- string
3285  *	- starting position (is one-based)
3286  *	- string length (optional)
3287  *
3288  * If the starting position is zero or less, then return from the start of the string
3289  * adjusting the length to be consistent with the "negative start" per SQL.
3290  * If the length is less than zero, an ERROR is thrown. If no third argument
3291  * (length) is provided, the length to the end of the string is assumed.
3292  */
3293 Datum
3294 bytea_substr(PG_FUNCTION_ARGS)
3295 {
3296 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3297 									  PG_GETARG_INT32(1),
3298 									  PG_GETARG_INT32(2),
3299 									  false));
3300 }
3301 
3302 /*
3303  * bytea_substr_no_len -
3304  *	  Wrapper to avoid opr_sanity failure due to
3305  *	  one function accepting a different number of args.
3306  */
3307 Datum
3308 bytea_substr_no_len(PG_FUNCTION_ARGS)
3309 {
3310 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3311 									  PG_GETARG_INT32(1),
3312 									  -1,
3313 									  true));
3314 }
3315 
3316 static bytea *
3317 bytea_substring(Datum str,
3318 				int S,
3319 				int L,
3320 				bool length_not_specified)
3321 {
3322 	int32		S1;				/* adjusted start position */
3323 	int32		L1;				/* adjusted substring length */
3324 	int32		E;				/* end position */
3325 
3326 	/*
3327 	 * The logic here should generally match text_substring().
3328 	 */
3329 	S1 = Max(S, 1);
3330 
3331 	if (length_not_specified)
3332 	{
3333 		/*
3334 		 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3335 		 * end of the string if we pass it a negative value for length.
3336 		 */
3337 		L1 = -1;
3338 	}
3339 	else if (L < 0)
3340 	{
3341 		/* SQL99 says to throw an error for E < S, i.e., negative length */
3342 		ereport(ERROR,
3343 				(errcode(ERRCODE_SUBSTRING_ERROR),
3344 				 errmsg("negative substring length not allowed")));
3345 		L1 = -1;				/* silence stupider compilers */
3346 	}
3347 	else if (pg_add_s32_overflow(S, L, &E))
3348 	{
3349 		/*
3350 		 * L could be large enough for S + L to overflow, in which case the
3351 		 * substring must run to end of string.
3352 		 */
3353 		L1 = -1;
3354 	}
3355 	else
3356 	{
3357 		/*
3358 		 * A zero or negative value for the end position can happen if the
3359 		 * start was negative or one. SQL99 says to return a zero-length
3360 		 * string.
3361 		 */
3362 		if (E < 1)
3363 			return PG_STR_GET_BYTEA("");
3364 
3365 		L1 = E - S1;
3366 	}
3367 
3368 	/*
3369 	 * If the start position is past the end of the string, SQL99 says to
3370 	 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3371 	 * us.  We need only convert S1 to zero-based starting position.
3372 	 */
3373 	return DatumGetByteaPSlice(str, S1 - 1, L1);
3374 }
3375 
3376 /*
3377  * byteaoverlay
3378  *	Replace specified substring of first string with second
3379  *
3380  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3381  * This code is a direct implementation of what the standard says.
3382  */
3383 Datum
3384 byteaoverlay(PG_FUNCTION_ARGS)
3385 {
3386 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3387 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3388 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
3389 	int			sl = PG_GETARG_INT32(3);	/* substring length */
3390 
3391 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3392 }
3393 
3394 Datum
3395 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3396 {
3397 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3398 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3399 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
3400 	int			sl;
3401 
3402 	sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3403 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3404 }
3405 
3406 static bytea *
3407 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3408 {
3409 	bytea	   *result;
3410 	bytea	   *s1;
3411 	bytea	   *s2;
3412 	int			sp_pl_sl;
3413 
3414 	/*
3415 	 * Check for possible integer-overflow cases.  For negative sp, throw a
3416 	 * "substring length" error because that's what should be expected
3417 	 * according to the spec's definition of OVERLAY().
3418 	 */
3419 	if (sp <= 0)
3420 		ereport(ERROR,
3421 				(errcode(ERRCODE_SUBSTRING_ERROR),
3422 				 errmsg("negative substring length not allowed")));
3423 	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3424 		ereport(ERROR,
3425 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3426 				 errmsg("integer out of range")));
3427 
3428 	s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3429 	s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3430 	result = bytea_catenate(s1, t2);
3431 	result = bytea_catenate(result, s2);
3432 
3433 	return result;
3434 }
3435 
3436 /*
3437  * bit_count
3438  */
3439 Datum
3440 bytea_bit_count(PG_FUNCTION_ARGS)
3441 {
3442 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3443 
3444 	PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3445 }
3446 
3447 /*
3448  * byteapos -
3449  *	  Return the position of the specified substring.
3450  *	  Implements the SQL POSITION() function.
3451  * Cloned from textpos and modified as required.
3452  */
3453 Datum
3454 byteapos(PG_FUNCTION_ARGS)
3455 {
3456 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3457 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3458 	int			pos;
3459 	int			px,
3460 				p;
3461 	int			len1,
3462 				len2;
3463 	char	   *p1,
3464 			   *p2;
3465 
3466 	len1 = VARSIZE_ANY_EXHDR(t1);
3467 	len2 = VARSIZE_ANY_EXHDR(t2);
3468 
3469 	if (len2 <= 0)
3470 		PG_RETURN_INT32(1);		/* result for empty pattern */
3471 
3472 	p1 = VARDATA_ANY(t1);
3473 	p2 = VARDATA_ANY(t2);
3474 
3475 	pos = 0;
3476 	px = (len1 - len2);
3477 	for (p = 0; p <= px; p++)
3478 	{
3479 		if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3480 		{
3481 			pos = p + 1;
3482 			break;
3483 		};
3484 		p1++;
3485 	};
3486 
3487 	PG_RETURN_INT32(pos);
3488 }
3489 
3490 /*-------------------------------------------------------------
3491  * byteaGetByte
3492  *
3493  * this routine treats "bytea" as an array of bytes.
3494  * It returns the Nth byte (a number between 0 and 255).
3495  *-------------------------------------------------------------
3496  */
3497 Datum
3498 byteaGetByte(PG_FUNCTION_ARGS)
3499 {
3500 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3501 	int32		n = PG_GETARG_INT32(1);
3502 	int			len;
3503 	int			byte;
3504 
3505 	len = VARSIZE_ANY_EXHDR(v);
3506 
3507 	if (n < 0 || n >= len)
3508 		ereport(ERROR,
3509 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3510 				 errmsg("index %d out of valid range, 0..%d",
3511 						n, len - 1)));
3512 
3513 	byte = ((unsigned char *) VARDATA_ANY(v))[n];
3514 
3515 	PG_RETURN_INT32(byte);
3516 }
3517 
3518 /*-------------------------------------------------------------
3519  * byteaGetBit
3520  *
3521  * This routine treats a "bytea" type like an array of bits.
3522  * It returns the value of the Nth bit (0 or 1).
3523  *
3524  *-------------------------------------------------------------
3525  */
3526 Datum
3527 byteaGetBit(PG_FUNCTION_ARGS)
3528 {
3529 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3530 	int64		n = PG_GETARG_INT64(1);
3531 	int			byteNo,
3532 				bitNo;
3533 	int			len;
3534 	int			byte;
3535 
3536 	len = VARSIZE_ANY_EXHDR(v);
3537 
3538 	if (n < 0 || n >= (int64) len * 8)
3539 		ereport(ERROR,
3540 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3541 				 errmsg("index %lld out of valid range, 0..%lld",
3542 						(long long) n, (long long) len * 8 - 1)));
3543 
3544 	/* n/8 is now known < len, so safe to cast to int */
3545 	byteNo = (int) (n / 8);
3546 	bitNo = (int) (n % 8);
3547 
3548 	byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3549 
3550 	if (byte & (1 << bitNo))
3551 		PG_RETURN_INT32(1);
3552 	else
3553 		PG_RETURN_INT32(0);
3554 }
3555 
3556 /*-------------------------------------------------------------
3557  * byteaSetByte
3558  *
3559  * Given an instance of type 'bytea' creates a new one with
3560  * the Nth byte set to the given value.
3561  *
3562  *-------------------------------------------------------------
3563  */
3564 Datum
3565 byteaSetByte(PG_FUNCTION_ARGS)
3566 {
3567 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3568 	int32		n = PG_GETARG_INT32(1);
3569 	int32		newByte = PG_GETARG_INT32(2);
3570 	int			len;
3571 
3572 	len = VARSIZE(res) - VARHDRSZ;
3573 
3574 	if (n < 0 || n >= len)
3575 		ereport(ERROR,
3576 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3577 				 errmsg("index %d out of valid range, 0..%d",
3578 						n, len - 1)));
3579 
3580 	/*
3581 	 * Now set the byte.
3582 	 */
3583 	((unsigned char *) VARDATA(res))[n] = newByte;
3584 
3585 	PG_RETURN_BYTEA_P(res);
3586 }
3587 
3588 /*-------------------------------------------------------------
3589  * byteaSetBit
3590  *
3591  * Given an instance of type 'bytea' creates a new one with
3592  * the Nth bit set to the given value.
3593  *
3594  *-------------------------------------------------------------
3595  */
3596 Datum
3597 byteaSetBit(PG_FUNCTION_ARGS)
3598 {
3599 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3600 	int64		n = PG_GETARG_INT64(1);
3601 	int32		newBit = PG_GETARG_INT32(2);
3602 	int			len;
3603 	int			oldByte,
3604 				newByte;
3605 	int			byteNo,
3606 				bitNo;
3607 
3608 	len = VARSIZE(res) - VARHDRSZ;
3609 
3610 	if (n < 0 || n >= (int64) len * 8)
3611 		ereport(ERROR,
3612 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3613 				 errmsg("index %lld out of valid range, 0..%lld",
3614 						(long long) n, (long long) len * 8 - 1)));
3615 
3616 	/* n/8 is now known < len, so safe to cast to int */
3617 	byteNo = (int) (n / 8);
3618 	bitNo = (int) (n % 8);
3619 
3620 	/*
3621 	 * sanity check!
3622 	 */
3623 	if (newBit != 0 && newBit != 1)
3624 		ereport(ERROR,
3625 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3626 				 errmsg("new bit must be 0 or 1")));
3627 
3628 	/*
3629 	 * Update the byte.
3630 	 */
3631 	oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3632 
3633 	if (newBit == 0)
3634 		newByte = oldByte & (~(1 << bitNo));
3635 	else
3636 		newByte = oldByte | (1 << bitNo);
3637 
3638 	((unsigned char *) VARDATA(res))[byteNo] = newByte;
3639 
3640 	PG_RETURN_BYTEA_P(res);
3641 }
3642 
3643 
3644 /* text_name()
3645  * Converts a text type to a Name type.
3646  */
3647 Datum
3648 text_name(PG_FUNCTION_ARGS)
3649 {
3650 	text	   *s = PG_GETARG_TEXT_PP(0);
3651 	Name		result;
3652 	int			len;
3653 
3654 	len = VARSIZE_ANY_EXHDR(s);
3655 
3656 	/* Truncate oversize input */
3657 	if (len >= NAMEDATALEN)
3658 		len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3659 
3660 	/* We use palloc0 here to ensure result is zero-padded */
3661 	result = (Name) palloc0(NAMEDATALEN);
3662 	memcpy(NameStr(*result), VARDATA_ANY(s), len);
3663 
3664 	PG_RETURN_NAME(result);
3665 }
3666 
3667 /* name_text()
3668  * Converts a Name type to a text type.
3669  */
3670 Datum
3671 name_text(PG_FUNCTION_ARGS)
3672 {
3673 	Name		s = PG_GETARG_NAME(0);
3674 
3675 	PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3676 }
3677 
3678 
3679 /*
3680  * textToQualifiedNameList - convert a text object to list of names
3681  *
3682  * This implements the input parsing needed by nextval() and other
3683  * functions that take a text parameter representing a qualified name.
3684  * We split the name at dots, downcase if not double-quoted, and
3685  * truncate names if they're too long.
3686  */
3687 List *
3688 textToQualifiedNameList(text *textval)
3689 {
3690 	char	   *rawname;
3691 	List	   *result = NIL;
3692 	List	   *namelist;
3693 	ListCell   *l;
3694 
3695 	/* Convert to C string (handles possible detoasting). */
3696 	/* Note we rely on being able to modify rawname below. */
3697 	rawname = text_to_cstring(textval);
3698 
3699 	if (!SplitIdentifierString(rawname, '.', &namelist))
3700 		ereport(ERROR,
3701 				(errcode(ERRCODE_INVALID_NAME),
3702 				 errmsg("invalid name syntax")));
3703 
3704 	if (namelist == NIL)
3705 		ereport(ERROR,
3706 				(errcode(ERRCODE_INVALID_NAME),
3707 				 errmsg("invalid name syntax")));
3708 
3709 	foreach(l, namelist)
3710 	{
3711 		char	   *curname = (char *) lfirst(l);
3712 
3713 		result = lappend(result, makeString(pstrdup(curname)));
3714 	}
3715 
3716 	pfree(rawname);
3717 	list_free(namelist);
3718 
3719 	return result;
3720 }
3721 
/*
 * SplitIdentifierString --- parse a string containing identifiers
 *
 * This is the guts of textToQualifiedNameList, and is exported for use in
 * other situations such as parsing GUC variables.  In the GUC case, it's
 * important to avoid memory leaks, so the API is designed to minimize the
 * amount of stuff that needs to be allocated and freed.
 *
 * Inputs:
 *	rawstring: the input string; must be overwritable!  On return, it's
 *			   been modified to contain the separated identifiers.
 *	separator: the separator punctuation expected between identifiers
 *			   (typically '.' or ',').  Whitespace may also appear around
 *			   identifiers.
 * Outputs:
 *	namelist: filled with a palloc'd list of pointers to identifiers within
 *			  rawstring.  Caller should list_free() this even on error return.
 *
 * Returns true if okay, false if there is a syntax error in the string.
 *
 * Note that an empty string is considered okay here, though not in
 * textToQualifiedNameList.
 */
bool
SplitIdentifierString(char *rawstring, char separator,
					  List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;	/* start of the current identifier */
		char	   *endp;		/* where its terminating NUL will be written */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs, no downcasing */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */

				/*
				 * strlen(endp) bytes starting at endp + 1 cover the rest of
				 * the string including its terminating NUL, so the string
				 * shifts left by one and stays terminated.
				 */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search just past the quote we kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			char	   *downname;
			int			len;

			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */

			/*
			 * Downcase the identifier, using same code as main lexer does.
			 *
			 * XXX because we want to overwrite the input in-place, we cannot
			 * support a downcasing transformation that increases the string
			 * length.  This is not a problem given the current implementation
			 * of downcase_truncate_identifier, but we'll probably have to do
			 * something about this someday.
			 */
			len = endp - curname;
			downname = downcase_truncate_identifier(curname, len, false);
			Assert(strlen(downname) <= len);
			strncpy(curname, downname, len);	/* strncpy is required here */
			pfree(downname);
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/*
		 * Now safe to overwrite separator with a null.  This is deferred
		 * until here because, for an unquoted name, endp can be the very
		 * character that was just tested against the separator and NUL.
		 */
		*endp = '\0';

		/* Truncate name if it's overlength */
		truncate_identifier(curname, strlen(curname), false);

		/*
		 * Finished isolating current name --- add it to list
		 */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3845 
3846 
/*
 * SplitDirectoriesString --- parse a string containing file/directory names
 *
 * This works fine on file names too; the function name is historical.
 *
 * This is similar to SplitIdentifierString, except that the parsing
 * rules are meant to handle pathnames instead of identifiers: there is
 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
 * and we apply canonicalize_path() to each extracted string.  Because of the
 * last, the returned strings are separately palloc'd rather than being
 * pointers into rawstring --- but we still scribble on rawstring.
 *
 * Inputs:
 *	rawstring: the input string; must be modifiable!
 *	separator: the separator punctuation expected between directories
 *			   (typically ',' or ';').  Whitespace may also appear around
 *			   directories.
 * Outputs:
 *	namelist: filled with a palloc'd list of directory names.
 *			  Caller should list_free_deep() this even on error return.
 *
 * Returns true if okay, false if there is a syntax error in the string.
 *
 * Note that an empty string is considered okay here.
 */
bool
SplitDirectoriesString(char *rawstring, char separator,
					   List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new directory. */
	do
	{
		char	   *curname;	/* start of the current path */
		char	   *endp;		/* where its terminating NUL will be written */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */

				/*
				 * strlen(endp) bytes starting at endp + 1 cover the rest of
				 * the string including its terminating NUL, so the string
				 * shifts left by one and stays terminated.
				 */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search just past the quote we kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or end of string */
			curname = endp = nextp;
			while (*nextp && *nextp != separator)
			{
				/* trailing whitespace should not be included in name */

				/*
				 * endp trails one past the last non-space character seen,
				 * so embedded spaces are kept but trailing ones are cut.
				 */
				if (!scanner_isspace(*nextp))
					endp = nextp + 1;
				nextp++;
			}
			if (curname == endp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/*
		 * Now safe to overwrite separator with a null.  This is deferred
		 * until here because endp can be the very character that was just
		 * tested against the separator and NUL.
		 */
		*endp = '\0';

		/* Truncate path if it's overlength */
		if (strlen(curname) >= MAXPGPATH)
			curname[MAXPGPATH - 1] = '\0';

		/*
		 * Finished isolating current name --- add it to list
		 *
		 * The list gets a separately palloc'd copy, and it is that copy
		 * which canonicalize_path() is applied to.
		 */
		curname = pstrdup(curname);
		canonicalize_path(curname);
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3960 
3961 
/*
 * SplitGUCList --- parse a string containing identifiers or file names
 *
 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
 * presuming whether the elements will be taken as identifiers or file names.
 * We assume the input has already been through flatten_set_variable_args(),
 * so that we need never downcase (if appropriate, that was done already).
 * Nor do we ever truncate, since we don't know the correct max length.
 * We disallow embedded whitespace for simplicity (it shouldn't matter,
 * because any embedded whitespace should have led to double-quoting).
 * Otherwise the API is identical to SplitIdentifierString.
 *
 * XXX it's annoying to have so many copies of this string-splitting logic.
 * However, it's not clear that having one function with a bunch of option
 * flags would be much better.
 *
 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
 * Be sure to update that if you have to change this.
 *
 * Inputs:
 *	rawstring: the input string; must be overwritable!  On return, it's
 *			   been modified to contain the separated identifiers.
 *	separator: the separator punctuation expected between identifiers
 *			   (typically '.' or ',').  Whitespace may also appear around
 *			   identifiers.
 * Outputs:
 *	namelist: filled with a palloc'd list of pointers to identifiers within
 *			  rawstring.  Caller should list_free() this even on error return.
 *
 * Returns true if okay, false if there is a syntax error in the string.
 */
bool
SplitGUCList(char *rawstring, char separator,
			 List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;	/* start of the current element */
		char	   *endp;		/* where its terminating NUL will be written */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */

				/*
				 * strlen(endp) bytes starting at endp + 1 cover the rest of
				 * the string including its terminating NUL, so the string
				 * shifts left by one and stays terminated.
				 */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search just past the quote we kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/*
		 * Now safe to overwrite separator with a null.  This is deferred
		 * until here because, for an unquoted name, endp can be the very
		 * character that was just tested against the separator and NUL.
		 */
		*endp = '\0';

		/*
		 * Finished isolating current name --- add it to list
		 */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
4072 
4073 
4074 /*****************************************************************************
4075  *	Comparison Functions used for bytea
4076  *
4077  * Note: btree indexes need these routines not to leak memory; therefore,
4078  * be careful to free working copies of toasted datums.  Most places don't
4079  * need to be so careful.
4080  *****************************************************************************/
4081 
4082 Datum
4083 byteaeq(PG_FUNCTION_ARGS)
4084 {
4085 	Datum		arg1 = PG_GETARG_DATUM(0);
4086 	Datum		arg2 = PG_GETARG_DATUM(1);
4087 	bool		result;
4088 	Size		len1,
4089 				len2;
4090 
4091 	/*
4092 	 * We can use a fast path for unequal lengths, which might save us from
4093 	 * having to detoast one or both values.
4094 	 */
4095 	len1 = toast_raw_datum_size(arg1);
4096 	len2 = toast_raw_datum_size(arg2);
4097 	if (len1 != len2)
4098 		result = false;
4099 	else
4100 	{
4101 		bytea	   *barg1 = DatumGetByteaPP(arg1);
4102 		bytea	   *barg2 = DatumGetByteaPP(arg2);
4103 
4104 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4105 						 len1 - VARHDRSZ) == 0);
4106 
4107 		PG_FREE_IF_COPY(barg1, 0);
4108 		PG_FREE_IF_COPY(barg2, 1);
4109 	}
4110 
4111 	PG_RETURN_BOOL(result);
4112 }
4113 
4114 Datum
4115 byteane(PG_FUNCTION_ARGS)
4116 {
4117 	Datum		arg1 = PG_GETARG_DATUM(0);
4118 	Datum		arg2 = PG_GETARG_DATUM(1);
4119 	bool		result;
4120 	Size		len1,
4121 				len2;
4122 
4123 	/*
4124 	 * We can use a fast path for unequal lengths, which might save us from
4125 	 * having to detoast one or both values.
4126 	 */
4127 	len1 = toast_raw_datum_size(arg1);
4128 	len2 = toast_raw_datum_size(arg2);
4129 	if (len1 != len2)
4130 		result = true;
4131 	else
4132 	{
4133 		bytea	   *barg1 = DatumGetByteaPP(arg1);
4134 		bytea	   *barg2 = DatumGetByteaPP(arg2);
4135 
4136 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4137 						 len1 - VARHDRSZ) != 0);
4138 
4139 		PG_FREE_IF_COPY(barg1, 0);
4140 		PG_FREE_IF_COPY(barg2, 1);
4141 	}
4142 
4143 	PG_RETURN_BOOL(result);
4144 }
4145 
4146 Datum
4147 bytealt(PG_FUNCTION_ARGS)
4148 {
4149 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4150 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4151 	int			len1,
4152 				len2;
4153 	int			cmp;
4154 
4155 	len1 = VARSIZE_ANY_EXHDR(arg1);
4156 	len2 = VARSIZE_ANY_EXHDR(arg2);
4157 
4158 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4159 
4160 	PG_FREE_IF_COPY(arg1, 0);
4161 	PG_FREE_IF_COPY(arg2, 1);
4162 
4163 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4164 }
4165 
4166 Datum
4167 byteale(PG_FUNCTION_ARGS)
4168 {
4169 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4170 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4171 	int			len1,
4172 				len2;
4173 	int			cmp;
4174 
4175 	len1 = VARSIZE_ANY_EXHDR(arg1);
4176 	len2 = VARSIZE_ANY_EXHDR(arg2);
4177 
4178 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4179 
4180 	PG_FREE_IF_COPY(arg1, 0);
4181 	PG_FREE_IF_COPY(arg2, 1);
4182 
4183 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4184 }
4185 
4186 Datum
4187 byteagt(PG_FUNCTION_ARGS)
4188 {
4189 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4190 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4191 	int			len1,
4192 				len2;
4193 	int			cmp;
4194 
4195 	len1 = VARSIZE_ANY_EXHDR(arg1);
4196 	len2 = VARSIZE_ANY_EXHDR(arg2);
4197 
4198 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4199 
4200 	PG_FREE_IF_COPY(arg1, 0);
4201 	PG_FREE_IF_COPY(arg2, 1);
4202 
4203 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4204 }
4205 
4206 Datum
4207 byteage(PG_FUNCTION_ARGS)
4208 {
4209 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4210 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4211 	int			len1,
4212 				len2;
4213 	int			cmp;
4214 
4215 	len1 = VARSIZE_ANY_EXHDR(arg1);
4216 	len2 = VARSIZE_ANY_EXHDR(arg2);
4217 
4218 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4219 
4220 	PG_FREE_IF_COPY(arg1, 0);
4221 	PG_FREE_IF_COPY(arg2, 1);
4222 
4223 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4224 }
4225 
4226 Datum
4227 byteacmp(PG_FUNCTION_ARGS)
4228 {
4229 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4230 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4231 	int			len1,
4232 				len2;
4233 	int			cmp;
4234 
4235 	len1 = VARSIZE_ANY_EXHDR(arg1);
4236 	len2 = VARSIZE_ANY_EXHDR(arg2);
4237 
4238 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4239 	if ((cmp == 0) && (len1 != len2))
4240 		cmp = (len1 < len2) ? -1 : 1;
4241 
4242 	PG_FREE_IF_COPY(arg1, 0);
4243 	PG_FREE_IF_COPY(arg2, 1);
4244 
4245 	PG_RETURN_INT32(cmp);
4246 }
4247 
4248 Datum
4249 bytea_sortsupport(PG_FUNCTION_ARGS)
4250 {
4251 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4252 	MemoryContext oldcontext;
4253 
4254 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4255 
4256 	/* Use generic string SortSupport, forcing "C" collation */
4257 	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4258 
4259 	MemoryContextSwitchTo(oldcontext);
4260 
4261 	PG_RETURN_VOID();
4262 }
4263 
4264 /*
4265  * appendStringInfoText
4266  *
4267  * Append a text to str.
4268  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4269  */
4270 static void
4271 appendStringInfoText(StringInfo str, const text *t)
4272 {
4273 	appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4274 }
4275 
4276 /*
4277  * replace_text
4278  * replace all occurrences of 'old_sub_str' in 'orig_str'
4279  * with 'new_sub_str' to form 'new_str'
4280  *
4281  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4282  * otherwise returns 'new_str'
4283  */
4284 Datum
4285 replace_text(PG_FUNCTION_ARGS)
4286 {
4287 	text	   *src_text = PG_GETARG_TEXT_PP(0);
4288 	text	   *from_sub_text = PG_GETARG_TEXT_PP(1);
4289 	text	   *to_sub_text = PG_GETARG_TEXT_PP(2);
4290 	int			src_text_len;
4291 	int			from_sub_text_len;
4292 	TextPositionState state;
4293 	text	   *ret_text;
4294 	int			chunk_len;
4295 	char	   *curr_ptr;
4296 	char	   *start_ptr;
4297 	StringInfoData str;
4298 	bool		found;
4299 
4300 	src_text_len = VARSIZE_ANY_EXHDR(src_text);
4301 	from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4302 
4303 	/* Return unmodified source string if empty source or pattern */
4304 	if (src_text_len < 1 || from_sub_text_len < 1)
4305 	{
4306 		PG_RETURN_TEXT_P(src_text);
4307 	}
4308 
4309 	text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4310 
4311 	found = text_position_next(&state);
4312 
4313 	/* When the from_sub_text is not found, there is nothing to do. */
4314 	if (!found)
4315 	{
4316 		text_position_cleanup(&state);
4317 		PG_RETURN_TEXT_P(src_text);
4318 	}
4319 	curr_ptr = text_position_get_match_ptr(&state);
4320 	start_ptr = VARDATA_ANY(src_text);
4321 
4322 	initStringInfo(&str);
4323 
4324 	do
4325 	{
4326 		CHECK_FOR_INTERRUPTS();
4327 
4328 		/* copy the data skipped over by last text_position_next() */
4329 		chunk_len = curr_ptr - start_ptr;
4330 		appendBinaryStringInfo(&str, start_ptr, chunk_len);
4331 
4332 		appendStringInfoText(&str, to_sub_text);
4333 
4334 		start_ptr = curr_ptr + from_sub_text_len;
4335 
4336 		found = text_position_next(&state);
4337 		if (found)
4338 			curr_ptr = text_position_get_match_ptr(&state);
4339 	}
4340 	while (found);
4341 
4342 	/* copy trailing data */
4343 	chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4344 	appendBinaryStringInfo(&str, start_ptr, chunk_len);
4345 
4346 	text_position_cleanup(&state);
4347 
4348 	ret_text = cstring_to_text_with_len(str.data, str.len);
4349 	pfree(str.data);
4350 
4351 	PG_RETURN_TEXT_P(ret_text);
4352 }
4353 
4354 /*
4355  * check_replace_text_has_escape_char
4356  *
4357  * check whether replace_text contains escape char.
4358  */
4359 static bool
4360 check_replace_text_has_escape_char(const text *replace_text)
4361 {
4362 	const char *p = VARDATA_ANY(replace_text);
4363 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4364 
4365 	if (pg_database_encoding_max_length() == 1)
4366 	{
4367 		for (; p < p_end; p++)
4368 		{
4369 			if (*p == '\\')
4370 				return true;
4371 		}
4372 	}
4373 	else
4374 	{
4375 		for (; p < p_end; p += pg_mblen(p))
4376 		{
4377 			if (*p == '\\')
4378 				return true;
4379 		}
4380 	}
4381 
4382 	return false;
4383 }
4384 
4385 /*
4386  * appendStringInfoRegexpSubstr
4387  *
4388  * Append replace_text to str, substituting regexp back references for
4389  * \n escapes.  start_ptr is the start of the match in the source string,
4390  * at logical character position data_pos.
4391  */
4392 static void
4393 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4394 							 regmatch_t *pmatch,
4395 							 char *start_ptr, int data_pos)
4396 {
4397 	const char *p = VARDATA_ANY(replace_text);
4398 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4399 	int			eml = pg_database_encoding_max_length();
4400 
4401 	for (;;)
4402 	{
4403 		const char *chunk_start = p;
4404 		int			so;
4405 		int			eo;
4406 
4407 		/* Find next escape char. */
4408 		if (eml == 1)
4409 		{
4410 			for (; p < p_end && *p != '\\'; p++)
4411 				 /* nothing */ ;
4412 		}
4413 		else
4414 		{
4415 			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4416 				 /* nothing */ ;
4417 		}
4418 
4419 		/* Copy the text we just scanned over, if any. */
4420 		if (p > chunk_start)
4421 			appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4422 
4423 		/* Done if at end of string, else advance over escape char. */
4424 		if (p >= p_end)
4425 			break;
4426 		p++;
4427 
4428 		if (p >= p_end)
4429 		{
4430 			/* Escape at very end of input.  Treat same as unexpected char */
4431 			appendStringInfoChar(str, '\\');
4432 			break;
4433 		}
4434 
4435 		if (*p >= '1' && *p <= '9')
4436 		{
4437 			/* Use the back reference of regexp. */
4438 			int			idx = *p - '0';
4439 
4440 			so = pmatch[idx].rm_so;
4441 			eo = pmatch[idx].rm_eo;
4442 			p++;
4443 		}
4444 		else if (*p == '&')
4445 		{
4446 			/* Use the entire matched string. */
4447 			so = pmatch[0].rm_so;
4448 			eo = pmatch[0].rm_eo;
4449 			p++;
4450 		}
4451 		else if (*p == '\\')
4452 		{
4453 			/* \\ means transfer one \ to output. */
4454 			appendStringInfoChar(str, '\\');
4455 			p++;
4456 			continue;
4457 		}
4458 		else
4459 		{
4460 			/*
4461 			 * If escape char is not followed by any expected char, just treat
4462 			 * it as ordinary data to copy.  (XXX would it be better to throw
4463 			 * an error?)
4464 			 */
4465 			appendStringInfoChar(str, '\\');
4466 			continue;
4467 		}
4468 
4469 		if (so != -1 && eo != -1)
4470 		{
4471 			/*
4472 			 * Copy the text that is back reference of regexp.  Note so and eo
4473 			 * are counted in characters not bytes.
4474 			 */
4475 			char	   *chunk_start;
4476 			int			chunk_len;
4477 
4478 			Assert(so >= data_pos);
4479 			chunk_start = start_ptr;
4480 			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4481 			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4482 			appendBinaryStringInfo(str, chunk_start, chunk_len);
4483 		}
4484 	}
4485 }
4486 
4487 #define REGEXP_REPLACE_BACKREF_CNT		10
4488 
4489 /*
4490  * replace_text_regexp
4491  *
4492  * replace text that matches to regexp in src_text to replace_text.
4493  *
4494  * Note: to avoid having to include regex.h in builtins.h, we declare
4495  * the regexp argument as void *, but really it's regex_t *.
4496  */
4497 text *
4498 replace_text_regexp(text *src_text, void *regexp,
4499 					text *replace_text, bool glob)
4500 {
4501 	text	   *ret_text;
4502 	regex_t    *re = (regex_t *) regexp;
4503 	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
4504 	StringInfoData buf;
4505 	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
4506 	pg_wchar   *data;
4507 	size_t		data_len;
4508 	int			search_start;
4509 	int			data_pos;
4510 	char	   *start_ptr;
4511 	bool		have_escape;
4512 
4513 	initStringInfo(&buf);
4514 
4515 	/* Convert data string to wide characters. */
4516 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4517 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4518 
4519 	/* Check whether replace_text has escape char. */
4520 	have_escape = check_replace_text_has_escape_char(replace_text);
4521 
4522 	/* start_ptr points to the data_pos'th character of src_text */
4523 	start_ptr = (char *) VARDATA_ANY(src_text);
4524 	data_pos = 0;
4525 
4526 	search_start = 0;
4527 	while (search_start <= data_len)
4528 	{
4529 		int			regexec_result;
4530 
4531 		CHECK_FOR_INTERRUPTS();
4532 
4533 		regexec_result = pg_regexec(re,
4534 									data,
4535 									data_len,
4536 									search_start,
4537 									NULL,	/* no details */
4538 									REGEXP_REPLACE_BACKREF_CNT,
4539 									pmatch,
4540 									0);
4541 
4542 		if (regexec_result == REG_NOMATCH)
4543 			break;
4544 
4545 		if (regexec_result != REG_OKAY)
4546 		{
4547 			char		errMsg[100];
4548 
4549 			CHECK_FOR_INTERRUPTS();
4550 			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4551 			ereport(ERROR,
4552 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4553 					 errmsg("regular expression failed: %s", errMsg)));
4554 		}
4555 
4556 		/*
4557 		 * Copy the text to the left of the match position.  Note we are given
4558 		 * character not byte indexes.
4559 		 */
4560 		if (pmatch[0].rm_so - data_pos > 0)
4561 		{
4562 			int			chunk_len;
4563 
4564 			chunk_len = charlen_to_bytelen(start_ptr,
4565 										   pmatch[0].rm_so - data_pos);
4566 			appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4567 
4568 			/*
4569 			 * Advance start_ptr over that text, to avoid multiple rescans of
4570 			 * it if the replace_text contains multiple back-references.
4571 			 */
4572 			start_ptr += chunk_len;
4573 			data_pos = pmatch[0].rm_so;
4574 		}
4575 
4576 		/*
4577 		 * Copy the replace_text. Process back references when the
4578 		 * replace_text has escape characters.
4579 		 */
4580 		if (have_escape)
4581 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4582 										 start_ptr, data_pos);
4583 		else
4584 			appendStringInfoText(&buf, replace_text);
4585 
4586 		/* Advance start_ptr and data_pos over the matched text. */
4587 		start_ptr += charlen_to_bytelen(start_ptr,
4588 										pmatch[0].rm_eo - data_pos);
4589 		data_pos = pmatch[0].rm_eo;
4590 
4591 		/*
4592 		 * When global option is off, replace the first instance only.
4593 		 */
4594 		if (!glob)
4595 			break;
4596 
4597 		/*
4598 		 * Advance search position.  Normally we start the next search at the
4599 		 * end of the previous match; but if the match was of zero length, we
4600 		 * have to advance by one character, or we'd just find the same match
4601 		 * again.
4602 		 */
4603 		search_start = data_pos;
4604 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
4605 			search_start++;
4606 	}
4607 
4608 	/*
4609 	 * Copy the text to the right of the last match.
4610 	 */
4611 	if (data_pos < data_len)
4612 	{
4613 		int			chunk_len;
4614 
4615 		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4616 		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4617 	}
4618 
4619 	ret_text = cstring_to_text_with_len(buf.data, buf.len);
4620 	pfree(buf.data);
4621 	pfree(data);
4622 
4623 	return ret_text;
4624 }
4625 
4626 /*
4627  * split_part
4628  * parse input string based on provided field separator
4629  * return N'th item (1 based, negative counts from end)
4630  */
4631 Datum
4632 split_part(PG_FUNCTION_ARGS)
4633 {
4634 	text	   *inputstring = PG_GETARG_TEXT_PP(0);
4635 	text	   *fldsep = PG_GETARG_TEXT_PP(1);
4636 	int			fldnum = PG_GETARG_INT32(2);
4637 	int			inputstring_len;
4638 	int			fldsep_len;
4639 	TextPositionState state;
4640 	char	   *start_ptr;
4641 	char	   *end_ptr;
4642 	text	   *result_text;
4643 	bool		found;
4644 
4645 	/* field number is 1 based */
4646 	if (fldnum == 0)
4647 		ereport(ERROR,
4648 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4649 				 errmsg("field position must not be zero")));
4650 
4651 	inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4652 	fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4653 
4654 	/* return empty string for empty input string */
4655 	if (inputstring_len < 1)
4656 		PG_RETURN_TEXT_P(cstring_to_text(""));
4657 
4658 	/* handle empty field separator */
4659 	if (fldsep_len < 1)
4660 	{
4661 		/* if first or last field, return input string, else empty string */
4662 		if (fldnum == 1 || fldnum == -1)
4663 			PG_RETURN_TEXT_P(inputstring);
4664 		else
4665 			PG_RETURN_TEXT_P(cstring_to_text(""));
4666 	}
4667 
4668 	/* find the first field separator */
4669 	text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4670 
4671 	found = text_position_next(&state);
4672 
4673 	/* special case if fldsep not found at all */
4674 	if (!found)
4675 	{
4676 		text_position_cleanup(&state);
4677 		/* if first or last field, return input string, else empty string */
4678 		if (fldnum == 1 || fldnum == -1)
4679 			PG_RETURN_TEXT_P(inputstring);
4680 		else
4681 			PG_RETURN_TEXT_P(cstring_to_text(""));
4682 	}
4683 
4684 	/*
4685 	 * take care of a negative field number (i.e. count from the right) by
4686 	 * converting to a positive field number; we need total number of fields
4687 	 */
4688 	if (fldnum < 0)
4689 	{
4690 		/* we found a fldsep, so there are at least two fields */
4691 		int			numfields = 2;
4692 
4693 		while (text_position_next(&state))
4694 			numfields++;
4695 
4696 		/* special case of last field does not require an extra pass */
4697 		if (fldnum == -1)
4698 		{
4699 			start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4700 			end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4701 			text_position_cleanup(&state);
4702 			PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4703 													  end_ptr - start_ptr));
4704 		}
4705 
4706 		/* else, convert fldnum to positive notation */
4707 		fldnum += numfields + 1;
4708 
4709 		/* if nonexistent field, return empty string */
4710 		if (fldnum <= 0)
4711 		{
4712 			text_position_cleanup(&state);
4713 			PG_RETURN_TEXT_P(cstring_to_text(""));
4714 		}
4715 
4716 		/* reset to pointing at first match, but now with positive fldnum */
4717 		text_position_reset(&state);
4718 		found = text_position_next(&state);
4719 		Assert(found);
4720 	}
4721 
4722 	/* identify bounds of first field */
4723 	start_ptr = VARDATA_ANY(inputstring);
4724 	end_ptr = text_position_get_match_ptr(&state);
4725 
4726 	while (found && --fldnum > 0)
4727 	{
4728 		/* identify bounds of next field */
4729 		start_ptr = end_ptr + fldsep_len;
4730 		found = text_position_next(&state);
4731 		if (found)
4732 			end_ptr = text_position_get_match_ptr(&state);
4733 	}
4734 
4735 	text_position_cleanup(&state);
4736 
4737 	if (fldnum > 0)
4738 	{
4739 		/* N'th field separator not found */
4740 		/* if last field requested, return it, else empty string */
4741 		if (fldnum == 1)
4742 		{
4743 			int			last_len = start_ptr - VARDATA_ANY(inputstring);
4744 
4745 			result_text = cstring_to_text_with_len(start_ptr,
4746 												   inputstring_len - last_len);
4747 		}
4748 		else
4749 			result_text = cstring_to_text("");
4750 	}
4751 	else
4752 	{
4753 		/* non-last field requested */
4754 		result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4755 	}
4756 
4757 	PG_RETURN_TEXT_P(result_text);
4758 }
4759 
4760 /*
4761  * Convenience function to return true when two text params are equal.
4762  */
4763 static bool
4764 text_isequal(text *txt1, text *txt2, Oid collid)
4765 {
4766 	return DatumGetBool(DirectFunctionCall2Coll(texteq,
4767 												collid,
4768 												PointerGetDatum(txt1),
4769 												PointerGetDatum(txt2)));
4770 }
4771 
4772 /*
4773  * text_to_array
4774  * parse input string and return text array of elements,
4775  * based on provided field separator
4776  */
4777 Datum
4778 text_to_array(PG_FUNCTION_ARGS)
4779 {
4780 	SplitTextOutputData tstate;
4781 
4782 	/* For array output, tstate should start as all zeroes */
4783 	memset(&tstate, 0, sizeof(tstate));
4784 
4785 	if (!split_text(fcinfo, &tstate))
4786 		PG_RETURN_NULL();
4787 
4788 	if (tstate.astate == NULL)
4789 		PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4790 
4791 	PG_RETURN_ARRAYTYPE_P(makeArrayResult(tstate.astate,
4792 										  CurrentMemoryContext));
4793 }
4794 
4795 /*
4796  * text_to_array_null
4797  * parse input string and return text array of elements,
4798  * based on provided field separator and null string
4799  *
4800  * This is a separate entry point only to prevent the regression tests from
4801  * complaining about different argument sets for the same internal function.
4802  */
4803 Datum
4804 text_to_array_null(PG_FUNCTION_ARGS)
4805 {
4806 	return text_to_array(fcinfo);
4807 }
4808 
4809 /*
4810  * text_to_table
4811  * parse input string and return table of elements,
4812  * based on provided field separator
4813  */
4814 Datum
4815 text_to_table(PG_FUNCTION_ARGS)
4816 {
4817 	ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4818 	SplitTextOutputData tstate;
4819 	MemoryContext old_cxt;
4820 
4821 	/* check to see if caller supports us returning a tuplestore */
4822 	if (rsi == NULL || !IsA(rsi, ReturnSetInfo))
4823 		ereport(ERROR,
4824 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4825 				 errmsg("set-valued function called in context that cannot accept a set")));
4826 	if (!(rsi->allowedModes & SFRM_Materialize))
4827 		ereport(ERROR,
4828 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
4829 				 errmsg("materialize mode required, but it is not allowed in this context")));
4830 
4831 	/* OK, prepare tuplestore in per-query memory */
4832 	old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);
4833 
4834 	tstate.astate = NULL;
4835 	tstate.tupdesc = CreateTupleDescCopy(rsi->expectedDesc);
4836 	tstate.tupstore = tuplestore_begin_heap(true, false, work_mem);
4837 
4838 	MemoryContextSwitchTo(old_cxt);
4839 
4840 	(void) split_text(fcinfo, &tstate);
4841 
4842 	tuplestore_donestoring(tstate.tupstore);
4843 
4844 	rsi->returnMode = SFRM_Materialize;
4845 	rsi->setResult = tstate.tupstore;
4846 	rsi->setDesc = tstate.tupdesc;
4847 
4848 	return (Datum) 0;
4849 }
4850 
4851 /*
4852  * text_to_table_null
4853  * parse input string and return table of elements,
4854  * based on provided field separator and null string
4855  *
4856  * This is a separate entry point only to prevent the regression tests from
4857  * complaining about different argument sets for the same internal function.
4858  */
4859 Datum
4860 text_to_table_null(PG_FUNCTION_ARGS)
4861 {
4862 	return text_to_table(fcinfo);
4863 }
4864 
4865 /*
4866  * Common code for text_to_array, text_to_array_null, text_to_table
4867  * and text_to_table_null functions.
4868  *
4869  * These are not strict so we have to test for null inputs explicitly.
4870  * Returns false if result is to be null, else returns true.
4871  *
4872  * Note that if the result is valid but empty (zero elements), we return
4873  * without changing *tstate --- caller must handle that case, too.
4874  */
4875 static bool
4876 split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4877 {
4878 	text	   *inputstring;
4879 	text	   *fldsep;
4880 	text	   *null_string;
4881 	Oid			collation = PG_GET_COLLATION();
4882 	int			inputstring_len;
4883 	int			fldsep_len;
4884 	char	   *start_ptr;
4885 	text	   *result_text;
4886 
4887 	/* when input string is NULL, then result is NULL too */
4888 	if (PG_ARGISNULL(0))
4889 		return false;
4890 
4891 	inputstring = PG_GETARG_TEXT_PP(0);
4892 
4893 	/* fldsep can be NULL */
4894 	if (!PG_ARGISNULL(1))
4895 		fldsep = PG_GETARG_TEXT_PP(1);
4896 	else
4897 		fldsep = NULL;
4898 
4899 	/* null_string can be NULL or omitted */
4900 	if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4901 		null_string = PG_GETARG_TEXT_PP(2);
4902 	else
4903 		null_string = NULL;
4904 
4905 	if (fldsep != NULL)
4906 	{
4907 		/*
4908 		 * Normal case with non-null fldsep.  Use the text_position machinery
4909 		 * to search for occurrences of fldsep.
4910 		 */
4911 		TextPositionState state;
4912 
4913 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4914 		fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4915 
4916 		/* return empty set for empty input string */
4917 		if (inputstring_len < 1)
4918 			return true;
4919 
4920 		/* empty field separator: return input string as a one-element set */
4921 		if (fldsep_len < 1)
4922 		{
4923 			split_text_accum_result(tstate, inputstring,
4924 									null_string, collation);
4925 			return true;
4926 		}
4927 
4928 		text_position_setup(inputstring, fldsep, collation, &state);
4929 
4930 		start_ptr = VARDATA_ANY(inputstring);
4931 
4932 		for (;;)
4933 		{
4934 			bool		found;
4935 			char	   *end_ptr;
4936 			int			chunk_len;
4937 
4938 			CHECK_FOR_INTERRUPTS();
4939 
4940 			found = text_position_next(&state);
4941 			if (!found)
4942 			{
4943 				/* fetch last field */
4944 				chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4945 				end_ptr = NULL; /* not used, but some compilers complain */
4946 			}
4947 			else
4948 			{
4949 				/* fetch non-last field */
4950 				end_ptr = text_position_get_match_ptr(&state);
4951 				chunk_len = end_ptr - start_ptr;
4952 			}
4953 
4954 			/* build a temp text datum to pass to split_text_accum_result */
4955 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4956 
4957 			/* stash away this field */
4958 			split_text_accum_result(tstate, result_text,
4959 									null_string, collation);
4960 
4961 			pfree(result_text);
4962 
4963 			if (!found)
4964 				break;
4965 
4966 			start_ptr = end_ptr + fldsep_len;
4967 		}
4968 
4969 		text_position_cleanup(&state);
4970 	}
4971 	else
4972 	{
4973 		/*
4974 		 * When fldsep is NULL, each character in the input string becomes a
4975 		 * separate element in the result set.  The separator is effectively
4976 		 * the space between characters.
4977 		 */
4978 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4979 
4980 		start_ptr = VARDATA_ANY(inputstring);
4981 
4982 		while (inputstring_len > 0)
4983 		{
4984 			int			chunk_len = pg_mblen(start_ptr);
4985 
4986 			CHECK_FOR_INTERRUPTS();
4987 
4988 			/* build a temp text datum to pass to split_text_accum_result */
4989 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4990 
4991 			/* stash away this field */
4992 			split_text_accum_result(tstate, result_text,
4993 									null_string, collation);
4994 
4995 			pfree(result_text);
4996 
4997 			start_ptr += chunk_len;
4998 			inputstring_len -= chunk_len;
4999 		}
5000 	}
5001 
5002 	return true;
5003 }
5004 
5005 /*
5006  * Add text item to result set (table or array).
5007  *
5008  * This is also responsible for checking to see if the item matches
5009  * the null_string, in which case we should emit NULL instead.
5010  */
5011 static void
5012 split_text_accum_result(SplitTextOutputData *tstate,
5013 						text *field_value,
5014 						text *null_string,
5015 						Oid collation)
5016 {
5017 	bool		is_null = false;
5018 
5019 	if (null_string && text_isequal(field_value, null_string, collation))
5020 		is_null = true;
5021 
5022 	if (tstate->tupstore)
5023 	{
5024 		Datum		values[1];
5025 		bool		nulls[1];
5026 
5027 		values[0] = PointerGetDatum(field_value);
5028 		nulls[0] = is_null;
5029 
5030 		tuplestore_putvalues(tstate->tupstore,
5031 							 tstate->tupdesc,
5032 							 values,
5033 							 nulls);
5034 	}
5035 	else
5036 	{
5037 		tstate->astate = accumArrayResult(tstate->astate,
5038 										  PointerGetDatum(field_value),
5039 										  is_null,
5040 										  TEXTOID,
5041 										  CurrentMemoryContext);
5042 	}
5043 }
5044 
5045 /*
5046  * array_to_text
5047  * concatenate Cstring representation of input array elements
5048  * using provided field separator
5049  */
5050 Datum
5051 array_to_text(PG_FUNCTION_ARGS)
5052 {
5053 	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
5054 	char	   *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5055 
5056 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
5057 }
5058 
5059 /*
5060  * array_to_text_null
5061  * concatenate Cstring representation of input array elements
5062  * using provided field separator and null string
5063  *
5064  * This version is not strict so we have to test for null inputs explicitly.
5065  */
5066 Datum
5067 array_to_text_null(PG_FUNCTION_ARGS)
5068 {
5069 	ArrayType  *v;
5070 	char	   *fldsep;
5071 	char	   *null_string;
5072 
5073 	/* returns NULL when first or second parameter is NULL */
5074 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
5075 		PG_RETURN_NULL();
5076 
5077 	v = PG_GETARG_ARRAYTYPE_P(0);
5078 	fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
5079 
5080 	/* NULL null string is passed through as a null pointer */
5081 	if (!PG_ARGISNULL(2))
5082 		null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
5083 	else
5084 		null_string = NULL;
5085 
5086 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
5087 }
5088 
5089 /*
5090  * common code for array_to_text and array_to_text_null functions
5091  */
5092 static text *
5093 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
5094 					   const char *fldsep, const char *null_string)
5095 {
5096 	text	   *result;
5097 	int			nitems,
5098 			   *dims,
5099 				ndims;
5100 	Oid			element_type;
5101 	int			typlen;
5102 	bool		typbyval;
5103 	char		typalign;
5104 	StringInfoData buf;
5105 	bool		printed = false;
5106 	char	   *p;
5107 	bits8	   *bitmap;
5108 	int			bitmask;
5109 	int			i;
5110 	ArrayMetaState *my_extra;
5111 
5112 	ndims = ARR_NDIM(v);
5113 	dims = ARR_DIMS(v);
5114 	nitems = ArrayGetNItems(ndims, dims);
5115 
5116 	/* if there are no elements, return an empty string */
5117 	if (nitems == 0)
5118 		return cstring_to_text_with_len("", 0);
5119 
5120 	element_type = ARR_ELEMTYPE(v);
5121 	initStringInfo(&buf);
5122 
5123 	/*
5124 	 * We arrange to look up info about element type, including its output
5125 	 * conversion proc, only once per series of calls, assuming the element
5126 	 * type doesn't change underneath us.
5127 	 */
5128 	my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5129 	if (my_extra == NULL)
5130 	{
5131 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5132 													  sizeof(ArrayMetaState));
5133 		my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
5134 		my_extra->element_type = ~element_type;
5135 	}
5136 
5137 	if (my_extra->element_type != element_type)
5138 	{
5139 		/*
5140 		 * Get info about element type, including its output conversion proc
5141 		 */
5142 		get_type_io_data(element_type, IOFunc_output,
5143 						 &my_extra->typlen, &my_extra->typbyval,
5144 						 &my_extra->typalign, &my_extra->typdelim,
5145 						 &my_extra->typioparam, &my_extra->typiofunc);
5146 		fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
5147 					  fcinfo->flinfo->fn_mcxt);
5148 		my_extra->element_type = element_type;
5149 	}
5150 	typlen = my_extra->typlen;
5151 	typbyval = my_extra->typbyval;
5152 	typalign = my_extra->typalign;
5153 
5154 	p = ARR_DATA_PTR(v);
5155 	bitmap = ARR_NULLBITMAP(v);
5156 	bitmask = 1;
5157 
5158 	for (i = 0; i < nitems; i++)
5159 	{
5160 		Datum		itemvalue;
5161 		char	   *value;
5162 
5163 		/* Get source element, checking for NULL */
5164 		if (bitmap && (*bitmap & bitmask) == 0)
5165 		{
5166 			/* if null_string is NULL, we just ignore null elements */
5167 			if (null_string != NULL)
5168 			{
5169 				if (printed)
5170 					appendStringInfo(&buf, "%s%s", fldsep, null_string);
5171 				else
5172 					appendStringInfoString(&buf, null_string);
5173 				printed = true;
5174 			}
5175 		}
5176 		else
5177 		{
5178 			itemvalue = fetch_att(p, typbyval, typlen);
5179 
5180 			value = OutputFunctionCall(&my_extra->proc, itemvalue);
5181 
5182 			if (printed)
5183 				appendStringInfo(&buf, "%s%s", fldsep, value);
5184 			else
5185 				appendStringInfoString(&buf, value);
5186 			printed = true;
5187 
5188 			p = att_addlength_pointer(p, typlen, p);
5189 			p = (char *) att_align_nominal(p, typalign);
5190 		}
5191 
5192 		/* advance bitmap pointer if any */
5193 		if (bitmap)
5194 		{
5195 			bitmask <<= 1;
5196 			if (bitmask == 0x100)
5197 			{
5198 				bitmap++;
5199 				bitmask = 1;
5200 			}
5201 		}
5202 	}
5203 
5204 	result = cstring_to_text_with_len(buf.data, buf.len);
5205 	pfree(buf.data);
5206 
5207 	return result;
5208 }
5209 
5210 #define HEXBASE 16
5211 /*
5212  * Convert an int32 to a string containing a base 16 (hex) representation of
5213  * the number.
5214  */
5215 Datum
5216 to_hex32(PG_FUNCTION_ARGS)
5217 {
5218 	uint32		value = (uint32) PG_GETARG_INT32(0);
5219 	char	   *ptr;
5220 	const char *digits = "0123456789abcdef";
5221 	char		buf[32];		/* bigger than needed, but reasonable */
5222 
5223 	ptr = buf + sizeof(buf) - 1;
5224 	*ptr = '\0';
5225 
5226 	do
5227 	{
5228 		*--ptr = digits[value % HEXBASE];
5229 		value /= HEXBASE;
5230 	} while (ptr > buf && value);
5231 
5232 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
5233 }
5234 
5235 /*
5236  * Convert an int64 to a string containing a base 16 (hex) representation of
5237  * the number.
5238  */
5239 Datum
5240 to_hex64(PG_FUNCTION_ARGS)
5241 {
5242 	uint64		value = (uint64) PG_GETARG_INT64(0);
5243 	char	   *ptr;
5244 	const char *digits = "0123456789abcdef";
5245 	char		buf[32];		/* bigger than needed, but reasonable */
5246 
5247 	ptr = buf + sizeof(buf) - 1;
5248 	*ptr = '\0';
5249 
5250 	do
5251 	{
5252 		*--ptr = digits[value % HEXBASE];
5253 		value /= HEXBASE;
5254 	} while (ptr > buf && value);
5255 
5256 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
5257 }
5258 
5259 /*
5260  * Return the size of a datum, possibly compressed
5261  *
5262  * Works on any data type
5263  */
5264 Datum
5265 pg_column_size(PG_FUNCTION_ARGS)
5266 {
5267 	Datum		value = PG_GETARG_DATUM(0);
5268 	int32		result;
5269 	int			typlen;
5270 
5271 	/* On first call, get the input type's typlen, and save at *fn_extra */
5272 	if (fcinfo->flinfo->fn_extra == NULL)
5273 	{
5274 		/* Lookup the datatype of the supplied argument */
5275 		Oid			argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5276 
5277 		typlen = get_typlen(argtypeid);
5278 		if (typlen == 0)		/* should not happen */
5279 			elog(ERROR, "cache lookup failed for type %u", argtypeid);
5280 
5281 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5282 													  sizeof(int));
5283 		*((int *) fcinfo->flinfo->fn_extra) = typlen;
5284 	}
5285 	else
5286 		typlen = *((int *) fcinfo->flinfo->fn_extra);
5287 
5288 	if (typlen == -1)
5289 	{
5290 		/* varlena type, possibly toasted */
5291 		result = toast_datum_size(value);
5292 	}
5293 	else if (typlen == -2)
5294 	{
5295 		/* cstring */
5296 		result = strlen(DatumGetCString(value)) + 1;
5297 	}
5298 	else
5299 	{
5300 		/* ordinary fixed-width type */
5301 		result = typlen;
5302 	}
5303 
5304 	PG_RETURN_INT32(result);
5305 }
5306 
5307 /*
5308  * Return the compression method stored in the compressed attribute.  Return
5309  * NULL for non varlena type or uncompressed data.
5310  */
5311 Datum
5312 pg_column_compression(PG_FUNCTION_ARGS)
5313 {
5314 	int			typlen;
5315 	char	   *result;
5316 	ToastCompressionId cmid;
5317 
5318 	/* On first call, get the input type's typlen, and save at *fn_extra */
5319 	if (fcinfo->flinfo->fn_extra == NULL)
5320 	{
5321 		/* Lookup the datatype of the supplied argument */
5322 		Oid			argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5323 
5324 		typlen = get_typlen(argtypeid);
5325 		if (typlen == 0)		/* should not happen */
5326 			elog(ERROR, "cache lookup failed for type %u", argtypeid);
5327 
5328 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5329 													  sizeof(int));
5330 		*((int *) fcinfo->flinfo->fn_extra) = typlen;
5331 	}
5332 	else
5333 		typlen = *((int *) fcinfo->flinfo->fn_extra);
5334 
5335 	if (typlen != -1)
5336 		PG_RETURN_NULL();
5337 
5338 	/* get the compression method id stored in the compressed varlena */
5339 	cmid = toast_get_compression_id((struct varlena *)
5340 									DatumGetPointer(PG_GETARG_DATUM(0)));
5341 	if (cmid == TOAST_INVALID_COMPRESSION_ID)
5342 		PG_RETURN_NULL();
5343 
5344 	/* convert compression method id to compression method name */
5345 	switch (cmid)
5346 	{
5347 		case TOAST_PGLZ_COMPRESSION_ID:
5348 			result = "pglz";
5349 			break;
5350 		case TOAST_LZ4_COMPRESSION_ID:
5351 			result = "lz4";
5352 			break;
5353 		default:
5354 			elog(ERROR, "invalid compression method id %d", cmid);
5355 	}
5356 
5357 	PG_RETURN_TEXT_P(cstring_to_text(result));
5358 }
5359 
5360 /*
5361  * string_agg - Concatenates values and returns string.
5362  *
5363  * Syntax: string_agg(value text, delimiter text) RETURNS text
5364  *
5365  * Note: Any NULL values are ignored. The first-call delimiter isn't
5366  * actually used at all, and on subsequent calls the delimiter precedes
5367  * the associated value.
5368  */
5369 
5370 /* subroutine to initialize state */
5371 static StringInfo
5372 makeStringAggState(FunctionCallInfo fcinfo)
5373 {
5374 	StringInfo	state;
5375 	MemoryContext aggcontext;
5376 	MemoryContext oldcontext;
5377 
5378 	if (!AggCheckCallContext(fcinfo, &aggcontext))
5379 	{
5380 		/* cannot be called directly because of internal-type argument */
5381 		elog(ERROR, "string_agg_transfn called in non-aggregate context");
5382 	}
5383 
5384 	/*
5385 	 * Create state in aggregate context.  It'll stay there across subsequent
5386 	 * calls.
5387 	 */
5388 	oldcontext = MemoryContextSwitchTo(aggcontext);
5389 	state = makeStringInfo();
5390 	MemoryContextSwitchTo(oldcontext);
5391 
5392 	return state;
5393 }
5394 
5395 Datum
5396 string_agg_transfn(PG_FUNCTION_ARGS)
5397 {
5398 	StringInfo	state;
5399 
5400 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5401 
5402 	/* Append the value unless null. */
5403 	if (!PG_ARGISNULL(1))
5404 	{
5405 		/* On the first time through, we ignore the delimiter. */
5406 		if (state == NULL)
5407 			state = makeStringAggState(fcinfo);
5408 		else if (!PG_ARGISNULL(2))
5409 			appendStringInfoText(state, PG_GETARG_TEXT_PP(2));	/* delimiter */
5410 
5411 		appendStringInfoText(state, PG_GETARG_TEXT_PP(1));	/* value */
5412 	}
5413 
5414 	/*
5415 	 * The transition type for string_agg() is declared to be "internal",
5416 	 * which is a pass-by-value type the same size as a pointer.
5417 	 */
5418 	PG_RETURN_POINTER(state);
5419 }
5420 
5421 Datum
5422 string_agg_finalfn(PG_FUNCTION_ARGS)
5423 {
5424 	StringInfo	state;
5425 
5426 	/* cannot be called directly because of internal-type argument */
5427 	Assert(AggCheckCallContext(fcinfo, NULL));
5428 
5429 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5430 
5431 	if (state != NULL)
5432 		PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5433 	else
5434 		PG_RETURN_NULL();
5435 }
5436 
5437 /*
5438  * Prepare cache with fmgr info for the output functions of the datatypes of
5439  * the arguments of a concat-like function, beginning with argument "argidx".
5440  * (Arguments before that will have corresponding slots in the resulting
5441  * FmgrInfo array, but we don't fill those slots.)
5442  */
5443 static FmgrInfo *
5444 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5445 {
5446 	FmgrInfo   *foutcache;
5447 	int			i;
5448 
5449 	/* We keep the info in fn_mcxt so it survives across calls */
5450 	foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5451 												PG_NARGS() * sizeof(FmgrInfo));
5452 
5453 	for (i = argidx; i < PG_NARGS(); i++)
5454 	{
5455 		Oid			valtype;
5456 		Oid			typOutput;
5457 		bool		typIsVarlena;
5458 
5459 		valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5460 		if (!OidIsValid(valtype))
5461 			elog(ERROR, "could not determine data type of concat() input");
5462 
5463 		getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5464 		fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5465 	}
5466 
5467 	fcinfo->flinfo->fn_extra = foutcache;
5468 
5469 	return foutcache;
5470 }
5471 
5472 /*
5473  * Implementation of both concat() and concat_ws().
5474  *
5475  * sepstr is the separator string to place between values.
5476  * argidx identifies the first argument to concatenate (counting from zero);
5477  * note that this must be constant across any one series of calls.
5478  *
5479  * Returns NULL if result should be NULL, else text value.
5480  */
5481 static text *
5482 concat_internal(const char *sepstr, int argidx,
5483 				FunctionCallInfo fcinfo)
5484 {
5485 	text	   *result;
5486 	StringInfoData str;
5487 	FmgrInfo   *foutcache;
5488 	bool		first_arg = true;
5489 	int			i;
5490 
5491 	/*
5492 	 * concat(VARIADIC some-array) is essentially equivalent to
5493 	 * array_to_text(), ie concat the array elements with the given separator.
5494 	 * So we just pass the case off to that code.
5495 	 */
5496 	if (get_fn_expr_variadic(fcinfo->flinfo))
5497 	{
5498 		ArrayType  *arr;
5499 
5500 		/* Should have just the one argument */
5501 		Assert(argidx == PG_NARGS() - 1);
5502 
5503 		/* concat(VARIADIC NULL) is defined as NULL */
5504 		if (PG_ARGISNULL(argidx))
5505 			return NULL;
5506 
5507 		/*
5508 		 * Non-null argument had better be an array.  We assume that any call
5509 		 * context that could let get_fn_expr_variadic return true will have
5510 		 * checked that a VARIADIC-labeled parameter actually is an array.  So
5511 		 * it should be okay to just Assert that it's an array rather than
5512 		 * doing a full-fledged error check.
5513 		 */
5514 		Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5515 
5516 		/* OK, safe to fetch the array value */
5517 		arr = PG_GETARG_ARRAYTYPE_P(argidx);
5518 
5519 		/*
5520 		 * And serialize the array.  We tell array_to_text to ignore null
5521 		 * elements, which matches the behavior of the loop below.
5522 		 */
5523 		return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5524 	}
5525 
5526 	/* Normal case without explicit VARIADIC marker */
5527 	initStringInfo(&str);
5528 
5529 	/* Get output function info, building it if first time through */
5530 	foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5531 	if (foutcache == NULL)
5532 		foutcache = build_concat_foutcache(fcinfo, argidx);
5533 
5534 	for (i = argidx; i < PG_NARGS(); i++)
5535 	{
5536 		if (!PG_ARGISNULL(i))
5537 		{
5538 			Datum		value = PG_GETARG_DATUM(i);
5539 
5540 			/* add separator if appropriate */
5541 			if (first_arg)
5542 				first_arg = false;
5543 			else
5544 				appendStringInfoString(&str, sepstr);
5545 
5546 			/* call the appropriate type output function, append the result */
5547 			appendStringInfoString(&str,
5548 								   OutputFunctionCall(&foutcache[i], value));
5549 		}
5550 	}
5551 
5552 	result = cstring_to_text_with_len(str.data, str.len);
5553 	pfree(str.data);
5554 
5555 	return result;
5556 }
5557 
5558 /*
5559  * Concatenate all arguments. NULL arguments are ignored.
5560  */
5561 Datum
5562 text_concat(PG_FUNCTION_ARGS)
5563 {
5564 	text	   *result;
5565 
5566 	result = concat_internal("", 0, fcinfo);
5567 	if (result == NULL)
5568 		PG_RETURN_NULL();
5569 	PG_RETURN_TEXT_P(result);
5570 }
5571 
5572 /*
5573  * Concatenate all but first argument value with separators. The first
5574  * parameter is used as the separator. NULL arguments are ignored.
5575  */
5576 Datum
5577 text_concat_ws(PG_FUNCTION_ARGS)
5578 {
5579 	char	   *sep;
5580 	text	   *result;
5581 
5582 	/* return NULL when separator is NULL */
5583 	if (PG_ARGISNULL(0))
5584 		PG_RETURN_NULL();
5585 	sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5586 
5587 	result = concat_internal(sep, 1, fcinfo);
5588 	if (result == NULL)
5589 		PG_RETURN_NULL();
5590 	PG_RETURN_TEXT_P(result);
5591 }
5592 
5593 /*
5594  * Return first n characters in the string. When n is negative,
5595  * return all but last |n| characters.
5596  */
5597 Datum
5598 text_left(PG_FUNCTION_ARGS)
5599 {
5600 	int			n = PG_GETARG_INT32(1);
5601 
5602 	if (n < 0)
5603 	{
5604 		text	   *str = PG_GETARG_TEXT_PP(0);
5605 		const char *p = VARDATA_ANY(str);
5606 		int			len = VARSIZE_ANY_EXHDR(str);
5607 		int			rlen;
5608 
5609 		n = pg_mbstrlen_with_len(p, len) + n;
5610 		rlen = pg_mbcharcliplen(p, len, n);
5611 		PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5612 	}
5613 	else
5614 		PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5615 }
5616 
5617 /*
5618  * Return last n characters in the string. When n is negative,
5619  * return all but first |n| characters.
5620  */
5621 Datum
5622 text_right(PG_FUNCTION_ARGS)
5623 {
5624 	text	   *str = PG_GETARG_TEXT_PP(0);
5625 	const char *p = VARDATA_ANY(str);
5626 	int			len = VARSIZE_ANY_EXHDR(str);
5627 	int			n = PG_GETARG_INT32(1);
5628 	int			off;
5629 
5630 	if (n < 0)
5631 		n = -n;
5632 	else
5633 		n = pg_mbstrlen_with_len(p, len) - n;
5634 	off = pg_mbcharcliplen(p, len, n);
5635 
5636 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5637 }
5638 
5639 /*
5640  * Return reversed string
5641  */
5642 Datum
5643 text_reverse(PG_FUNCTION_ARGS)
5644 {
5645 	text	   *str = PG_GETARG_TEXT_PP(0);
5646 	const char *p = VARDATA_ANY(str);
5647 	int			len = VARSIZE_ANY_EXHDR(str);
5648 	const char *endp = p + len;
5649 	text	   *result;
5650 	char	   *dst;
5651 
5652 	result = palloc(len + VARHDRSZ);
5653 	dst = (char *) VARDATA(result) + len;
5654 	SET_VARSIZE(result, len + VARHDRSZ);
5655 
5656 	if (pg_database_encoding_max_length() > 1)
5657 	{
5658 		/* multibyte version */
5659 		while (p < endp)
5660 		{
5661 			int			sz;
5662 
5663 			sz = pg_mblen(p);
5664 			dst -= sz;
5665 			memcpy(dst, p, sz);
5666 			p += sz;
5667 		}
5668 	}
5669 	else
5670 	{
5671 		/* single byte version */
5672 		while (p < endp)
5673 			*(--dst) = *p++;
5674 	}
5675 
5676 	PG_RETURN_TEXT_P(result);
5677 }
5678 
5679 
/*
 * Support macros for text_format()
 */

/* Flag bit: a '-' flag was seen in the format specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Step "ptr" forward by one character.  If that reaches or passes
 * "end_ptr", report an unterminated format specifier.  "ptr" is evaluated
 * exactly once.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) < (end_ptr)) \
			break; \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("unterminated format() type specifier"), \
				 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5693 
5694 /*
5695  * Returns a formatted string
5696  */
5697 Datum
5698 text_format(PG_FUNCTION_ARGS)
5699 {
5700 	text	   *fmt;
5701 	StringInfoData str;
5702 	const char *cp;
5703 	const char *start_ptr;
5704 	const char *end_ptr;
5705 	text	   *result;
5706 	int			arg;
5707 	bool		funcvariadic;
5708 	int			nargs;
5709 	Datum	   *elements = NULL;
5710 	bool	   *nulls = NULL;
5711 	Oid			element_type = InvalidOid;
5712 	Oid			prev_type = InvalidOid;
5713 	Oid			prev_width_type = InvalidOid;
5714 	FmgrInfo	typoutputfinfo;
5715 	FmgrInfo	typoutputinfo_width;
5716 
5717 	/* When format string is null, immediately return null */
5718 	if (PG_ARGISNULL(0))
5719 		PG_RETURN_NULL();
5720 
5721 	/* If argument is marked VARIADIC, expand array into elements */
5722 	if (get_fn_expr_variadic(fcinfo->flinfo))
5723 	{
5724 		ArrayType  *arr;
5725 		int16		elmlen;
5726 		bool		elmbyval;
5727 		char		elmalign;
5728 		int			nitems;
5729 
5730 		/* Should have just the one argument */
5731 		Assert(PG_NARGS() == 2);
5732 
5733 		/* If argument is NULL, we treat it as zero-length array */
5734 		if (PG_ARGISNULL(1))
5735 			nitems = 0;
5736 		else
5737 		{
5738 			/*
5739 			 * Non-null argument had better be an array.  We assume that any
5740 			 * call context that could let get_fn_expr_variadic return true
5741 			 * will have checked that a VARIADIC-labeled parameter actually is
5742 			 * an array.  So it should be okay to just Assert that it's an
5743 			 * array rather than doing a full-fledged error check.
5744 			 */
5745 			Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5746 
5747 			/* OK, safe to fetch the array value */
5748 			arr = PG_GETARG_ARRAYTYPE_P(1);
5749 
5750 			/* Get info about array element type */
5751 			element_type = ARR_ELEMTYPE(arr);
5752 			get_typlenbyvalalign(element_type,
5753 								 &elmlen, &elmbyval, &elmalign);
5754 
5755 			/* Extract all array elements */
5756 			deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5757 							  &elements, &nulls, &nitems);
5758 		}
5759 
5760 		nargs = nitems + 1;
5761 		funcvariadic = true;
5762 	}
5763 	else
5764 	{
5765 		/* Non-variadic case, we'll process the arguments individually */
5766 		nargs = PG_NARGS();
5767 		funcvariadic = false;
5768 	}
5769 
5770 	/* Setup for main loop. */
5771 	fmt = PG_GETARG_TEXT_PP(0);
5772 	start_ptr = VARDATA_ANY(fmt);
5773 	end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5774 	initStringInfo(&str);
5775 	arg = 1;					/* next argument position to print */
5776 
5777 	/* Scan format string, looking for conversion specifiers. */
5778 	for (cp = start_ptr; cp < end_ptr; cp++)
5779 	{
5780 		int			argpos;
5781 		int			widthpos;
5782 		int			flags;
5783 		int			width;
5784 		Datum		value;
5785 		bool		isNull;
5786 		Oid			typid;
5787 
5788 		/*
5789 		 * If it's not the start of a conversion specifier, just copy it to
5790 		 * the output buffer.
5791 		 */
5792 		if (*cp != '%')
5793 		{
5794 			appendStringInfoCharMacro(&str, *cp);
5795 			continue;
5796 		}
5797 
5798 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5799 
5800 		/* Easy case: %% outputs a single % */
5801 		if (*cp == '%')
5802 		{
5803 			appendStringInfoCharMacro(&str, *cp);
5804 			continue;
5805 		}
5806 
5807 		/* Parse the optional portions of the format specifier */
5808 		cp = text_format_parse_format(cp, end_ptr,
5809 									  &argpos, &widthpos,
5810 									  &flags, &width);
5811 
5812 		/*
5813 		 * Next we should see the main conversion specifier.  Whether or not
5814 		 * an argument position was present, it's known that at least one
5815 		 * character remains in the string at this point.  Experience suggests
5816 		 * that it's worth checking that that character is one of the expected
5817 		 * ones before we try to fetch arguments, so as to produce the least
5818 		 * confusing response to a mis-formatted specifier.
5819 		 */
5820 		if (strchr("sIL", *cp) == NULL)
5821 			ereport(ERROR,
5822 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5823 					 errmsg("unrecognized format() type specifier \"%.*s\"",
5824 							pg_mblen(cp), cp),
5825 					 errhint("For a single \"%%\" use \"%%%%\".")));
5826 
5827 		/* If indirect width was specified, get its value */
5828 		if (widthpos >= 0)
5829 		{
5830 			/* Collect the specified or next argument position */
5831 			if (widthpos > 0)
5832 				arg = widthpos;
5833 			if (arg >= nargs)
5834 				ereport(ERROR,
5835 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5836 						 errmsg("too few arguments for format()")));
5837 
5838 			/* Get the value and type of the selected argument */
5839 			if (!funcvariadic)
5840 			{
5841 				value = PG_GETARG_DATUM(arg);
5842 				isNull = PG_ARGISNULL(arg);
5843 				typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5844 			}
5845 			else
5846 			{
5847 				value = elements[arg - 1];
5848 				isNull = nulls[arg - 1];
5849 				typid = element_type;
5850 			}
5851 			if (!OidIsValid(typid))
5852 				elog(ERROR, "could not determine data type of format() input");
5853 
5854 			arg++;
5855 
5856 			/* We can treat NULL width the same as zero */
5857 			if (isNull)
5858 				width = 0;
5859 			else if (typid == INT4OID)
5860 				width = DatumGetInt32(value);
5861 			else if (typid == INT2OID)
5862 				width = DatumGetInt16(value);
5863 			else
5864 			{
5865 				/* For less-usual datatypes, convert to text then to int */
5866 				char	   *str;
5867 
5868 				if (typid != prev_width_type)
5869 				{
5870 					Oid			typoutputfunc;
5871 					bool		typIsVarlena;
5872 
5873 					getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5874 					fmgr_info(typoutputfunc, &typoutputinfo_width);
5875 					prev_width_type = typid;
5876 				}
5877 
5878 				str = OutputFunctionCall(&typoutputinfo_width, value);
5879 
5880 				/* pg_strtoint32 will complain about bad data or overflow */
5881 				width = pg_strtoint32(str);
5882 
5883 				pfree(str);
5884 			}
5885 		}
5886 
5887 		/* Collect the specified or next argument position */
5888 		if (argpos > 0)
5889 			arg = argpos;
5890 		if (arg >= nargs)
5891 			ereport(ERROR,
5892 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5893 					 errmsg("too few arguments for format()")));
5894 
5895 		/* Get the value and type of the selected argument */
5896 		if (!funcvariadic)
5897 		{
5898 			value = PG_GETARG_DATUM(arg);
5899 			isNull = PG_ARGISNULL(arg);
5900 			typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5901 		}
5902 		else
5903 		{
5904 			value = elements[arg - 1];
5905 			isNull = nulls[arg - 1];
5906 			typid = element_type;
5907 		}
5908 		if (!OidIsValid(typid))
5909 			elog(ERROR, "could not determine data type of format() input");
5910 
5911 		arg++;
5912 
5913 		/*
5914 		 * Get the appropriate typOutput function, reusing previous one if
5915 		 * same type as previous argument.  That's particularly useful in the
5916 		 * variadic-array case, but often saves work even for ordinary calls.
5917 		 */
5918 		if (typid != prev_type)
5919 		{
5920 			Oid			typoutputfunc;
5921 			bool		typIsVarlena;
5922 
5923 			getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5924 			fmgr_info(typoutputfunc, &typoutputfinfo);
5925 			prev_type = typid;
5926 		}
5927 
5928 		/*
5929 		 * And now we can format the value.
5930 		 */
5931 		switch (*cp)
5932 		{
5933 			case 's':
5934 			case 'I':
5935 			case 'L':
5936 				text_format_string_conversion(&str, *cp, &typoutputfinfo,
5937 											  value, isNull,
5938 											  flags, width);
5939 				break;
5940 			default:
5941 				/* should not get here, because of previous check */
5942 				ereport(ERROR,
5943 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5944 						 errmsg("unrecognized format() type specifier \"%.*s\"",
5945 								pg_mblen(cp), cp),
5946 						 errhint("For a single \"%%\" use \"%%%%\".")));
5947 				break;
5948 		}
5949 	}
5950 
5951 	/* Don't need deconstruct_array results anymore. */
5952 	if (elements != NULL)
5953 		pfree(elements);
5954 	if (nulls != NULL)
5955 		pfree(nulls);
5956 
5957 	/* Generate results. */
5958 	result = cstring_to_text_with_len(str.data, str.len);
5959 	pfree(str.data);
5960 
5961 	PG_RETURN_TEXT_P(result);
5962 }
5963 
5964 /*
5965  * Parse contiguous digits as a decimal number.
5966  *
5967  * Returns true if some digits could be parsed.
5968  * The value is returned into *value, and *ptr is advanced to the next
5969  * character to be parsed.
5970  *
5971  * Note parsing invariant: at least one character is known available before
5972  * string end (end_ptr) at entry, and this is still true at exit.
5973  */
5974 static bool
5975 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5976 {
5977 	bool		found = false;
5978 	const char *cp = *ptr;
5979 	int			val = 0;
5980 
5981 	while (*cp >= '0' && *cp <= '9')
5982 	{
5983 		int8		digit = (*cp - '0');
5984 
5985 		if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5986 			unlikely(pg_add_s32_overflow(val, digit, &val)))
5987 			ereport(ERROR,
5988 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5989 					 errmsg("number is out of range")));
5990 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5991 		found = true;
5992 	}
5993 
5994 	*ptr = cp;
5995 	*value = val;
5996 
5997 	return found;
5998 }
5999 
6000 /*
6001  * Parse a format specifier (generally following the SUS printf spec).
6002  *
6003  * We have already advanced over the initial '%', and we are looking for
6004  * [argpos][flags][width]type (but the type character is not consumed here).
6005  *
6006  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
6007  * Output parameters:
6008  *	argpos: argument position for value to be printed.  -1 means unspecified.
6009  *	widthpos: argument position for width.  Zero means the argument position
6010  *			was unspecified (ie, take the next arg) and -1 means no width
6011  *			argument (width was omitted or specified as a constant).
6012  *	flags: bitmask of flags.
6013  *	width: directly-specified width value.  Zero means the width was omitted
6014  *			(note it's not necessary to distinguish this case from an explicit
6015  *			zero width value).
6016  *
6017  * The function result is the next character position to be parsed, ie, the
6018  * location where the type character is/should be.
6019  *
6020  * Note parsing invariant: at least one character is known available before
6021  * string end (end_ptr) at entry, and this is still true at exit.
6022  */
6023 static const char *
6024 text_format_parse_format(const char *start_ptr, const char *end_ptr,
6025 						 int *argpos, int *widthpos,
6026 						 int *flags, int *width)
6027 {
6028 	const char *cp = start_ptr;
6029 	int			n;
6030 
6031 	/* set defaults for output parameters */
6032 	*argpos = -1;
6033 	*widthpos = -1;
6034 	*flags = 0;
6035 	*width = 0;
6036 
6037 	/* try to identify first number */
6038 	if (text_format_parse_digits(&cp, end_ptr, &n))
6039 	{
6040 		if (*cp != '$')
6041 		{
6042 			/* Must be just a width and a type, so we're done */
6043 			*width = n;
6044 			return cp;
6045 		}
6046 		/* The number was argument position */
6047 		*argpos = n;
6048 		/* Explicit 0 for argument index is immediately refused */
6049 		if (n == 0)
6050 			ereport(ERROR,
6051 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6052 					 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6053 		ADVANCE_PARSE_POINTER(cp, end_ptr);
6054 	}
6055 
6056 	/* Handle flags (only minus is supported now) */
6057 	while (*cp == '-')
6058 	{
6059 		*flags |= TEXT_FORMAT_FLAG_MINUS;
6060 		ADVANCE_PARSE_POINTER(cp, end_ptr);
6061 	}
6062 
6063 	if (*cp == '*')
6064 	{
6065 		/* Handle indirect width */
6066 		ADVANCE_PARSE_POINTER(cp, end_ptr);
6067 		if (text_format_parse_digits(&cp, end_ptr, &n))
6068 		{
6069 			/* number in this position must be closed by $ */
6070 			if (*cp != '$')
6071 				ereport(ERROR,
6072 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6073 						 errmsg("width argument position must be ended by \"$\"")));
6074 			/* The number was width argument position */
6075 			*widthpos = n;
6076 			/* Explicit 0 for argument index is immediately refused */
6077 			if (n == 0)
6078 				ereport(ERROR,
6079 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6080 						 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6081 			ADVANCE_PARSE_POINTER(cp, end_ptr);
6082 		}
6083 		else
6084 			*widthpos = 0;		/* width's argument position is unspecified */
6085 	}
6086 	else
6087 	{
6088 		/* Check for direct width specification */
6089 		if (text_format_parse_digits(&cp, end_ptr, &n))
6090 			*width = n;
6091 	}
6092 
6093 	/* cp should now be pointing at type character */
6094 	return cp;
6095 }
6096 
6097 /*
6098  * Format a %s, %I, or %L conversion
6099  */
6100 static void
6101 text_format_string_conversion(StringInfo buf, char conversion,
6102 							  FmgrInfo *typOutputInfo,
6103 							  Datum value, bool isNull,
6104 							  int flags, int width)
6105 {
6106 	char	   *str;
6107 
6108 	/* Handle NULL arguments before trying to stringify the value. */
6109 	if (isNull)
6110 	{
6111 		if (conversion == 's')
6112 			text_format_append_string(buf, "", flags, width);
6113 		else if (conversion == 'L')
6114 			text_format_append_string(buf, "NULL", flags, width);
6115 		else if (conversion == 'I')
6116 			ereport(ERROR,
6117 					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6118 					 errmsg("null values cannot be formatted as an SQL identifier")));
6119 		return;
6120 	}
6121 
6122 	/* Stringify. */
6123 	str = OutputFunctionCall(typOutputInfo, value);
6124 
6125 	/* Escape. */
6126 	if (conversion == 'I')
6127 	{
6128 		/* quote_identifier may or may not allocate a new string. */
6129 		text_format_append_string(buf, quote_identifier(str), flags, width);
6130 	}
6131 	else if (conversion == 'L')
6132 	{
6133 		char	   *qstr = quote_literal_cstr(str);
6134 
6135 		text_format_append_string(buf, qstr, flags, width);
6136 		/* quote_literal_cstr() always allocates a new string */
6137 		pfree(qstr);
6138 	}
6139 	else
6140 		text_format_append_string(buf, str, flags, width);
6141 
6142 	/* Cleanup. */
6143 	pfree(str);
6144 }
6145 
6146 /*
6147  * Append str to buf, padding as directed by flags/width
6148  */
6149 static void
6150 text_format_append_string(StringInfo buf, const char *str,
6151 						  int flags, int width)
6152 {
6153 	bool		align_to_left = false;
6154 	int			len;
6155 
6156 	/* fast path for typical easy case */
6157 	if (width == 0)
6158 	{
6159 		appendStringInfoString(buf, str);
6160 		return;
6161 	}
6162 
6163 	if (width < 0)
6164 	{
6165 		/* Negative width: implicit '-' flag, then take absolute value */
6166 		align_to_left = true;
6167 		/* -INT_MIN is undefined */
6168 		if (width <= INT_MIN)
6169 			ereport(ERROR,
6170 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6171 					 errmsg("number is out of range")));
6172 		width = -width;
6173 	}
6174 	else if (flags & TEXT_FORMAT_FLAG_MINUS)
6175 		align_to_left = true;
6176 
6177 	len = pg_mbstrlen(str);
6178 	if (align_to_left)
6179 	{
6180 		/* left justify */
6181 		appendStringInfoString(buf, str);
6182 		if (len < width)
6183 			appendStringInfoSpaces(buf, width - len);
6184 	}
6185 	else
6186 	{
6187 		/* right justify */
6188 		if (len < width)
6189 			appendStringInfoSpaces(buf, width - len);
6190 		appendStringInfoString(buf, str);
6191 	}
6192 }
6193 
6194 /*
6195  * text_format_nv - nonvariadic wrapper for text_format function.
6196  *
6197  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6198  * which checks that all built-in functions that share the implementing C
6199  * function take the same number of arguments.
6200  */
6201 Datum
6202 text_format_nv(PG_FUNCTION_ARGS)
6203 {
6204 	return text_format(fcinfo);
6205 }
6206 
/*
 * rest_of_char_same
 *		Helper for the Levenshtein distance functions: report whether the
 *		first "len" bytes of s1 and s2 are identical.
 *
 * Bytes are compared from the last one backwards.  A length of zero or less
 * compares as equal.  Faster than memcmp() for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			idx = len;

	while (idx > 0)
	{
		if (s1[idx - 1] != s2[idx - 1])
			return false;
		idx--;
	}

	return true;
}
6222 
6223 /* Expand each Levenshtein distance variant */
6224 #include "levenshtein.c"
6225 #define LEVENSHTEIN_LESS_EQUAL
6226 #include "levenshtein.c"
6227 
6228 
6229 /*
6230  * Unicode support
6231  */
6232 
6233 static UnicodeNormalizationForm
6234 unicode_norm_form_from_string(const char *formstr)
6235 {
6236 	UnicodeNormalizationForm form = -1;
6237 
6238 	/*
6239 	 * Might as well check this while we're here.
6240 	 */
6241 	if (GetDatabaseEncoding() != PG_UTF8)
6242 		ereport(ERROR,
6243 				(errcode(ERRCODE_SYNTAX_ERROR),
6244 				 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6245 
6246 	if (pg_strcasecmp(formstr, "NFC") == 0)
6247 		form = UNICODE_NFC;
6248 	else if (pg_strcasecmp(formstr, "NFD") == 0)
6249 		form = UNICODE_NFD;
6250 	else if (pg_strcasecmp(formstr, "NFKC") == 0)
6251 		form = UNICODE_NFKC;
6252 	else if (pg_strcasecmp(formstr, "NFKD") == 0)
6253 		form = UNICODE_NFKD;
6254 	else
6255 		ereport(ERROR,
6256 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6257 				 errmsg("invalid normalization form: %s", formstr)));
6258 
6259 	return form;
6260 }
6261 
6262 Datum
6263 unicode_normalize_func(PG_FUNCTION_ARGS)
6264 {
6265 	text	   *input = PG_GETARG_TEXT_PP(0);
6266 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6267 	UnicodeNormalizationForm form;
6268 	int			size;
6269 	pg_wchar   *input_chars;
6270 	pg_wchar   *output_chars;
6271 	unsigned char *p;
6272 	text	   *result;
6273 	int			i;
6274 
6275 	form = unicode_norm_form_from_string(formstr);
6276 
6277 	/* convert to pg_wchar */
6278 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6279 	input_chars = palloc((size + 1) * sizeof(pg_wchar));
6280 	p = (unsigned char *) VARDATA_ANY(input);
6281 	for (i = 0; i < size; i++)
6282 	{
6283 		input_chars[i] = utf8_to_unicode(p);
6284 		p += pg_utf_mblen(p);
6285 	}
6286 	input_chars[i] = (pg_wchar) '\0';
6287 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6288 
6289 	/* action */
6290 	output_chars = unicode_normalize(form, input_chars);
6291 
6292 	/* convert back to UTF-8 string */
6293 	size = 0;
6294 	for (pg_wchar *wp = output_chars; *wp; wp++)
6295 	{
6296 		unsigned char buf[4];
6297 
6298 		unicode_to_utf8(*wp, buf);
6299 		size += pg_utf_mblen(buf);
6300 	}
6301 
6302 	result = palloc(size + VARHDRSZ);
6303 	SET_VARSIZE(result, size + VARHDRSZ);
6304 
6305 	p = (unsigned char *) VARDATA_ANY(result);
6306 	for (pg_wchar *wp = output_chars; *wp; wp++)
6307 	{
6308 		unicode_to_utf8(*wp, p);
6309 		p += pg_utf_mblen(p);
6310 	}
6311 	Assert((char *) p == (char *) result + size + VARHDRSZ);
6312 
6313 	PG_RETURN_TEXT_P(result);
6314 }
6315 
6316 /*
6317  * Check whether the string is in the specified Unicode normalization form.
6318  *
6319  * This is done by converting the string to the specified normal form and then
6320  * comparing that to the original string.  To speed that up, we also apply the
6321  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6322  * answer for many strings by just scanning the string once.
6323  *
6324  * This function should generally be optimized for the case where the string
6325  * is in fact normalized.  In that case, we'll end up looking at the entire
6326  * string, so it's probably not worth doing any incremental conversion etc.
6327  */
6328 Datum
6329 unicode_is_normalized(PG_FUNCTION_ARGS)
6330 {
6331 	text	   *input = PG_GETARG_TEXT_PP(0);
6332 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6333 	UnicodeNormalizationForm form;
6334 	int			size;
6335 	pg_wchar   *input_chars;
6336 	pg_wchar   *output_chars;
6337 	unsigned char *p;
6338 	int			i;
6339 	UnicodeNormalizationQC quickcheck;
6340 	int			output_size;
6341 	bool		result;
6342 
6343 	form = unicode_norm_form_from_string(formstr);
6344 
6345 	/* convert to pg_wchar */
6346 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6347 	input_chars = palloc((size + 1) * sizeof(pg_wchar));
6348 	p = (unsigned char *) VARDATA_ANY(input);
6349 	for (i = 0; i < size; i++)
6350 	{
6351 		input_chars[i] = utf8_to_unicode(p);
6352 		p += pg_utf_mblen(p);
6353 	}
6354 	input_chars[i] = (pg_wchar) '\0';
6355 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6356 
6357 	/* quick check (see UAX #15) */
6358 	quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6359 	if (quickcheck == UNICODE_NORM_QC_YES)
6360 		PG_RETURN_BOOL(true);
6361 	else if (quickcheck == UNICODE_NORM_QC_NO)
6362 		PG_RETURN_BOOL(false);
6363 
6364 	/* normalize and compare with original */
6365 	output_chars = unicode_normalize(form, input_chars);
6366 
6367 	output_size = 0;
6368 	for (pg_wchar *wp = output_chars; *wp; wp++)
6369 		output_size++;
6370 
6371 	result = (size == output_size) &&
6372 		(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6373 
6374 	PG_RETURN_BOOL(result);
6375 }
6376 
6377 /*
6378  * Check if first n chars are hexadecimal digits
6379  */
6380 static bool
6381 isxdigits_n(const char *instr, size_t n)
6382 {
6383 	for (size_t i = 0; i < n; i++)
6384 		if (!isxdigit((unsigned char) instr[i]))
6385 			return false;
6386 
6387 	return true;
6388 }
6389 
6390 static unsigned int
6391 hexval(unsigned char c)
6392 {
6393 	if (c >= '0' && c <= '9')
6394 		return c - '0';
6395 	if (c >= 'a' && c <= 'f')
6396 		return c - 'a' + 0xA;
6397 	if (c >= 'A' && c <= 'F')
6398 		return c - 'A' + 0xA;
6399 	elog(ERROR, "invalid hexadecimal digit");
6400 	return 0;					/* not reached */
6401 }
6402 
6403 /*
6404  * Translate string with hexadecimal digits to number
6405  */
6406 static unsigned int
6407 hexval_n(const char *instr, size_t n)
6408 {
6409 	unsigned int result = 0;
6410 
6411 	for (size_t i = 0; i < n; i++)
6412 		result += hexval(instr[i]) << (4 * (n - i - 1));
6413 
6414 	return result;
6415 }
6416 
6417 /*
6418  * Replaces Unicode escape sequences by Unicode characters
6419  */
6420 Datum
6421 unistr(PG_FUNCTION_ARGS)
6422 {
6423 	text	   *input_text = PG_GETARG_TEXT_PP(0);
6424 	char	   *instr;
6425 	int			len;
6426 	StringInfoData str;
6427 	text	   *result;
6428 	pg_wchar	pair_first = 0;
6429 	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6430 
6431 	instr = VARDATA_ANY(input_text);
6432 	len = VARSIZE_ANY_EXHDR(input_text);
6433 
6434 	initStringInfo(&str);
6435 
6436 	while (len > 0)
6437 	{
6438 		if (instr[0] == '\\')
6439 		{
6440 			if (len >= 2 &&
6441 				instr[1] == '\\')
6442 			{
6443 				if (pair_first)
6444 					goto invalid_pair;
6445 				appendStringInfoChar(&str, '\\');
6446 				instr += 2;
6447 				len -= 2;
6448 			}
6449 			else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6450 					 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6451 			{
6452 				pg_wchar	unicode;
6453 				int			offset = instr[1] == 'u' ? 2 : 1;
6454 
6455 				unicode = hexval_n(instr + offset, 4);
6456 
6457 				if (!is_valid_unicode_codepoint(unicode))
6458 					ereport(ERROR,
6459 							errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6460 							errmsg("invalid Unicode code point: %04X", unicode));
6461 
6462 				if (pair_first)
6463 				{
6464 					if (is_utf16_surrogate_second(unicode))
6465 					{
6466 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6467 						pair_first = 0;
6468 					}
6469 					else
6470 						goto invalid_pair;
6471 				}
6472 				else if (is_utf16_surrogate_second(unicode))
6473 					goto invalid_pair;
6474 
6475 				if (is_utf16_surrogate_first(unicode))
6476 					pair_first = unicode;
6477 				else
6478 				{
6479 					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6480 					appendStringInfoString(&str, cbuf);
6481 				}
6482 
6483 				instr += 4 + offset;
6484 				len -= 4 + offset;
6485 			}
6486 			else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6487 			{
6488 				pg_wchar	unicode;
6489 
6490 				unicode = hexval_n(instr + 2, 6);
6491 
6492 				if (!is_valid_unicode_codepoint(unicode))
6493 					ereport(ERROR,
6494 							errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6495 							errmsg("invalid Unicode code point: %04X", unicode));
6496 
6497 				if (pair_first)
6498 				{
6499 					if (is_utf16_surrogate_second(unicode))
6500 					{
6501 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6502 						pair_first = 0;
6503 					}
6504 					else
6505 						goto invalid_pair;
6506 				}
6507 				else if (is_utf16_surrogate_second(unicode))
6508 					goto invalid_pair;
6509 
6510 				if (is_utf16_surrogate_first(unicode))
6511 					pair_first = unicode;
6512 				else
6513 				{
6514 					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6515 					appendStringInfoString(&str, cbuf);
6516 				}
6517 
6518 				instr += 8;
6519 				len -= 8;
6520 			}
6521 			else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6522 			{
6523 				pg_wchar	unicode;
6524 
6525 				unicode = hexval_n(instr + 2, 8);
6526 
6527 				if (!is_valid_unicode_codepoint(unicode))
6528 					ereport(ERROR,
6529 							errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6530 							errmsg("invalid Unicode code point: %04X", unicode));
6531 
6532 				if (pair_first)
6533 				{
6534 					if (is_utf16_surrogate_second(unicode))
6535 					{
6536 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6537 						pair_first = 0;
6538 					}
6539 					else
6540 						goto invalid_pair;
6541 				}
6542 				else if (is_utf16_surrogate_second(unicode))
6543 					goto invalid_pair;
6544 
6545 				if (is_utf16_surrogate_first(unicode))
6546 					pair_first = unicode;
6547 				else
6548 				{
6549 					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6550 					appendStringInfoString(&str, cbuf);
6551 				}
6552 
6553 				instr += 10;
6554 				len -= 10;
6555 			}
6556 			else
6557 				ereport(ERROR,
6558 						(errcode(ERRCODE_SYNTAX_ERROR),
6559 						 errmsg("invalid Unicode escape"),
6560 						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6561 		}
6562 		else
6563 		{
6564 			if (pair_first)
6565 				goto invalid_pair;
6566 
6567 			appendStringInfoChar(&str, *instr++);
6568 			len--;
6569 		}
6570 	}
6571 
6572 	/* unfinished surrogate pair? */
6573 	if (pair_first)
6574 		goto invalid_pair;
6575 
6576 	result = cstring_to_text_with_len(str.data, str.len);
6577 	pfree(str.data);
6578 
6579 	PG_RETURN_TEXT_P(result);
6580 
6581 invalid_pair:
6582 	ereport(ERROR,
6583 			(errcode(ERRCODE_SYNTAX_ERROR),
6584 			 errmsg("invalid Unicode surrogate pair")));
6585 	PG_RETURN_NULL();			/* keep compiler quiet */
6586 }
6587