1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  *	  Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/tuptoaster.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/int.h"
24 #include "lib/hyperloglog.h"
25 #include "libpq/pqformat.h"
26 #include "miscadmin.h"
27 #include "parser/scansup.h"
28 #include "port/pg_bswap.h"
29 #include "regex/regex.h"
30 #include "utils/builtins.h"
31 #include "utils/bytea.h"
32 #include "utils/hashutils.h"
33 #include "utils/lsyscache.h"
34 #include "utils/memutils.h"
35 #include "utils/pg_locale.h"
36 #include "utils/sortsupport.h"
37 #include "utils/varlena.h"
38 
39 
40 /* GUC variable */
41 int			bytea_output = BYTEA_OUTPUT_HEX;
42 
43 typedef struct varlena unknown;
44 typedef struct varlena VarString;
45 
/*
 * Working state shared by the text_position_* family of functions.
 */
typedef struct
{
	/* true when the database encoding is multibyte */
	bool		is_multibyte;
	/* NOTE(review): no description in the original; confirm meaning at setup */
	bool		is_multibyte_char_in_char;

	/* the string being searched ("haystack") and the pattern ("needle") */
	char	   *str1;
	char	   *str2;
	/* byte lengths of str1 and str2, respectively */
	int			len1;
	int			len2;

	/* Boyer-Moore-Horspool skip table */
	int			skiptablemask;	/* ANDed with a byte to form the subscript */
	int			skiptable[256]; /* how far to skip for a mismatched char */

	/* location of the most recent match inside str1 */
	char	   *last_match;

	/*
	 * Cache for byte-position to character-position conversion.  The last
	 * converted position is remembered so the next conversion can resume
	 * from there instead of recounting characters from the start.
	 */
	char	   *refpoint;		/* pointer into the original haystack */
	int			refpos;			/* 0-based character offset of refpoint */
} TextPositionState;
74 
75 typedef struct
76 {
77 	char	   *buf1;			/* 1st string, or abbreviation original string
78 								 * buf */
79 	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
80 	int			buflen1;
81 	int			buflen2;
82 	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
83 	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
84 	int			last_returned;	/* Last comparison result (cache) */
85 	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
86 	bool		collate_c;
87 	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
88 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
89 	hyperLogLogState full_card; /* Full key cardinality state */
90 	double		prop_card;		/* Required cardinality proportion */
91 	pg_locale_t locale;
92 } VarStringSortSupport;
93 
94 /*
95  * This should be large enough that most strings will fit, but small enough
96  * that we feel comfortable putting it on the stack
97  */
98 #define TEXTBUFLEN		1024
99 
100 #define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
101 #define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
102 #define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
103 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
104 #define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)
105 
106 #define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
107 #define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
108 
109 static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
110 static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
111 static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
112 static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
113 static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
114 static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
115 static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
116 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
117 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
118 static int32 text_length(Datum str);
119 static text *text_catenate(text *t1, text *t2);
120 static text *text_substring(Datum str,
121 							int32 start,
122 							int32 length,
123 							bool length_not_specified);
124 static text *text_overlay(text *t1, text *t2, int sp, int sl);
125 static int	text_position(text *t1, text *t2, Oid collid);
126 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
127 static bool text_position_next(TextPositionState *state);
128 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
129 static char *text_position_get_match_ptr(TextPositionState *state);
130 static int	text_position_get_match_pos(TextPositionState *state);
131 static void text_position_cleanup(TextPositionState *state);
132 static void check_collation_set(Oid collid);
133 static int	text_cmp(text *arg1, text *arg2, Oid collid);
134 static bytea *bytea_catenate(bytea *t1, bytea *t2);
135 static bytea *bytea_substring(Datum str,
136 							  int S,
137 							  int L,
138 							  bool length_not_specified);
139 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
140 static void appendStringInfoText(StringInfo str, const text *t);
141 static Datum text_to_array_internal(PG_FUNCTION_ARGS);
142 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
143 									const char *fldsep, const char *null_string);
144 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
145 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
146 									 int *value);
147 static const char *text_format_parse_format(const char *start_ptr,
148 											const char *end_ptr,
149 											int *argpos, int *widthpos,
150 											int *flags, int *width);
151 static void text_format_string_conversion(StringInfo buf, char conversion,
152 										  FmgrInfo *typOutputInfo,
153 										  Datum value, bool isNull,
154 										  int flags, int width);
155 static void text_format_append_string(StringInfo buf, const char *str,
156 									  int flags, int width);
157 
158 
159 /*****************************************************************************
160  *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
161  *****************************************************************************/
162 
163 /*
164  * cstring_to_text
165  *
166  * Create a text value from a null-terminated C string.
167  *
168  * The new text value is freshly palloc'd with a full-size VARHDR.
169  */
170 text *
cstring_to_text(const char * s)171 cstring_to_text(const char *s)
172 {
173 	return cstring_to_text_with_len(s, strlen(s));
174 }
175 
176 /*
177  * cstring_to_text_with_len
178  *
179  * Same as cstring_to_text except the caller specifies the string length;
180  * the string need not be null_terminated.
181  */
182 text *
cstring_to_text_with_len(const char * s,int len)183 cstring_to_text_with_len(const char *s, int len)
184 {
185 	text	   *result = (text *) palloc(len + VARHDRSZ);
186 
187 	SET_VARSIZE(result, len + VARHDRSZ);
188 	memcpy(VARDATA(result), s, len);
189 
190 	return result;
191 }
192 
193 /*
194  * text_to_cstring
195  *
196  * Create a palloc'd, null-terminated C string from a text value.
197  *
198  * We support being passed a compressed or toasted text value.
199  * This is a bit bogus since such values shouldn't really be referred to as
200  * "text *", but it seems useful for robustness.  If we didn't handle that
201  * case here, we'd need another routine that did, anyway.
202  */
203 char *
text_to_cstring(const text * t)204 text_to_cstring(const text *t)
205 {
206 	/* must cast away the const, unfortunately */
207 	text	   *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
208 	int			len = VARSIZE_ANY_EXHDR(tunpacked);
209 	char	   *result;
210 
211 	result = (char *) palloc(len + 1);
212 	memcpy(result, VARDATA_ANY(tunpacked), len);
213 	result[len] = '\0';
214 
215 	if (tunpacked != t)
216 		pfree(tunpacked);
217 
218 	return result;
219 }
220 
221 /*
222  * text_to_cstring_buffer
223  *
224  * Copy a text value into a caller-supplied buffer of size dst_len.
225  *
226  * The text string is truncated if necessary to fit.  The result is
227  * guaranteed null-terminated (unless dst_len == 0).
228  *
229  * We support being passed a compressed or toasted text value.
230  * This is a bit bogus since such values shouldn't really be referred to as
231  * "text *", but it seems useful for robustness.  If we didn't handle that
232  * case here, we'd need another routine that did, anyway.
233  */
234 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)235 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
236 {
237 	/* must cast away the const, unfortunately */
238 	text	   *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
239 	size_t		src_len = VARSIZE_ANY_EXHDR(srcunpacked);
240 
241 	if (dst_len > 0)
242 	{
243 		dst_len--;
244 		if (dst_len >= src_len)
245 			dst_len = src_len;
246 		else					/* ensure truncation is encoding-safe */
247 			dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
248 		memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
249 		dst[dst_len] = '\0';
250 	}
251 
252 	if (srcunpacked != src)
253 		pfree(srcunpacked);
254 }
255 
256 
257 /*****************************************************************************
258  *	 USER I/O ROUTINES														 *
259  *****************************************************************************/
260 
261 
262 #define VAL(CH)			((CH) - '0')
263 #define DIG(VAL)		((VAL) + '0')
264 
265 /*
266  *		byteain			- converts from printable representation of byte array
267  *
268  *		Non-printable characters must be passed as '\nnn' (octal) and are
269  *		converted to internal form.  '\' must be passed as '\\'.
270  *		ereport(ERROR, ...) if bad form.
271  *
272  *		BUGS:
273  *				The input is scanned twice.
274  *				The error checking of input is minimal.
275  */
276 Datum
byteain(PG_FUNCTION_ARGS)277 byteain(PG_FUNCTION_ARGS)
278 {
279 	char	   *inputText = PG_GETARG_CSTRING(0);
280 	char	   *tp;
281 	char	   *rp;
282 	int			bc;
283 	bytea	   *result;
284 
285 	/* Recognize hex input */
286 	if (inputText[0] == '\\' && inputText[1] == 'x')
287 	{
288 		size_t		len = strlen(inputText);
289 
290 		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
291 		result = palloc(bc);
292 		bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
293 		SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
294 
295 		PG_RETURN_BYTEA_P(result);
296 	}
297 
298 	/* Else, it's the traditional escaped style */
299 	for (bc = 0, tp = inputText; *tp != '\0'; bc++)
300 	{
301 		if (tp[0] != '\\')
302 			tp++;
303 		else if ((tp[0] == '\\') &&
304 				 (tp[1] >= '0' && tp[1] <= '3') &&
305 				 (tp[2] >= '0' && tp[2] <= '7') &&
306 				 (tp[3] >= '0' && tp[3] <= '7'))
307 			tp += 4;
308 		else if ((tp[0] == '\\') &&
309 				 (tp[1] == '\\'))
310 			tp += 2;
311 		else
312 		{
313 			/*
314 			 * one backslash, not followed by another or ### valid octal
315 			 */
316 			ereport(ERROR,
317 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
318 					 errmsg("invalid input syntax for type %s", "bytea")));
319 		}
320 	}
321 
322 	bc += VARHDRSZ;
323 
324 	result = (bytea *) palloc(bc);
325 	SET_VARSIZE(result, bc);
326 
327 	tp = inputText;
328 	rp = VARDATA(result);
329 	while (*tp != '\0')
330 	{
331 		if (tp[0] != '\\')
332 			*rp++ = *tp++;
333 		else if ((tp[0] == '\\') &&
334 				 (tp[1] >= '0' && tp[1] <= '3') &&
335 				 (tp[2] >= '0' && tp[2] <= '7') &&
336 				 (tp[3] >= '0' && tp[3] <= '7'))
337 		{
338 			bc = VAL(tp[1]);
339 			bc <<= 3;
340 			bc += VAL(tp[2]);
341 			bc <<= 3;
342 			*rp++ = bc + VAL(tp[3]);
343 
344 			tp += 4;
345 		}
346 		else if ((tp[0] == '\\') &&
347 				 (tp[1] == '\\'))
348 		{
349 			*rp++ = '\\';
350 			tp += 2;
351 		}
352 		else
353 		{
354 			/*
355 			 * We should never get here. The first pass should not allow it.
356 			 */
357 			ereport(ERROR,
358 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
359 					 errmsg("invalid input syntax for type %s", "bytea")));
360 		}
361 	}
362 
363 	PG_RETURN_BYTEA_P(result);
364 }
365 
366 /*
367  *		byteaout		- converts to printable representation of byte array
368  *
369  *		In the traditional escaped format, non-printable characters are
370  *		printed as '\nnn' (octal) and '\' as '\\'.
371  */
372 Datum
byteaout(PG_FUNCTION_ARGS)373 byteaout(PG_FUNCTION_ARGS)
374 {
375 	bytea	   *vlena = PG_GETARG_BYTEA_PP(0);
376 	char	   *result;
377 	char	   *rp;
378 
379 	if (bytea_output == BYTEA_OUTPUT_HEX)
380 	{
381 		/* Print hex format */
382 		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
383 		*rp++ = '\\';
384 		*rp++ = 'x';
385 		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
386 	}
387 	else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
388 	{
389 		/* Print traditional escaped format */
390 		char	   *vp;
391 		int			len;
392 		int			i;
393 
394 		len = 1;				/* empty string has 1 char */
395 		vp = VARDATA_ANY(vlena);
396 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
397 		{
398 			if (*vp == '\\')
399 				len += 2;
400 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
401 				len += 4;
402 			else
403 				len++;
404 		}
405 		rp = result = (char *) palloc(len);
406 		vp = VARDATA_ANY(vlena);
407 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
408 		{
409 			if (*vp == '\\')
410 			{
411 				*rp++ = '\\';
412 				*rp++ = '\\';
413 			}
414 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
415 			{
416 				int			val;	/* holds unprintable chars */
417 
418 				val = *vp;
419 				rp[0] = '\\';
420 				rp[3] = DIG(val & 07);
421 				val >>= 3;
422 				rp[2] = DIG(val & 07);
423 				val >>= 3;
424 				rp[1] = DIG(val & 03);
425 				rp += 4;
426 			}
427 			else
428 				*rp++ = *vp;
429 		}
430 	}
431 	else
432 	{
433 		elog(ERROR, "unrecognized bytea_output setting: %d",
434 			 bytea_output);
435 		rp = result = NULL;		/* keep compiler quiet */
436 	}
437 	*rp = '\0';
438 	PG_RETURN_CSTRING(result);
439 }
440 
441 /*
442  *		bytearecv			- converts external binary format to bytea
443  */
444 Datum
bytearecv(PG_FUNCTION_ARGS)445 bytearecv(PG_FUNCTION_ARGS)
446 {
447 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
448 	bytea	   *result;
449 	int			nbytes;
450 
451 	nbytes = buf->len - buf->cursor;
452 	result = (bytea *) palloc(nbytes + VARHDRSZ);
453 	SET_VARSIZE(result, nbytes + VARHDRSZ);
454 	pq_copymsgbytes(buf, VARDATA(result), nbytes);
455 	PG_RETURN_BYTEA_P(result);
456 }
457 
458 /*
459  *		byteasend			- converts bytea to binary format
460  *
461  * This is a special case: just copy the input...
462  */
463 Datum
byteasend(PG_FUNCTION_ARGS)464 byteasend(PG_FUNCTION_ARGS)
465 {
466 	bytea	   *vlena = PG_GETARG_BYTEA_P_COPY(0);
467 
468 	PG_RETURN_BYTEA_P(vlena);
469 }
470 
471 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)472 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
473 {
474 	StringInfo	state;
475 
476 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
477 
478 	/* Append the value unless null. */
479 	if (!PG_ARGISNULL(1))
480 	{
481 		bytea	   *value = PG_GETARG_BYTEA_PP(1);
482 
483 		/* On the first time through, we ignore the delimiter. */
484 		if (state == NULL)
485 			state = makeStringAggState(fcinfo);
486 		else if (!PG_ARGISNULL(2))
487 		{
488 			bytea	   *delim = PG_GETARG_BYTEA_PP(2);
489 
490 			appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
491 		}
492 
493 		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
494 	}
495 
496 	/*
497 	 * The transition type for string_agg() is declared to be "internal",
498 	 * which is a pass-by-value type the same size as a pointer.
499 	 */
500 	PG_RETURN_POINTER(state);
501 }
502 
503 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)504 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
505 {
506 	StringInfo	state;
507 
508 	/* cannot be called directly because of internal-type argument */
509 	Assert(AggCheckCallContext(fcinfo, NULL));
510 
511 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
512 
513 	if (state != NULL)
514 	{
515 		bytea	   *result;
516 
517 		result = (bytea *) palloc(state->len + VARHDRSZ);
518 		SET_VARSIZE(result, state->len + VARHDRSZ);
519 		memcpy(VARDATA(result), state->data, state->len);
520 		PG_RETURN_BYTEA_P(result);
521 	}
522 	else
523 		PG_RETURN_NULL();
524 }
525 
526 /*
527  *		textin			- converts "..." to internal representation
528  */
529 Datum
textin(PG_FUNCTION_ARGS)530 textin(PG_FUNCTION_ARGS)
531 {
532 	char	   *inputText = PG_GETARG_CSTRING(0);
533 
534 	PG_RETURN_TEXT_P(cstring_to_text(inputText));
535 }
536 
537 /*
538  *		textout			- converts internal representation to "..."
539  */
540 Datum
textout(PG_FUNCTION_ARGS)541 textout(PG_FUNCTION_ARGS)
542 {
543 	Datum		txt = PG_GETARG_DATUM(0);
544 
545 	PG_RETURN_CSTRING(TextDatumGetCString(txt));
546 }
547 
548 /*
549  *		textrecv			- converts external binary format to text
550  */
551 Datum
textrecv(PG_FUNCTION_ARGS)552 textrecv(PG_FUNCTION_ARGS)
553 {
554 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
555 	text	   *result;
556 	char	   *str;
557 	int			nbytes;
558 
559 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
560 
561 	result = cstring_to_text_with_len(str, nbytes);
562 	pfree(str);
563 	PG_RETURN_TEXT_P(result);
564 }
565 
566 /*
567  *		textsend			- converts text to binary format
568  */
569 Datum
textsend(PG_FUNCTION_ARGS)570 textsend(PG_FUNCTION_ARGS)
571 {
572 	text	   *t = PG_GETARG_TEXT_PP(0);
573 	StringInfoData buf;
574 
575 	pq_begintypsend(&buf);
576 	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
577 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
578 }
579 
580 
581 /*
582  *		unknownin			- converts "..." to internal representation
583  */
584 Datum
unknownin(PG_FUNCTION_ARGS)585 unknownin(PG_FUNCTION_ARGS)
586 {
587 	char	   *str = PG_GETARG_CSTRING(0);
588 
589 	/* representation is same as cstring */
590 	PG_RETURN_CSTRING(pstrdup(str));
591 }
592 
593 /*
594  *		unknownout			- converts internal representation to "..."
595  */
596 Datum
unknownout(PG_FUNCTION_ARGS)597 unknownout(PG_FUNCTION_ARGS)
598 {
599 	/* representation is same as cstring */
600 	char	   *str = PG_GETARG_CSTRING(0);
601 
602 	PG_RETURN_CSTRING(pstrdup(str));
603 }
604 
605 /*
606  *		unknownrecv			- converts external binary format to unknown
607  */
608 Datum
unknownrecv(PG_FUNCTION_ARGS)609 unknownrecv(PG_FUNCTION_ARGS)
610 {
611 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
612 	char	   *str;
613 	int			nbytes;
614 
615 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
616 	/* representation is same as cstring */
617 	PG_RETURN_CSTRING(str);
618 }
619 
620 /*
621  *		unknownsend			- converts unknown to binary format
622  */
623 Datum
unknownsend(PG_FUNCTION_ARGS)624 unknownsend(PG_FUNCTION_ARGS)
625 {
626 	/* representation is same as cstring */
627 	char	   *str = PG_GETARG_CSTRING(0);
628 	StringInfoData buf;
629 
630 	pq_begintypsend(&buf);
631 	pq_sendtext(&buf, str, strlen(str));
632 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
633 }
634 
635 
636 /* ========== PUBLIC ROUTINES ========== */
637 
638 /*
639  * textlen -
640  *	  returns the logical length of a text*
641  *	   (which is less than the VARSIZE of the text*)
642  */
643 Datum
textlen(PG_FUNCTION_ARGS)644 textlen(PG_FUNCTION_ARGS)
645 {
646 	Datum		str = PG_GETARG_DATUM(0);
647 
648 	/* try to avoid decompressing argument */
649 	PG_RETURN_INT32(text_length(str));
650 }
651 
652 /*
653  * text_length -
654  *	Does the real work for textlen()
655  *
656  *	This is broken out so it can be called directly by other string processing
657  *	functions.  Note that the argument is passed as a Datum, to indicate that
658  *	it may still be in compressed form.  We can avoid decompressing it at all
659  *	in some cases.
660  */
661 static int32
text_length(Datum str)662 text_length(Datum str)
663 {
664 	/* fastpath when max encoding length is one */
665 	if (pg_database_encoding_max_length() == 1)
666 		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
667 	else
668 	{
669 		text	   *t = DatumGetTextPP(str);
670 
671 		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
672 											 VARSIZE_ANY_EXHDR(t)));
673 	}
674 }
675 
676 /*
677  * textoctetlen -
678  *	  returns the physical length of a text*
679  *	   (which is less than the VARSIZE of the text*)
680  */
681 Datum
textoctetlen(PG_FUNCTION_ARGS)682 textoctetlen(PG_FUNCTION_ARGS)
683 {
684 	Datum		str = PG_GETARG_DATUM(0);
685 
686 	/* We need not detoast the input at all */
687 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
688 }
689 
690 /*
691  * textcat -
692  *	  takes two text* and returns a text* that is the concatenation of
693  *	  the two.
694  *
695  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
696  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
697  * Allocate space for output in all cases.
698  * XXX - thomas 1997-07-10
699  */
700 Datum
textcat(PG_FUNCTION_ARGS)701 textcat(PG_FUNCTION_ARGS)
702 {
703 	text	   *t1 = PG_GETARG_TEXT_PP(0);
704 	text	   *t2 = PG_GETARG_TEXT_PP(1);
705 
706 	PG_RETURN_TEXT_P(text_catenate(t1, t2));
707 }
708 
709 /*
710  * text_catenate
711  *	Guts of textcat(), broken out so it can be used by other functions
712  *
713  * Arguments can be in short-header form, but not compressed or out-of-line
714  */
715 static text *
text_catenate(text * t1,text * t2)716 text_catenate(text *t1, text *t2)
717 {
718 	text	   *result;
719 	int			len1,
720 				len2,
721 				len;
722 	char	   *ptr;
723 
724 	len1 = VARSIZE_ANY_EXHDR(t1);
725 	len2 = VARSIZE_ANY_EXHDR(t2);
726 
727 	/* paranoia ... probably should throw error instead? */
728 	if (len1 < 0)
729 		len1 = 0;
730 	if (len2 < 0)
731 		len2 = 0;
732 
733 	len = len1 + len2 + VARHDRSZ;
734 	result = (text *) palloc(len);
735 
736 	/* Set size of result string... */
737 	SET_VARSIZE(result, len);
738 
739 	/* Fill data field of result string... */
740 	ptr = VARDATA(result);
741 	if (len1 > 0)
742 		memcpy(ptr, VARDATA_ANY(t1), len1);
743 	if (len2 > 0)
744 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
745 
746 	return result;
747 }
748 
/*
 * charlen_to_bytelen()
 *	Return how many bytes the first n characters at *p occupy
 *
 * The caller must guarantee that n characters really are present; no
 * terminator is looked for, so the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* in a single-byte encoding every character is exactly one byte */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
774 
775 /*
776  * text_substr()
777  * Return a substring starting at the specified position.
778  * - thomas 1997-12-31
779  *
780  * Input:
781  *	- string
782  *	- starting position (is one-based)
783  *	- string length
784  *
785  * If the starting position is zero or less, then return from the start of the string
786  *	adjusting the length to be consistent with the "negative start" per SQL.
787  * If the length is less than zero, return the remaining string.
788  *
789  * Added multibyte support.
790  * - Tatsuo Ishii 1998-4-21
791  * Changed behavior if starting position is less than one to conform to SQL behavior.
792  * Formerly returned the entire string; now returns a portion.
793  * - Thomas Lockhart 1998-12-10
794  * Now uses faster TOAST-slicing interface
795  * - John Gray 2002-02-22
796  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
797  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
798  * error; if E < 1, return '', not entire string). Fixed MB related bug when
799  * S > LC and < LC + 4 sometimes garbage characters are returned.
800  * - Joe Conway 2002-08-10
801  */
802 Datum
text_substr(PG_FUNCTION_ARGS)803 text_substr(PG_FUNCTION_ARGS)
804 {
805 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
806 									PG_GETARG_INT32(1),
807 									PG_GETARG_INT32(2),
808 									false));
809 }
810 
811 /*
812  * text_substr_no_len -
813  *	  Wrapper to avoid opr_sanity failure due to
814  *	  one function accepting a different number of args.
815  */
816 Datum
text_substr_no_len(PG_FUNCTION_ARGS)817 text_substr_no_len(PG_FUNCTION_ARGS)
818 {
819 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
820 									PG_GETARG_INT32(1),
821 									-1, true));
822 }
823 
824 /*
825  * text_substring -
826  *	Does the real work for text_substr() and text_substr_no_len()
827  *
828  *	This is broken out so it can be called directly by other string processing
829  *	functions.  Note that the argument is passed as a Datum, to indicate that
830  *	it may still be in compressed/toasted form.  We can avoid detoasting all
831  *	of it in some cases.
832  *
833  *	The result is always a freshly palloc'd datum.
834  */
835 static text *
text_substring(Datum str,int32 start,int32 length,bool length_not_specified)836 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
837 {
838 	int32		eml = pg_database_encoding_max_length();
839 	int32		S = start;		/* start position */
840 	int32		S1;				/* adjusted start position */
841 	int32		L1;				/* adjusted substring length */
842 	int32		E;				/* end position */
843 
844 	/*
845 	 * SQL99 says S can be zero or negative, but we still must fetch from the
846 	 * start of the string.
847 	 */
848 	S1 = Max(S, 1);
849 
850 	/* life is easy if the encoding max length is 1 */
851 	if (eml == 1)
852 	{
853 		if (length_not_specified)	/* special case - get length to end of
854 									 * string */
855 			L1 = -1;
856 		else if (length < 0)
857 		{
858 			/* SQL99 says to throw an error for E < S, i.e., negative length */
859 			ereport(ERROR,
860 					(errcode(ERRCODE_SUBSTRING_ERROR),
861 					 errmsg("negative substring length not allowed")));
862 			L1 = -1;			/* silence stupider compilers */
863 		}
864 		else if (pg_add_s32_overflow(S, length, &E))
865 		{
866 			/*
867 			 * L could be large enough for S + L to overflow, in which case
868 			 * the substring must run to end of string.
869 			 */
870 			L1 = -1;
871 		}
872 		else
873 		{
874 			/*
875 			 * A zero or negative value for the end position can happen if the
876 			 * start was negative or one. SQL99 says to return a zero-length
877 			 * string.
878 			 */
879 			if (E < 1)
880 				return cstring_to_text("");
881 
882 			L1 = E - S1;
883 		}
884 
885 		/*
886 		 * If the start position is past the end of the string, SQL99 says to
887 		 * return a zero-length string -- DatumGetTextPSlice() will do that
888 		 * for us.  We need only convert S1 to zero-based starting position.
889 		 */
890 		return DatumGetTextPSlice(str, S1 - 1, L1);
891 	}
892 	else if (eml > 1)
893 	{
894 		/*
895 		 * When encoding max length is > 1, we can't get LC without
896 		 * detoasting, so we'll grab a conservatively large slice now and go
897 		 * back later to do the right thing
898 		 */
899 		int32		slice_start;
900 		int32		slice_size;
901 		int32		slice_strlen;
902 		text	   *slice;
903 		int32		E1;
904 		int32		i;
905 		char	   *p;
906 		char	   *s;
907 		text	   *ret;
908 
909 		/*
910 		 * We need to start at position zero because there is no way to know
911 		 * in advance which byte offset corresponds to the supplied start
912 		 * position.
913 		 */
914 		slice_start = 0;
915 
916 		if (length_not_specified)	/* special case - get length to end of
917 									 * string */
918 			slice_size = L1 = -1;
919 		else if (length < 0)
920 		{
921 			/* SQL99 says to throw an error for E < S, i.e., negative length */
922 			ereport(ERROR,
923 					(errcode(ERRCODE_SUBSTRING_ERROR),
924 					 errmsg("negative substring length not allowed")));
925 			slice_size = L1 = -1;	/* silence stupider compilers */
926 		}
927 		else if (pg_add_s32_overflow(S, length, &E))
928 		{
929 			/*
930 			 * L could be large enough for S + L to overflow, in which case
931 			 * the substring must run to end of string.
932 			 */
933 			slice_size = L1 = -1;
934 		}
935 		else
936 		{
937 			/*
938 			 * A zero or negative value for the end position can happen if the
939 			 * start was negative or one. SQL99 says to return a zero-length
940 			 * string.
941 			 */
942 			if (E < 1)
943 				return cstring_to_text("");
944 
945 			/*
946 			 * if E is past the end of the string, the tuple toaster will
947 			 * truncate the length for us
948 			 */
949 			L1 = E - S1;
950 
951 			/*
952 			 * Total slice size in bytes can't be any longer than the start
953 			 * position plus substring length times the encoding max length.
954 			 * If that overflows, we can just use -1.
955 			 */
956 			if (pg_mul_s32_overflow(E, eml, &slice_size))
957 				slice_size = -1;
958 		}
959 
960 		/*
961 		 * If we're working with an untoasted source, no need to do an extra
962 		 * copying step.
963 		 */
964 		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
965 			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
966 			slice = DatumGetTextPSlice(str, slice_start, slice_size);
967 		else
968 			slice = (text *) DatumGetPointer(str);
969 
970 		/* see if we got back an empty string */
971 		if (VARSIZE_ANY_EXHDR(slice) == 0)
972 		{
973 			if (slice != (text *) DatumGetPointer(str))
974 				pfree(slice);
975 			return cstring_to_text("");
976 		}
977 
978 		/* Now we can get the actual length of the slice in MB characters */
979 		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
980 											VARSIZE_ANY_EXHDR(slice));
981 
982 		/*
983 		 * Check that the start position wasn't > slice_strlen. If so, SQL99
984 		 * says to return a zero-length string.
985 		 */
986 		if (S1 > slice_strlen)
987 		{
988 			if (slice != (text *) DatumGetPointer(str))
989 				pfree(slice);
990 			return cstring_to_text("");
991 		}
992 
993 		/*
994 		 * Adjust L1 and E1 now that we know the slice string length. Again
995 		 * remember that S1 is one based, and slice_start is zero based.
996 		 */
997 		if (L1 > -1)
998 			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
999 		else
1000 			E1 = slice_start + 1 + slice_strlen;
1001 
1002 		/*
1003 		 * Find the start position in the slice; remember S1 is not zero based
1004 		 */
1005 		p = VARDATA_ANY(slice);
1006 		for (i = 0; i < S1 - 1; i++)
1007 			p += pg_mblen(p);
1008 
1009 		/* hang onto a pointer to our start position */
1010 		s = p;
1011 
1012 		/*
1013 		 * Count the actual bytes used by the substring of the requested
1014 		 * length.
1015 		 */
1016 		for (i = S1; i < E1; i++)
1017 			p += pg_mblen(p);
1018 
1019 		ret = (text *) palloc(VARHDRSZ + (p - s));
1020 		SET_VARSIZE(ret, VARHDRSZ + (p - s));
1021 		memcpy(VARDATA(ret), s, (p - s));
1022 
1023 		if (slice != (text *) DatumGetPointer(str))
1024 			pfree(slice);
1025 
1026 		return ret;
1027 	}
1028 	else
1029 		elog(ERROR, "invalid backend encoding: encoding max length < 1");
1030 
1031 	/* not reached: suppress compiler warning */
1032 	return NULL;
1033 }
1034 
1035 /*
1036  * textoverlay
1037  *	Replace specified substring of first string with second
1038  *
1039  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1040  * This code is a direct implementation of what the standard says.
1041  */
1042 Datum
textoverlay(PG_FUNCTION_ARGS)1043 textoverlay(PG_FUNCTION_ARGS)
1044 {
1045 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1046 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1047 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1048 	int			sl = PG_GETARG_INT32(3);	/* substring length */
1049 
1050 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1051 }
1052 
1053 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1054 textoverlay_no_len(PG_FUNCTION_ARGS)
1055 {
1056 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1057 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1058 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1059 	int			sl;
1060 
1061 	sl = text_length(PointerGetDatum(t2));	/* defaults to length(t2) */
1062 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1063 }
1064 
1065 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1066 text_overlay(text *t1, text *t2, int sp, int sl)
1067 {
1068 	text	   *result;
1069 	text	   *s1;
1070 	text	   *s2;
1071 	int			sp_pl_sl;
1072 
1073 	/*
1074 	 * Check for possible integer-overflow cases.  For negative sp, throw a
1075 	 * "substring length" error because that's what should be expected
1076 	 * according to the spec's definition of OVERLAY().
1077 	 */
1078 	if (sp <= 0)
1079 		ereport(ERROR,
1080 				(errcode(ERRCODE_SUBSTRING_ERROR),
1081 				 errmsg("negative substring length not allowed")));
1082 	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1083 		ereport(ERROR,
1084 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1085 				 errmsg("integer out of range")));
1086 
1087 	s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1088 	s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1089 	result = text_catenate(s1, t2);
1090 	result = text_catenate(result, s2);
1091 
1092 	return result;
1093 }
1094 
1095 /*
1096  * textpos -
1097  *	  Return the position of the specified substring.
1098  *	  Implements the SQL POSITION() function.
1099  *	  Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1100  * - thomas 1997-07-27
1101  */
1102 Datum
textpos(PG_FUNCTION_ARGS)1103 textpos(PG_FUNCTION_ARGS)
1104 {
1105 	text	   *str = PG_GETARG_TEXT_PP(0);
1106 	text	   *search_str = PG_GETARG_TEXT_PP(1);
1107 
1108 	PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1109 }
1110 
1111 /*
1112  * text_position -
1113  *	Does the real work for textpos()
1114  *
1115  * Inputs:
1116  *		t1 - string to be searched
1117  *		t2 - pattern to match within t1
1118  * Result:
1119  *		Character index of the first matched char, starting from 1,
1120  *		or 0 if no match.
1121  *
1122  *	This is broken out so it can be called directly by other string processing
1123  *	functions.
1124  */
1125 static int
text_position(text * t1,text * t2,Oid collid)1126 text_position(text *t1, text *t2, Oid collid)
1127 {
1128 	TextPositionState state;
1129 	int			result;
1130 
1131 	/* Empty needle always matches at position 1 */
1132 	if (VARSIZE_ANY_EXHDR(t2) < 1)
1133 		return 1;
1134 
1135 	/* Otherwise, can't match if haystack is shorter than needle */
1136 	if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1137 		return 0;
1138 
1139 	text_position_setup(t1, t2, collid, &state);
1140 	if (!text_position_next(&state))
1141 		result = 0;
1142 	else
1143 		result = text_position_get_match_pos(&state);
1144 	text_position_cleanup(&state);
1145 	return result;
1146 }
1147 
1148 
1149 /*
1150  * text_position_setup, text_position_next, text_position_cleanup -
1151  *	Component steps of text_position()
1152  *
1153  * These are broken out so that a string can be efficiently searched for
1154  * multiple occurrences of the same pattern.  text_position_next may be
1155  * called multiple times, and it advances to the next match on each call.
1156  * text_position_get_match_ptr() and text_position_get_match_pos() return
1157  * a pointer or 1-based character position of the last match, respectively.
1158  *
1159  * The "state" variable is normally just a local variable in the caller.
1160  *
1161  * NOTE: text_position_next skips over the matched portion.  For example,
1162  * searching for "xx" in "xxx" returns only one match, not two.
1163  */
1164 
1165 static void
text_position_setup(text * t1,text * t2,Oid collid,TextPositionState * state)1166 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1167 {
1168 	int			len1 = VARSIZE_ANY_EXHDR(t1);
1169 	int			len2 = VARSIZE_ANY_EXHDR(t2);
1170 	pg_locale_t mylocale = 0;
1171 
1172 	check_collation_set(collid);
1173 
1174 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1175 		mylocale = pg_newlocale_from_collation(collid);
1176 
1177 	if (mylocale && !mylocale->deterministic)
1178 		ereport(ERROR,
1179 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1180 				 errmsg("nondeterministic collations are not supported for substring searches")));
1181 
1182 	Assert(len1 > 0);
1183 	Assert(len2 > 0);
1184 
1185 	/*
1186 	 * Even with a multi-byte encoding, we perform the search using the raw
1187 	 * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1188 	 * because in UTF-8 the byte sequence of one character cannot contain
1189 	 * another character.  For other multi-byte encodings, we do the search
1190 	 * initially as a simple byte search, ignoring multibyte issues, but
1191 	 * verify afterwards that the match we found is at a character boundary,
1192 	 * and continue the search if it was a false match.
1193 	 */
1194 	if (pg_database_encoding_max_length() == 1)
1195 	{
1196 		state->is_multibyte = false;
1197 		state->is_multibyte_char_in_char = false;
1198 	}
1199 	else if (GetDatabaseEncoding() == PG_UTF8)
1200 	{
1201 		state->is_multibyte = true;
1202 		state->is_multibyte_char_in_char = false;
1203 	}
1204 	else
1205 	{
1206 		state->is_multibyte = true;
1207 		state->is_multibyte_char_in_char = true;
1208 	}
1209 
1210 	state->str1 = VARDATA_ANY(t1);
1211 	state->str2 = VARDATA_ANY(t2);
1212 	state->len1 = len1;
1213 	state->len2 = len2;
1214 	state->last_match = NULL;
1215 	state->refpoint = state->str1;
1216 	state->refpos = 0;
1217 
1218 	/*
1219 	 * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1220 	 * notes we use the terminology that the "haystack" is the string to be
1221 	 * searched (t1) and the "needle" is the pattern being sought (t2).
1222 	 *
1223 	 * If the needle is empty or bigger than the haystack then there is no
1224 	 * point in wasting cycles initializing the table.  We also choose not to
1225 	 * use B-M-H for needles of length 1, since the skip table can't possibly
1226 	 * save anything in that case.
1227 	 */
1228 	if (len1 >= len2 && len2 > 1)
1229 	{
1230 		int			searchlength = len1 - len2;
1231 		int			skiptablemask;
1232 		int			last;
1233 		int			i;
1234 		const char *str2 = state->str2;
1235 
1236 		/*
1237 		 * First we must determine how much of the skip table to use.  The
1238 		 * declaration of TextPositionState allows up to 256 elements, but for
1239 		 * short search problems we don't really want to have to initialize so
1240 		 * many elements --- it would take too long in comparison to the
1241 		 * actual search time.  So we choose a useful skip table size based on
1242 		 * the haystack length minus the needle length.  The closer the needle
1243 		 * length is to the haystack length the less useful skipping becomes.
1244 		 *
1245 		 * Note: since we use bit-masking to select table elements, the skip
1246 		 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1247 		 */
1248 		if (searchlength < 16)
1249 			skiptablemask = 3;
1250 		else if (searchlength < 64)
1251 			skiptablemask = 7;
1252 		else if (searchlength < 128)
1253 			skiptablemask = 15;
1254 		else if (searchlength < 512)
1255 			skiptablemask = 31;
1256 		else if (searchlength < 2048)
1257 			skiptablemask = 63;
1258 		else if (searchlength < 4096)
1259 			skiptablemask = 127;
1260 		else
1261 			skiptablemask = 255;
1262 		state->skiptablemask = skiptablemask;
1263 
1264 		/*
1265 		 * Initialize the skip table.  We set all elements to the needle
1266 		 * length, since this is the correct skip distance for any character
1267 		 * not found in the needle.
1268 		 */
1269 		for (i = 0; i <= skiptablemask; i++)
1270 			state->skiptable[i] = len2;
1271 
1272 		/*
1273 		 * Now examine the needle.  For each character except the last one,
1274 		 * set the corresponding table element to the appropriate skip
1275 		 * distance.  Note that when two characters share the same skip table
1276 		 * entry, the one later in the needle must determine the skip
1277 		 * distance.
1278 		 */
1279 		last = len2 - 1;
1280 
1281 		for (i = 0; i < last; i++)
1282 			state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1283 	}
1284 }
1285 
1286 /*
1287  * Advance to the next match, starting from the end of the previous match
1288  * (or the beginning of the string, on first call).  Returns true if a match
1289  * is found.
1290  *
1291  * Note that this refuses to match an empty-string needle.  Most callers
1292  * will have handled that case specially and we'll never see it here.
1293  */
1294 static bool
text_position_next(TextPositionState * state)1295 text_position_next(TextPositionState *state)
1296 {
1297 	int			needle_len = state->len2;
1298 	char	   *start_ptr;
1299 	char	   *matchptr;
1300 
1301 	if (needle_len <= 0)
1302 		return false;			/* result for empty pattern */
1303 
1304 	/* Start from the point right after the previous match. */
1305 	if (state->last_match)
1306 		start_ptr = state->last_match + needle_len;
1307 	else
1308 		start_ptr = state->str1;
1309 
1310 retry:
1311 	matchptr = text_position_next_internal(start_ptr, state);
1312 
1313 	if (!matchptr)
1314 		return false;
1315 
1316 	/*
1317 	 * Found a match for the byte sequence.  If this is a multibyte encoding,
1318 	 * where one character's byte sequence can appear inside a longer
1319 	 * multi-byte character, we need to verify that the match was at a
1320 	 * character boundary, not in the middle of a multi-byte character.
1321 	 */
1322 	if (state->is_multibyte_char_in_char)
1323 	{
1324 		/* Walk one character at a time, until we reach the match. */
1325 
1326 		/* the search should never move backwards. */
1327 		Assert(state->refpoint <= matchptr);
1328 
1329 		while (state->refpoint < matchptr)
1330 		{
1331 			/* step to next character. */
1332 			state->refpoint += pg_mblen(state->refpoint);
1333 			state->refpos++;
1334 
1335 			/*
1336 			 * If we stepped over the match's start position, then it was a
1337 			 * false positive, where the byte sequence appeared in the middle
1338 			 * of a multi-byte character.  Skip it, and continue the search at
1339 			 * the next character boundary.
1340 			 */
1341 			if (state->refpoint > matchptr)
1342 			{
1343 				start_ptr = state->refpoint;
1344 				goto retry;
1345 			}
1346 		}
1347 	}
1348 
1349 	state->last_match = matchptr;
1350 	return true;
1351 }
1352 
1353 /*
1354  * Subroutine of text_position_next().  This searches for the raw byte
1355  * sequence, ignoring any multi-byte encoding issues.  Returns the first
1356  * match starting at 'start_ptr', or NULL if no match is found.
1357  */
1358 static char *
text_position_next_internal(char * start_ptr,TextPositionState * state)1359 text_position_next_internal(char *start_ptr, TextPositionState *state)
1360 {
1361 	int			haystack_len = state->len1;
1362 	int			needle_len = state->len2;
1363 	int			skiptablemask = state->skiptablemask;
1364 	const char *haystack = state->str1;
1365 	const char *needle = state->str2;
1366 	const char *haystack_end = &haystack[haystack_len];
1367 	const char *hptr;
1368 
1369 	Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1370 
1371 	if (needle_len == 1)
1372 	{
1373 		/* No point in using B-M-H for a one-character needle */
1374 		char		nchar = *needle;
1375 
1376 		hptr = start_ptr;
1377 		while (hptr < haystack_end)
1378 		{
1379 			if (*hptr == nchar)
1380 				return (char *) hptr;
1381 			hptr++;
1382 		}
1383 	}
1384 	else
1385 	{
1386 		const char *needle_last = &needle[needle_len - 1];
1387 
1388 		/* Start at startpos plus the length of the needle */
1389 		hptr = start_ptr + needle_len - 1;
1390 		while (hptr < haystack_end)
1391 		{
1392 			/* Match the needle scanning *backward* */
1393 			const char *nptr;
1394 			const char *p;
1395 
1396 			nptr = needle_last;
1397 			p = hptr;
1398 			while (*nptr == *p)
1399 			{
1400 				/* Matched it all?	If so, return 1-based position */
1401 				if (nptr == needle)
1402 					return (char *) p;
1403 				nptr--, p--;
1404 			}
1405 
1406 			/*
1407 			 * No match, so use the haystack char at hptr to decide how far to
1408 			 * advance.  If the needle had any occurrence of that character
1409 			 * (or more precisely, one sharing the same skiptable entry)
1410 			 * before its last character, then we advance far enough to align
1411 			 * the last such needle character with that haystack position.
1412 			 * Otherwise we can advance by the whole needle length.
1413 			 */
1414 			hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1415 		}
1416 	}
1417 
1418 	return 0;					/* not found */
1419 }
1420 
1421 /*
1422  * Return a pointer to the current match.
1423  *
1424  * The returned pointer points into correct position in the original
1425  * the haystack string.
1426  */
1427 static char *
text_position_get_match_ptr(TextPositionState * state)1428 text_position_get_match_ptr(TextPositionState *state)
1429 {
1430 	return state->last_match;
1431 }
1432 
1433 /*
1434  * Return the offset of the current match.
1435  *
1436  * The offset is in characters, 1-based.
1437  */
1438 static int
text_position_get_match_pos(TextPositionState * state)1439 text_position_get_match_pos(TextPositionState *state)
1440 {
1441 	if (!state->is_multibyte)
1442 		return state->last_match - state->str1 + 1;
1443 	else
1444 	{
1445 		/* Convert the byte position to char position. */
1446 		while (state->refpoint < state->last_match)
1447 		{
1448 			state->refpoint += pg_mblen(state->refpoint);
1449 			state->refpos++;
1450 		}
1451 		Assert(state->refpoint == state->last_match);
1452 		return state->refpos + 1;
1453 	}
1454 }
1455 
1456 static void
text_position_cleanup(TextPositionState * state)1457 text_position_cleanup(TextPositionState *state)
1458 {
1459 	/* no cleanup needed */
1460 }
1461 
1462 static void
check_collation_set(Oid collid)1463 check_collation_set(Oid collid)
1464 {
1465 	if (!OidIsValid(collid))
1466 	{
1467 		/*
1468 		 * This typically means that the parser could not resolve a conflict
1469 		 * of implicit collations, so report it that way.
1470 		 */
1471 		ereport(ERROR,
1472 				(errcode(ERRCODE_INDETERMINATE_COLLATION),
1473 				 errmsg("could not determine which collation to use for string comparison"),
1474 				 errhint("Use the COLLATE clause to set the collation explicitly.")));
1475 	}
1476 }
1477 
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 *	to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 *	arg1, len1 - first string and its length in bytes (need not be
 *				 null-terminated)
 *	arg2, len2 - second string and its length in bytes
 *	collid	   - collation to compare under; an invalid OID raises an error
 *				 via check_collation_set()
 *
 * Note: many functions that depend on this are marked leakproof; therefore,
 * avoid reporting the actual contents of the input when throwing errors.
 * All errors herein should be things that can't happen except on corrupt
 * data, anyway; otherwise we will have trouble with indexing strings that
 * would cause them.
 */
int
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
{
	int			result;

	check_collation_set(collid);

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		/* C collation: bytewise order, the shorter string sorting first */
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		/* stack buffers, used when an input is short enough to fit */
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;
		pg_locale_t mylocale = 0;

		/* mylocale stays 0 for the database default collation */
		if (collid != DEFAULT_COLLATION_OID)
			mylocale = pg_newlocale_from_collation(collid);

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;
			int			a2len;
			int			r;

			/*
			 * Pick a buffer for each converted string: two bytes per input
			 * byte plus two for the terminator, on the stack when that fits
			 * in TEXTBUFLEN and palloc'd otherwise.
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* MultiByteToWideChar() does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* terminate the wide string after the r converted units */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* clear errno so that %m below reports this call's failure */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/*
			 * Break tie if necessary: strings that collate as equal are
			 * ordered bytewise, unless the collation is nondeterministic.
			 */
			if (result == 0 &&
				(!mylocale || mylocale->deterministic))
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			/* release only what was palloc'd above */
			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/*
		 * Make null-terminated copies of both inputs, in the stack buffers
		 * when they fit (including the terminator) and palloc'd otherwise.
		 */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				/* UTF-8 database: ICU takes the original inputs directly */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					/* otherwise convert both inputs to UChar strings first */
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/*
		 * Break tie if necessary: strings that collate as equal are ordered
		 * by strcmp() on the null-terminated copies, unless the collation is
		 * nondeterministic.
		 */
		if (result == 0 &&
			(!mylocale || mylocale->deterministic))
			result = strcmp(a1p, a2p);

		/* release only what was palloc'd above */
		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1702 
1703 /* text_cmp()
1704  * Internal comparison function for text strings.
1705  * Returns -1, 0 or 1
1706  */
1707 static int
text_cmp(text * arg1,text * arg2,Oid collid)1708 text_cmp(text *arg1, text *arg2, Oid collid)
1709 {
1710 	char	   *a1p,
1711 			   *a2p;
1712 	int			len1,
1713 				len2;
1714 
1715 	a1p = VARDATA_ANY(arg1);
1716 	a2p = VARDATA_ANY(arg2);
1717 
1718 	len1 = VARSIZE_ANY_EXHDR(arg1);
1719 	len2 = VARSIZE_ANY_EXHDR(arg2);
1720 
1721 	return varstr_cmp(a1p, len1, a2p, len2, collid);
1722 }
1723 
1724 /*
1725  * Comparison functions for text strings.
1726  *
1727  * Note: btree indexes need these routines not to leak memory; therefore,
1728  * be careful to free working copies of toasted datums.  Most places don't
1729  * need to be so careful.
1730  */
1731 
1732 Datum
texteq(PG_FUNCTION_ARGS)1733 texteq(PG_FUNCTION_ARGS)
1734 {
1735 	Oid			collid = PG_GET_COLLATION();
1736 	bool		result;
1737 
1738 	check_collation_set(collid);
1739 
1740 	if (lc_collate_is_c(collid) ||
1741 		collid == DEFAULT_COLLATION_OID ||
1742 		pg_newlocale_from_collation(collid)->deterministic)
1743 	{
1744 		Datum		arg1 = PG_GETARG_DATUM(0);
1745 		Datum		arg2 = PG_GETARG_DATUM(1);
1746 		Size		len1,
1747 					len2;
1748 
1749 		/*
1750 		 * Since we only care about equality or not-equality, we can avoid all
1751 		 * the expense of strcoll() here, and just do bitwise comparison.  In
1752 		 * fact, we don't even have to do a bitwise comparison if we can show
1753 		 * the lengths of the strings are unequal; which might save us from
1754 		 * having to detoast one or both values.
1755 		 */
1756 		len1 = toast_raw_datum_size(arg1);
1757 		len2 = toast_raw_datum_size(arg2);
1758 		if (len1 != len2)
1759 			result = false;
1760 		else
1761 		{
1762 			text	   *targ1 = DatumGetTextPP(arg1);
1763 			text	   *targ2 = DatumGetTextPP(arg2);
1764 
1765 			result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1766 							 len1 - VARHDRSZ) == 0);
1767 
1768 			PG_FREE_IF_COPY(targ1, 0);
1769 			PG_FREE_IF_COPY(targ2, 1);
1770 		}
1771 	}
1772 	else
1773 	{
1774 		text	   *arg1 = PG_GETARG_TEXT_PP(0);
1775 		text	   *arg2 = PG_GETARG_TEXT_PP(1);
1776 
1777 		result = (text_cmp(arg1, arg2, collid) == 0);
1778 
1779 		PG_FREE_IF_COPY(arg1, 0);
1780 		PG_FREE_IF_COPY(arg2, 1);
1781 	}
1782 
1783 	PG_RETURN_BOOL(result);
1784 }
1785 
1786 Datum
textne(PG_FUNCTION_ARGS)1787 textne(PG_FUNCTION_ARGS)
1788 {
1789 	Oid			collid = PG_GET_COLLATION();
1790 	bool		result;
1791 
1792 	check_collation_set(collid);
1793 
1794 	if (lc_collate_is_c(collid) ||
1795 		collid == DEFAULT_COLLATION_OID ||
1796 		pg_newlocale_from_collation(collid)->deterministic)
1797 	{
1798 		Datum		arg1 = PG_GETARG_DATUM(0);
1799 		Datum		arg2 = PG_GETARG_DATUM(1);
1800 		Size		len1,
1801 					len2;
1802 
1803 		/* See comment in texteq() */
1804 		len1 = toast_raw_datum_size(arg1);
1805 		len2 = toast_raw_datum_size(arg2);
1806 		if (len1 != len2)
1807 			result = true;
1808 		else
1809 		{
1810 			text	   *targ1 = DatumGetTextPP(arg1);
1811 			text	   *targ2 = DatumGetTextPP(arg2);
1812 
1813 			result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1814 							 len1 - VARHDRSZ) != 0);
1815 
1816 			PG_FREE_IF_COPY(targ1, 0);
1817 			PG_FREE_IF_COPY(targ2, 1);
1818 		}
1819 	}
1820 	else
1821 	{
1822 		text	   *arg1 = PG_GETARG_TEXT_PP(0);
1823 		text	   *arg2 = PG_GETARG_TEXT_PP(1);
1824 
1825 		result = (text_cmp(arg1, arg2, collid) != 0);
1826 
1827 		PG_FREE_IF_COPY(arg1, 0);
1828 		PG_FREE_IF_COPY(arg2, 1);
1829 	}
1830 
1831 	PG_RETURN_BOOL(result);
1832 }
1833 
1834 Datum
text_lt(PG_FUNCTION_ARGS)1835 text_lt(PG_FUNCTION_ARGS)
1836 {
1837 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1838 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1839 	bool		result;
1840 
1841 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1842 
1843 	PG_FREE_IF_COPY(arg1, 0);
1844 	PG_FREE_IF_COPY(arg2, 1);
1845 
1846 	PG_RETURN_BOOL(result);
1847 }
1848 
1849 Datum
text_le(PG_FUNCTION_ARGS)1850 text_le(PG_FUNCTION_ARGS)
1851 {
1852 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1853 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1854 	bool		result;
1855 
1856 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1857 
1858 	PG_FREE_IF_COPY(arg1, 0);
1859 	PG_FREE_IF_COPY(arg2, 1);
1860 
1861 	PG_RETURN_BOOL(result);
1862 }
1863 
1864 Datum
text_gt(PG_FUNCTION_ARGS)1865 text_gt(PG_FUNCTION_ARGS)
1866 {
1867 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1868 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1869 	bool		result;
1870 
1871 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1872 
1873 	PG_FREE_IF_COPY(arg1, 0);
1874 	PG_FREE_IF_COPY(arg2, 1);
1875 
1876 	PG_RETURN_BOOL(result);
1877 }
1878 
1879 Datum
text_ge(PG_FUNCTION_ARGS)1880 text_ge(PG_FUNCTION_ARGS)
1881 {
1882 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1883 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1884 	bool		result;
1885 
1886 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1887 
1888 	PG_FREE_IF_COPY(arg1, 0);
1889 	PG_FREE_IF_COPY(arg2, 1);
1890 
1891 	PG_RETURN_BOOL(result);
1892 }
1893 
1894 Datum
text_starts_with(PG_FUNCTION_ARGS)1895 text_starts_with(PG_FUNCTION_ARGS)
1896 {
1897 	Datum		arg1 = PG_GETARG_DATUM(0);
1898 	Datum		arg2 = PG_GETARG_DATUM(1);
1899 	Oid			collid = PG_GET_COLLATION();
1900 	pg_locale_t mylocale = 0;
1901 	bool		result;
1902 	Size		len1,
1903 				len2;
1904 
1905 	check_collation_set(collid);
1906 
1907 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1908 		mylocale = pg_newlocale_from_collation(collid);
1909 
1910 	if (mylocale && !mylocale->deterministic)
1911 		ereport(ERROR,
1912 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1913 				 errmsg("nondeterministic collations are not supported for substring searches")));
1914 
1915 	len1 = toast_raw_datum_size(arg1);
1916 	len2 = toast_raw_datum_size(arg2);
1917 	if (len2 > len1)
1918 		result = false;
1919 	else
1920 	{
1921 		text	   *targ1 = text_substring(arg1, 1, len2, false);
1922 		text	   *targ2 = DatumGetTextPP(arg2);
1923 
1924 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1925 						 VARSIZE_ANY_EXHDR(targ2)) == 0);
1926 
1927 		PG_FREE_IF_COPY(targ1, 0);
1928 		PG_FREE_IF_COPY(targ2, 1);
1929 	}
1930 
1931 	PG_RETURN_BOOL(result);
1932 }
1933 
1934 Datum
bttextcmp(PG_FUNCTION_ARGS)1935 bttextcmp(PG_FUNCTION_ARGS)
1936 {
1937 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1938 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1939 	int32		result;
1940 
1941 	result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1942 
1943 	PG_FREE_IF_COPY(arg1, 0);
1944 	PG_FREE_IF_COPY(arg2, 1);
1945 
1946 	PG_RETURN_INT32(result);
1947 }
1948 
1949 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1950 bttextsortsupport(PG_FUNCTION_ARGS)
1951 {
1952 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1953 	Oid			collid = ssup->ssup_collation;
1954 	MemoryContext oldcontext;
1955 
1956 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1957 
1958 	/* Use generic string SortSupport */
1959 	varstr_sortsupport(ssup, TEXTOID, collid);
1960 
1961 	MemoryContextSwitchTo(oldcontext);
1962 
1963 	PG_RETURN_VOID();
1964 }
1965 
1966 /*
1967  * Generic sortsupport interface for character type's operator classes.
1968  * Includes locale support, and support for BpChar semantics (i.e. removing
1969  * trailing spaces before comparison).
1970  *
1971  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1972  * same representation.  Callers that always use the C collation (e.g.
1973  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1974  * this will not work with any other collation, though.
1975  */
1976 void
varstr_sortsupport(SortSupport ssup,Oid typid,Oid collid)1977 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1978 {
1979 	bool		abbreviate = ssup->abbreviate;
1980 	bool		collate_c = false;
1981 	VarStringSortSupport *sss;
1982 	pg_locale_t locale = 0;
1983 
1984 	check_collation_set(collid);
1985 
1986 	/*
1987 	 * If possible, set ssup->comparator to a function which can be used to
1988 	 * directly compare two datums.  If we can do this, we'll avoid the
1989 	 * overhead of a trip through the fmgr layer for every comparison, which
1990 	 * can be substantial.
1991 	 *
1992 	 * Most typically, we'll set the comparator to varlenafastcmp_locale,
1993 	 * which uses strcoll() to perform comparisons.  We use that for the
1994 	 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1995 	 * LC_COLLATE = C, we can make things quite a bit faster with
1996 	 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1997 	 * memcmp() rather than strcoll().
1998 	 */
1999 	if (lc_collate_is_c(collid))
2000 	{
2001 		if (typid == BPCHAROID)
2002 			ssup->comparator = bpcharfastcmp_c;
2003 		else if (typid == NAMEOID)
2004 		{
2005 			ssup->comparator = namefastcmp_c;
2006 			/* Not supporting abbreviation with type NAME, for now */
2007 			abbreviate = false;
2008 		}
2009 		else
2010 			ssup->comparator = varstrfastcmp_c;
2011 
2012 		collate_c = true;
2013 	}
2014 	else
2015 	{
2016 		/*
2017 		 * We need a collation-sensitive comparison.  To make things faster,
2018 		 * we'll figure out the collation based on the locale id and cache the
2019 		 * result.
2020 		 */
2021 		if (collid != DEFAULT_COLLATION_OID)
2022 			locale = pg_newlocale_from_collation(collid);
2023 
2024 		/*
2025 		 * There is a further exception on Windows.  When the database
2026 		 * encoding is UTF-8 and we are not using the C collation, complex
2027 		 * hacks are required.  We don't currently have a comparator that
2028 		 * handles that case, so we fall back on the slow method of having the
2029 		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
2030 		 * trampoline.  ICU locales work just the same on Windows, however.
2031 		 */
2032 #ifdef WIN32
2033 		if (GetDatabaseEncoding() == PG_UTF8 &&
2034 			!(locale && locale->provider == COLLPROVIDER_ICU))
2035 			return;
2036 #endif
2037 
2038 		/*
2039 		 * We use varlenafastcmp_locale except for type NAME.
2040 		 */
2041 		if (typid == NAMEOID)
2042 		{
2043 			ssup->comparator = namefastcmp_locale;
2044 			/* Not supporting abbreviation with type NAME, for now */
2045 			abbreviate = false;
2046 		}
2047 		else
2048 			ssup->comparator = varlenafastcmp_locale;
2049 	}
2050 
2051 	/*
2052 	 * Unfortunately, it seems that abbreviation for non-C collations is
2053 	 * broken on many common platforms; testing of multiple versions of glibc
2054 	 * reveals that, for many locales, strcoll() and strxfrm() do not return
2055 	 * consistent results, which is fatal to this optimization.  While no
2056 	 * other libc other than Cygwin has so far been shown to have a problem,
2057 	 * we take the conservative course of action for right now and disable
2058 	 * this categorically.  (Users who are certain this isn't a problem on
2059 	 * their system can define TRUST_STRXFRM.)
2060 	 *
2061 	 * Even apart from the risk of broken locales, it's possible that there
2062 	 * are platforms where the use of abbreviated keys should be disabled at
2063 	 * compile time.  Having only 4 byte datums could make worst-case
2064 	 * performance drastically more likely, for example.  Moreover, macOS's
2065 	 * strxfrm() implementation is known to not effectively concentrate a
2066 	 * significant amount of entropy from the original string in earlier
2067 	 * transformed blobs.  It's possible that other supported platforms are
2068 	 * similarly encumbered.  So, if we ever get past disabling this
2069 	 * categorically, we may still want or need to disable it for particular
2070 	 * platforms.
2071 	 */
2072 #ifndef TRUST_STRXFRM
2073 	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2074 		abbreviate = false;
2075 #endif
2076 
2077 	/*
2078 	 * If we're using abbreviated keys, or if we're using a locale-aware
2079 	 * comparison, we need to initialize a StringSortSupport object.  Both
2080 	 * cases will make use of the temporary buffers we initialize here for
2081 	 * scratch space (and to detect requirement for BpChar semantics from
2082 	 * caller), and the abbreviation case requires additional state.
2083 	 */
2084 	if (abbreviate || !collate_c)
2085 	{
2086 		sss = palloc(sizeof(VarStringSortSupport));
2087 		sss->buf1 = palloc(TEXTBUFLEN);
2088 		sss->buflen1 = TEXTBUFLEN;
2089 		sss->buf2 = palloc(TEXTBUFLEN);
2090 		sss->buflen2 = TEXTBUFLEN;
2091 		/* Start with invalid values */
2092 		sss->last_len1 = -1;
2093 		sss->last_len2 = -1;
2094 		/* Initialize */
2095 		sss->last_returned = 0;
2096 		sss->locale = locale;
2097 
2098 		/*
2099 		 * To avoid somehow confusing a strxfrm() blob and an original string,
2100 		 * constantly keep track of the variety of data that buf1 and buf2
2101 		 * currently contain.
2102 		 *
2103 		 * Comparisons may be interleaved with conversion calls.  Frequently,
2104 		 * conversions and comparisons are batched into two distinct phases,
2105 		 * but the correctness of caching cannot hinge upon this.  For
2106 		 * comparison caching, buffer state is only trusted if cache_blob is
2107 		 * found set to false, whereas strxfrm() caching only trusts the state
2108 		 * when cache_blob is found set to true.
2109 		 *
2110 		 * Arbitrarily initialize cache_blob to true.
2111 		 */
2112 		sss->cache_blob = true;
2113 		sss->collate_c = collate_c;
2114 		sss->typid = typid;
2115 		ssup->ssup_extra = sss;
2116 
2117 		/*
2118 		 * If possible, plan to use the abbreviated keys optimization.  The
2119 		 * core code may switch back to authoritative comparator should
2120 		 * abbreviation be aborted.
2121 		 */
2122 		if (abbreviate)
2123 		{
2124 			sss->prop_card = 0.20;
2125 			initHyperLogLog(&sss->abbr_card, 10);
2126 			initHyperLogLog(&sss->full_card, 10);
2127 			ssup->abbrev_full_comparator = ssup->comparator;
2128 			ssup->comparator = varstrcmp_abbrev;
2129 			ssup->abbrev_converter = varstr_abbrev_convert;
2130 			ssup->abbrev_abort = varstr_abbrev_abort;
2131 		}
2132 	}
2133 }
2134 
2135 /*
2136  * sortsupport comparison func (for C locale case)
2137  */
2138 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)2139 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2140 {
2141 	VarString  *arg1 = DatumGetVarStringPP(x);
2142 	VarString  *arg2 = DatumGetVarStringPP(y);
2143 	char	   *a1p,
2144 			   *a2p;
2145 	int			len1,
2146 				len2,
2147 				result;
2148 
2149 	a1p = VARDATA_ANY(arg1);
2150 	a2p = VARDATA_ANY(arg2);
2151 
2152 	len1 = VARSIZE_ANY_EXHDR(arg1);
2153 	len2 = VARSIZE_ANY_EXHDR(arg2);
2154 
2155 	result = memcmp(a1p, a2p, Min(len1, len2));
2156 	if ((result == 0) && (len1 != len2))
2157 		result = (len1 < len2) ? -1 : 1;
2158 
2159 	/* We can't afford to leak memory here. */
2160 	if (PointerGetDatum(arg1) != x)
2161 		pfree(arg1);
2162 	if (PointerGetDatum(arg2) != y)
2163 		pfree(arg2);
2164 
2165 	return result;
2166 }
2167 
2168 /*
2169  * sortsupport comparison func (for BpChar C locale case)
2170  *
2171  * BpChar outsources its sortsupport to this module.  Specialization for the
2172  * varstr_sortsupport BpChar case, modeled on
2173  * internal_bpchar_pattern_compare().
2174  */
2175 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)2176 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2177 {
2178 	BpChar	   *arg1 = DatumGetBpCharPP(x);
2179 	BpChar	   *arg2 = DatumGetBpCharPP(y);
2180 	char	   *a1p,
2181 			   *a2p;
2182 	int			len1,
2183 				len2,
2184 				result;
2185 
2186 	a1p = VARDATA_ANY(arg1);
2187 	a2p = VARDATA_ANY(arg2);
2188 
2189 	len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2190 	len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2191 
2192 	result = memcmp(a1p, a2p, Min(len1, len2));
2193 	if ((result == 0) && (len1 != len2))
2194 		result = (len1 < len2) ? -1 : 1;
2195 
2196 	/* We can't afford to leak memory here. */
2197 	if (PointerGetDatum(arg1) != x)
2198 		pfree(arg1);
2199 	if (PointerGetDatum(arg2) != y)
2200 		pfree(arg2);
2201 
2202 	return result;
2203 }
2204 
2205 /*
2206  * sortsupport comparison func (for NAME C locale case)
2207  */
2208 static int
namefastcmp_c(Datum x,Datum y,SortSupport ssup)2209 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2210 {
2211 	Name		arg1 = DatumGetName(x);
2212 	Name		arg2 = DatumGetName(y);
2213 
2214 	return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2215 }
2216 
2217 /*
2218  * sortsupport comparison func (for locale case with all varlena types)
2219  */
2220 static int
varlenafastcmp_locale(Datum x,Datum y,SortSupport ssup)2221 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2222 {
2223 	VarString  *arg1 = DatumGetVarStringPP(x);
2224 	VarString  *arg2 = DatumGetVarStringPP(y);
2225 	char	   *a1p,
2226 			   *a2p;
2227 	int			len1,
2228 				len2,
2229 				result;
2230 
2231 	a1p = VARDATA_ANY(arg1);
2232 	a2p = VARDATA_ANY(arg2);
2233 
2234 	len1 = VARSIZE_ANY_EXHDR(arg1);
2235 	len2 = VARSIZE_ANY_EXHDR(arg2);
2236 
2237 	result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2238 
2239 	/* We can't afford to leak memory here. */
2240 	if (PointerGetDatum(arg1) != x)
2241 		pfree(arg1);
2242 	if (PointerGetDatum(arg2) != y)
2243 		pfree(arg2);
2244 
2245 	return result;
2246 }
2247 
2248 /*
2249  * sortsupport comparison func (for locale case with NAME type)
2250  */
2251 static int
namefastcmp_locale(Datum x,Datum y,SortSupport ssup)2252 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2253 {
2254 	Name		arg1 = DatumGetName(x);
2255 	Name		arg2 = DatumGetName(y);
2256 
2257 	return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2258 								NameStr(*arg2), strlen(NameStr(*arg2)),
2259 								ssup);
2260 }
2261 
/*
 * sortsupport comparison func for locale cases
 *
 * Shared worker behind varlenafastcmp_locale() and namefastcmp_locale().
 *
 * a1p/len1 and a2p/len2 describe the two strings as data pointer plus length
 * in bytes.  The inputs are not required to be NUL-terminated: NUL-terminated
 * copies are maintained in the buf1/buf2 scratch buffers of the
 * VarStringSortSupport state hanging off ssup->ssup_extra, and it is those
 * copies that are handed to strcoll()/strcoll_l().
 *
 * Returns the comparison result, with 0 meaning "equal".
 *
 * Side effects on the sort support state: buf1/buf2 may be replaced by larger
 * buffers (allocated in ssup->ssup_cxt), and last_len1, last_len2,
 * last_returned and cache_blob are updated so that an immediately repeated
 * comparison of the same pair of strings can be answered without another
 * collation call.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	int			result;
	bool		arg1_match;

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		return 0;
	}

	if (sss->typid == BPCHAROID)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each scratch buffer can hold its string plus a terminating
	 * NUL.  The old contents are discarded rather than copied; that is safe
	 * for the caching below because a buffer is only replaced when the new
	 * length is at least the old buffer size, so the new length cannot equal
	 * the length last stored in that buffer.
	 */
	if (len1 >= sss->buflen1)
	{
		pfree(sss->buf1);
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		pfree(sss->buf2);
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/* Use result cached following last actual strcoll() call */
		return sss->last_returned;
	}

	/*
	 * No usable cached answer: perform a real collation-aware comparison.
	 * With a locale object the provider decides which API is used; without
	 * one, plain strcoll() is called.
	 */
	if (sss->locale)
	{
		if (sss->locale->provider == COLLPROVIDER_ICU)
		{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
			if (GetDatabaseEncoding() == PG_UTF8)
			{
				UErrorCode	status;

				/*
				 * This call takes explicit lengths, so the caller's strings
				 * are passed directly rather than the NUL-terminated copies.
				 */
				status = U_ZERO_ERROR;
				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
										  a1p, len1,
										  a2p, len2,
										  &status);
				if (U_FAILURE(status))
					ereport(ERROR,
							(errmsg("collation failed: %s", u_errorName(status))));
			}
			else
#endif
			{
				int32_t		ulen1,
							ulen2;
				UChar	   *uchar1,
						   *uchar2;

				/* Convert both inputs to UChar strings before collating */
				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
				ulen2 = icu_to_uchar(&uchar2, a2p, len2);

				result = ucol_strcoll(sss->locale->info.icu.ucol,
									  uchar1, ulen1,
									  uchar2, ulen2);

				pfree(uchar1);
				pfree(uchar2);
			}
#else							/* not USE_ICU */
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif							/* not USE_ICU */
		}
		else
		{
#ifdef HAVE_LOCALE_T
			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
		}
	}
	else
		result = strcoll(sss->buf1, sss->buf2);

	/*
	 * Break tie if necessary.  The strcmp() tie-breaker is applied when
	 * there is no locale object or the locale is marked deterministic.
	 */
	if (result == 0 &&
		(!sss->locale || sss->locale->deterministic))
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
	return result;
}
2413 
2414 /*
2415  * Abbreviated key comparison func
2416  */
2417 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2418 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2419 {
2420 	/*
2421 	 * When 0 is returned, the core system will call varstrfastcmp_c()
2422 	 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale().  Even a
2423 	 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2424 	 * authoritatively, for the same reason that there is a strcoll()
2425 	 * tie-breaker call to strcmp() in varstr_cmp().
2426 	 */
2427 	if (x > y)
2428 		return 1;
2429 	else if (x == y)
2430 		return 0;
2431 	else
2432 		return -1;
2433 }
2434 
2435 /*
2436  * Conversion routine for sortsupport.  Converts original to abbreviated key
2437  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2438  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2439  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2440  * locale is used, or in case of bytea, just memcpy() from original instead.
2441  */
2442 static Datum
varstr_abbrev_convert(Datum original,SortSupport ssup)2443 varstr_abbrev_convert(Datum original, SortSupport ssup)
2444 {
2445 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2446 	VarString  *authoritative = DatumGetVarStringPP(original);
2447 	char	   *authoritative_data = VARDATA_ANY(authoritative);
2448 
2449 	/* working state */
2450 	Datum		res;
2451 	char	   *pres;
2452 	int			len;
2453 	uint32		hash;
2454 
2455 	pres = (char *) &res;
2456 	/* memset(), so any non-overwritten bytes are NUL */
2457 	memset(pres, 0, sizeof(Datum));
2458 	len = VARSIZE_ANY_EXHDR(authoritative);
2459 
2460 	/* Get number of bytes, ignoring trailing spaces */
2461 	if (sss->typid == BPCHAROID)
2462 		len = bpchartruelen(authoritative_data, len);
2463 
2464 	/*
2465 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2466 	 * abbreviate keys.  The full comparator for the C locale is always
2467 	 * memcmp().  It would be incorrect to allow bytea callers (callers that
2468 	 * always force the C collation -- bytea isn't a collatable type, but this
2469 	 * approach is convenient) to use strxfrm().  This is because bytea
2470 	 * strings may contain NUL bytes.  Besides, this should be faster, too.
2471 	 *
2472 	 * More generally, it's okay that bytea callers can have NUL bytes in
2473 	 * strings because varstrcmp_abbrev() need not make a distinction between
2474 	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2475 	 * authoritative representation.  Hopefully a comparison at or past one
2476 	 * abbreviated key's terminating NUL byte will resolve the comparison
2477 	 * without consulting the authoritative representation; specifically, some
2478 	 * later non-NUL byte in the longer string can resolve the comparison
2479 	 * against a subsequent terminating NUL in the shorter string.  There will
2480 	 * usually be what is effectively a "length-wise" resolution there and
2481 	 * then.
2482 	 *
2483 	 * If that doesn't work out -- if all bytes in the longer string
2484 	 * positioned at or past the offset of the smaller string's (first)
2485 	 * terminating NUL are actually representative of NUL bytes in the
2486 	 * authoritative binary string (perhaps with some *terminating* NUL bytes
2487 	 * towards the end of the longer string iff it happens to still be small)
2488 	 * -- then an authoritative tie-breaker will happen, and do the right
2489 	 * thing: explicitly consider string length.
2490 	 */
2491 	if (sss->collate_c)
2492 		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2493 	else
2494 	{
2495 		Size		bsize;
2496 #ifdef USE_ICU
2497 		int32_t		ulen = -1;
2498 		UChar	   *uchar = NULL;
2499 #endif
2500 
2501 		/*
2502 		 * We're not using the C collation, so fall back on strxfrm or ICU
2503 		 * analogs.
2504 		 */
2505 
2506 		/* By convention, we use buffer 1 to store and NUL-terminate */
2507 		if (len >= sss->buflen1)
2508 		{
2509 			pfree(sss->buf1);
2510 			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2511 			sss->buf1 = palloc(sss->buflen1);
2512 		}
2513 
2514 		/* Might be able to reuse strxfrm() blob from last call */
2515 		if (sss->last_len1 == len && sss->cache_blob &&
2516 			memcmp(sss->buf1, authoritative_data, len) == 0)
2517 		{
2518 			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2519 			/* No change affecting cardinality, so no hashing required */
2520 			goto done;
2521 		}
2522 
2523 		memcpy(sss->buf1, authoritative_data, len);
2524 
2525 		/*
2526 		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2527 		 * necessary for ICU, but doesn't hurt.
2528 		 */
2529 		sss->buf1[len] = '\0';
2530 		sss->last_len1 = len;
2531 
2532 #ifdef USE_ICU
2533 		/* When using ICU and not UTF8, convert string to UChar. */
2534 		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2535 			GetDatabaseEncoding() != PG_UTF8)
2536 			ulen = icu_to_uchar(&uchar, sss->buf1, len);
2537 #endif
2538 
2539 		/*
2540 		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2541 		 * and try again.  Both of these functions have the result buffer
2542 		 * content undefined if the result did not fit, so we need to retry
2543 		 * until everything fits, even though we only need the first few bytes
2544 		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
2545 		 * ask for as many bytes as we actually need.
2546 		 */
2547 		for (;;)
2548 		{
2549 #ifdef USE_ICU
2550 			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2551 			{
2552 				/*
2553 				 * When using UTF8, use the iteration interface so we only
2554 				 * need to produce as many bytes as we actually need.
2555 				 */
2556 				if (GetDatabaseEncoding() == PG_UTF8)
2557 				{
2558 					UCharIterator iter;
2559 					uint32_t	state[2];
2560 					UErrorCode	status;
2561 
2562 					uiter_setUTF8(&iter, sss->buf1, len);
2563 					state[0] = state[1] = 0;	/* won't need that again */
2564 					status = U_ZERO_ERROR;
2565 					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2566 												 &iter,
2567 												 state,
2568 												 (uint8_t *) sss->buf2,
2569 												 Min(sizeof(Datum), sss->buflen2),
2570 												 &status);
2571 					if (U_FAILURE(status))
2572 						ereport(ERROR,
2573 								(errmsg("sort key generation failed: %s",
2574 										u_errorName(status))));
2575 				}
2576 				else
2577 					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2578 											uchar, ulen,
2579 											(uint8_t *) sss->buf2, sss->buflen2);
2580 			}
2581 			else
2582 #endif
2583 #ifdef HAVE_LOCALE_T
2584 			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2585 				bsize = strxfrm_l(sss->buf2, sss->buf1,
2586 								  sss->buflen2, sss->locale->info.lt);
2587 			else
2588 #endif
2589 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2590 
2591 			sss->last_len2 = bsize;
2592 			if (bsize < sss->buflen2)
2593 				break;
2594 
2595 			/*
2596 			 * Grow buffer and retry.
2597 			 */
2598 			pfree(sss->buf2);
2599 			sss->buflen2 = Max(bsize + 1,
2600 							   Min(sss->buflen2 * 2, MaxAllocSize));
2601 			sss->buf2 = palloc(sss->buflen2);
2602 		}
2603 
2604 		/*
2605 		 * Every Datum byte is always compared.  This is safe because the
2606 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
2607 		 * misinterpreting any NUL bytes not intended to be interpreted as
2608 		 * logically representing termination.
2609 		 *
2610 		 * (Actually, even if there were NUL bytes in the blob it would be
2611 		 * okay.  See remarks on bytea case above.)
2612 		 */
2613 		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2614 
2615 #ifdef USE_ICU
2616 		if (uchar)
2617 			pfree(uchar);
2618 #endif
2619 	}
2620 
2621 	/*
2622 	 * Maintain approximate cardinality of both abbreviated keys and original,
2623 	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
2624 	 * the worst case, where we do many string transformations for no saving
2625 	 * in full strcoll()-based comparisons.  These statistics are used by
2626 	 * varstr_abbrev_abort().
2627 	 *
2628 	 * First, Hash key proper, or a significant fraction of it.  Mix in length
2629 	 * in order to compensate for cases where differences are past
2630 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2631 	 */
2632 	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2633 								   Min(len, PG_CACHE_LINE_SIZE)));
2634 
2635 	if (len > PG_CACHE_LINE_SIZE)
2636 		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2637 
2638 	addHyperLogLog(&sss->full_card, hash);
2639 
2640 	/* Hash abbreviated key */
2641 #if SIZEOF_DATUM == 8
2642 	{
2643 		uint32		lohalf,
2644 					hihalf;
2645 
2646 		lohalf = (uint32) res;
2647 		hihalf = (uint32) (res >> 32);
2648 		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2649 	}
2650 #else							/* SIZEOF_DATUM != 8 */
2651 	hash = DatumGetUInt32(hash_uint32((uint32) res));
2652 #endif
2653 
2654 	addHyperLogLog(&sss->abbr_card, hash);
2655 
2656 	/* Cache result, perhaps saving an expensive strxfrm() call next time */
2657 	sss->cache_blob = true;
2658 done:
2659 
2660 	/*
2661 	 * Byteswap on little-endian machines.
2662 	 *
2663 	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2664 	 * comparator) works correctly on all platforms.  If we didn't do this,
2665 	 * the comparator would have to call memcmp() with a pair of pointers to
2666 	 * the first byte of each abbreviated key, which is slower.
2667 	 */
2668 	res = DatumBigEndianToNative(res);
2669 
2670 	/* Don't leak memory here */
2671 	if (PointerGetDatum(authoritative) != original)
2672 		pfree(authoritative);
2673 
2674 	return res;
2675 }
2676 
2677 /*
2678  * Callback for estimating effectiveness of abbreviated key optimization, using
2679  * heuristic rules.  Returns value indicating if the abbreviation optimization
2680  * should be aborted, based on its projected effectiveness.
2681  */
2682 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2683 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2684 {
2685 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2686 	double		abbrev_distinct,
2687 				key_distinct;
2688 
2689 	Assert(ssup->abbreviate);
2690 
2691 	/* Have a little patience */
2692 	if (memtupcount < 100)
2693 		return false;
2694 
2695 	abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2696 	key_distinct = estimateHyperLogLog(&sss->full_card);
2697 
2698 	/*
2699 	 * Clamp cardinality estimates to at least one distinct value.  While
2700 	 * NULLs are generally disregarded, if only NULL values were seen so far,
2701 	 * that might misrepresent costs if we failed to clamp.
2702 	 */
2703 	if (abbrev_distinct <= 1.0)
2704 		abbrev_distinct = 1.0;
2705 
2706 	if (key_distinct <= 1.0)
2707 		key_distinct = 1.0;
2708 
2709 	/*
2710 	 * In the worst case all abbreviated keys are identical, while at the same
2711 	 * time there are differences within full key strings not captured in
2712 	 * abbreviations.
2713 	 */
2714 #ifdef TRACE_SORT
2715 	if (trace_sort)
2716 	{
2717 		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2718 
2719 		elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2720 			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2721 			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2722 			 sss->prop_card);
2723 	}
2724 #endif
2725 
2726 	/*
2727 	 * If the number of distinct abbreviated keys approximately matches the
2728 	 * number of distinct authoritative original keys, that's reason enough to
2729 	 * proceed.  We can win even with a very low cardinality set if most
2730 	 * tie-breakers only memcmp().  This is by far the most important
2731 	 * consideration.
2732 	 *
2733 	 * While comparisons that are resolved at the abbreviated key level are
2734 	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2735 	 * those two outcomes are so much cheaper than a full strcoll() once
2736 	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2737 	 * cardinality against the overall size of the set in order to more
2738 	 * accurately model costs.  Assume that an abbreviated comparison, and an
2739 	 * abbreviated comparison with a cheap memcmp()-based authoritative
2740 	 * resolution are equivalent.
2741 	 */
2742 	if (abbrev_distinct > key_distinct * sss->prop_card)
2743 	{
2744 		/*
2745 		 * When we have exceeded 10,000 tuples, decay required cardinality
2746 		 * aggressively for next call.
2747 		 *
2748 		 * This is useful because the number of comparisons required on
2749 		 * average increases at a linearithmic rate, and at roughly 10,000
2750 		 * tuples that factor will start to dominate over the linear costs of
2751 		 * string transformation (this is a conservative estimate).  The decay
2752 		 * rate is chosen to be a little less aggressive than halving -- which
2753 		 * (since we're called at points at which memtupcount has doubled)
2754 		 * would never see the cost model actually abort past the first call
2755 		 * following a decay.  This decay rate is mostly a precaution against
2756 		 * a sudden, violent swing in how well abbreviated cardinality tracks
2757 		 * full key cardinality.  The decay also serves to prevent a marginal
2758 		 * case from being aborted too late, when too much has already been
2759 		 * invested in string transformation.
2760 		 *
2761 		 * It's possible for sets of several million distinct strings with
2762 		 * mere tens of thousands of distinct abbreviated keys to still
2763 		 * benefit very significantly.  This will generally occur provided
2764 		 * each abbreviated key is a proxy for a roughly uniform number of the
2765 		 * set's full keys. If it isn't so, we hope to catch that early and
2766 		 * abort.  If it isn't caught early, by the time the problem is
2767 		 * apparent it's probably not worth aborting.
2768 		 */
2769 		if (memtupcount > 10000)
2770 			sss->prop_card *= 0.65;
2771 
2772 		return false;
2773 	}
2774 
2775 	/*
2776 	 * Abort abbreviation strategy.
2777 	 *
2778 	 * The worst case, where all abbreviated keys are identical while all
2779 	 * original strings differ will typically only see a regression of about
2780 	 * 10% in execution time for small to medium sized lists of strings.
2781 	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2782 	 * often expect very large improvements, particularly with sets of strings
2783 	 * of moderately high to high abbreviated cardinality.  There is little to
2784 	 * lose but much to gain, which our strategy reflects.
2785 	 */
2786 #ifdef TRACE_SORT
2787 	if (trace_sort)
2788 		elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2789 			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2790 			 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2791 #endif
2792 
2793 	return true;
2794 }
2795 
2796 Datum
text_larger(PG_FUNCTION_ARGS)2797 text_larger(PG_FUNCTION_ARGS)
2798 {
2799 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2800 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2801 	text	   *result;
2802 
2803 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2804 
2805 	PG_RETURN_TEXT_P(result);
2806 }
2807 
2808 Datum
text_smaller(PG_FUNCTION_ARGS)2809 text_smaller(PG_FUNCTION_ARGS)
2810 {
2811 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2812 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2813 	text	   *result;
2814 
2815 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2816 
2817 	PG_RETURN_TEXT_P(result);
2818 }
2819 
2820 
2821 /*
2822  * Cross-type comparison functions for types text and name.
2823  */
2824 
2825 Datum
nameeqtext(PG_FUNCTION_ARGS)2826 nameeqtext(PG_FUNCTION_ARGS)
2827 {
2828 	Name		arg1 = PG_GETARG_NAME(0);
2829 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2830 	size_t		len1 = strlen(NameStr(*arg1));
2831 	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
2832 	Oid			collid = PG_GET_COLLATION();
2833 	bool		result;
2834 
2835 	check_collation_set(collid);
2836 
2837 	if (collid == C_COLLATION_OID)
2838 		result = (len1 == len2 &&
2839 				  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2840 	else
2841 		result = (varstr_cmp(NameStr(*arg1), len1,
2842 							 VARDATA_ANY(arg2), len2,
2843 							 collid) == 0);
2844 
2845 	PG_FREE_IF_COPY(arg2, 1);
2846 
2847 	PG_RETURN_BOOL(result);
2848 }
2849 
2850 Datum
texteqname(PG_FUNCTION_ARGS)2851 texteqname(PG_FUNCTION_ARGS)
2852 {
2853 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2854 	Name		arg2 = PG_GETARG_NAME(1);
2855 	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
2856 	size_t		len2 = strlen(NameStr(*arg2));
2857 	Oid			collid = PG_GET_COLLATION();
2858 	bool		result;
2859 
2860 	check_collation_set(collid);
2861 
2862 	if (collid == C_COLLATION_OID)
2863 		result = (len1 == len2 &&
2864 				  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2865 	else
2866 		result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2867 							 NameStr(*arg2), len2,
2868 							 collid) == 0);
2869 
2870 	PG_FREE_IF_COPY(arg1, 0);
2871 
2872 	PG_RETURN_BOOL(result);
2873 }
2874 
2875 Datum
namenetext(PG_FUNCTION_ARGS)2876 namenetext(PG_FUNCTION_ARGS)
2877 {
2878 	Name		arg1 = PG_GETARG_NAME(0);
2879 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2880 	size_t		len1 = strlen(NameStr(*arg1));
2881 	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
2882 	Oid			collid = PG_GET_COLLATION();
2883 	bool		result;
2884 
2885 	check_collation_set(collid);
2886 
2887 	if (collid == C_COLLATION_OID)
2888 		result = !(len1 == len2 &&
2889 				   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2890 	else
2891 		result = !(varstr_cmp(NameStr(*arg1), len1,
2892 							  VARDATA_ANY(arg2), len2,
2893 							  collid) == 0);
2894 
2895 	PG_FREE_IF_COPY(arg2, 1);
2896 
2897 	PG_RETURN_BOOL(result);
2898 }
2899 
2900 Datum
textnename(PG_FUNCTION_ARGS)2901 textnename(PG_FUNCTION_ARGS)
2902 {
2903 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2904 	Name		arg2 = PG_GETARG_NAME(1);
2905 	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
2906 	size_t		len2 = strlen(NameStr(*arg2));
2907 	Oid			collid = PG_GET_COLLATION();
2908 	bool		result;
2909 
2910 	check_collation_set(collid);
2911 
2912 	if (collid == C_COLLATION_OID)
2913 		result = !(len1 == len2 &&
2914 				   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2915 	else
2916 		result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2917 							  NameStr(*arg2), len2,
2918 							  collid) == 0);
2919 
2920 	PG_FREE_IF_COPY(arg1, 0);
2921 
2922 	PG_RETURN_BOOL(result);
2923 }
2924 
2925 Datum
btnametextcmp(PG_FUNCTION_ARGS)2926 btnametextcmp(PG_FUNCTION_ARGS)
2927 {
2928 	Name		arg1 = PG_GETARG_NAME(0);
2929 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2930 	int32		result;
2931 
2932 	result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2933 						VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2934 						PG_GET_COLLATION());
2935 
2936 	PG_FREE_IF_COPY(arg2, 1);
2937 
2938 	PG_RETURN_INT32(result);
2939 }
2940 
2941 Datum
bttextnamecmp(PG_FUNCTION_ARGS)2942 bttextnamecmp(PG_FUNCTION_ARGS)
2943 {
2944 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2945 	Name		arg2 = PG_GETARG_NAME(1);
2946 	int32		result;
2947 
2948 	result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2949 						NameStr(*arg2), strlen(NameStr(*arg2)),
2950 						PG_GET_COLLATION());
2951 
2952 	PG_FREE_IF_COPY(arg1, 0);
2953 
2954 	PG_RETURN_INT32(result);
2955 }
2956 
/*
 * CmpCall
 *	  Call the given comparison function with the current call's collation
 *	  and its first two arguments, and yield the result as an int32.
 *
 * The expansion uses PG_GET_COLLATION() and PG_GETARG_DATUM(), so it is
 * meant to appear only inside a function declared with PG_FUNCTION_ARGS.
 */
#define CmpCall(cmpfunc) \
	DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
2962 
2963 Datum
namelttext(PG_FUNCTION_ARGS)2964 namelttext(PG_FUNCTION_ARGS)
2965 {
2966 	PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2967 }
2968 
2969 Datum
nameletext(PG_FUNCTION_ARGS)2970 nameletext(PG_FUNCTION_ARGS)
2971 {
2972 	PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2973 }
2974 
2975 Datum
namegttext(PG_FUNCTION_ARGS)2976 namegttext(PG_FUNCTION_ARGS)
2977 {
2978 	PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2979 }
2980 
2981 Datum
namegetext(PG_FUNCTION_ARGS)2982 namegetext(PG_FUNCTION_ARGS)
2983 {
2984 	PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2985 }
2986 
2987 Datum
textltname(PG_FUNCTION_ARGS)2988 textltname(PG_FUNCTION_ARGS)
2989 {
2990 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2991 }
2992 
2993 Datum
textlename(PG_FUNCTION_ARGS)2994 textlename(PG_FUNCTION_ARGS)
2995 {
2996 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2997 }
2998 
2999 Datum
textgtname(PG_FUNCTION_ARGS)3000 textgtname(PG_FUNCTION_ARGS)
3001 {
3002 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3003 }
3004 
3005 Datum
textgename(PG_FUNCTION_ARGS)3006 textgename(PG_FUNCTION_ARGS)
3007 {
3008 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3009 }
3010 
3011 #undef CmpCall
3012 
3013 
3014 /*
3015  * The following operators support character-by-character comparison
3016  * of text datums, to allow building indexes suitable for LIKE clauses.
3017  * Note that the regular texteq/textne comparison operators, and regular
3018  * support functions 1 and 2 with "C" collation are assumed to be
3019  * compatible with these!
3020  */
3021 
3022 static int
internal_text_pattern_compare(text * arg1,text * arg2)3023 internal_text_pattern_compare(text *arg1, text *arg2)
3024 {
3025 	int			result;
3026 	int			len1,
3027 				len2;
3028 
3029 	len1 = VARSIZE_ANY_EXHDR(arg1);
3030 	len2 = VARSIZE_ANY_EXHDR(arg2);
3031 
3032 	result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3033 	if (result != 0)
3034 		return result;
3035 	else if (len1 < len2)
3036 		return -1;
3037 	else if (len1 > len2)
3038 		return 1;
3039 	else
3040 		return 0;
3041 }
3042 
3043 
3044 Datum
text_pattern_lt(PG_FUNCTION_ARGS)3045 text_pattern_lt(PG_FUNCTION_ARGS)
3046 {
3047 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3048 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3049 	int			result;
3050 
3051 	result = internal_text_pattern_compare(arg1, arg2);
3052 
3053 	PG_FREE_IF_COPY(arg1, 0);
3054 	PG_FREE_IF_COPY(arg2, 1);
3055 
3056 	PG_RETURN_BOOL(result < 0);
3057 }
3058 
3059 
3060 Datum
text_pattern_le(PG_FUNCTION_ARGS)3061 text_pattern_le(PG_FUNCTION_ARGS)
3062 {
3063 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3064 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3065 	int			result;
3066 
3067 	result = internal_text_pattern_compare(arg1, arg2);
3068 
3069 	PG_FREE_IF_COPY(arg1, 0);
3070 	PG_FREE_IF_COPY(arg2, 1);
3071 
3072 	PG_RETURN_BOOL(result <= 0);
3073 }
3074 
3075 
3076 Datum
text_pattern_ge(PG_FUNCTION_ARGS)3077 text_pattern_ge(PG_FUNCTION_ARGS)
3078 {
3079 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3080 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3081 	int			result;
3082 
3083 	result = internal_text_pattern_compare(arg1, arg2);
3084 
3085 	PG_FREE_IF_COPY(arg1, 0);
3086 	PG_FREE_IF_COPY(arg2, 1);
3087 
3088 	PG_RETURN_BOOL(result >= 0);
3089 }
3090 
3091 
3092 Datum
text_pattern_gt(PG_FUNCTION_ARGS)3093 text_pattern_gt(PG_FUNCTION_ARGS)
3094 {
3095 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3096 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3097 	int			result;
3098 
3099 	result = internal_text_pattern_compare(arg1, arg2);
3100 
3101 	PG_FREE_IF_COPY(arg1, 0);
3102 	PG_FREE_IF_COPY(arg2, 1);
3103 
3104 	PG_RETURN_BOOL(result > 0);
3105 }
3106 
3107 
3108 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)3109 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3110 {
3111 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3112 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3113 	int			result;
3114 
3115 	result = internal_text_pattern_compare(arg1, arg2);
3116 
3117 	PG_FREE_IF_COPY(arg1, 0);
3118 	PG_FREE_IF_COPY(arg2, 1);
3119 
3120 	PG_RETURN_INT32(result);
3121 }
3122 
3123 
3124 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)3125 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3126 {
3127 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3128 	MemoryContext oldcontext;
3129 
3130 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3131 
3132 	/* Use generic string SortSupport, forcing "C" collation */
3133 	varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3134 
3135 	MemoryContextSwitchTo(oldcontext);
3136 
3137 	PG_RETURN_VOID();
3138 }
3139 
3140 
3141 /*-------------------------------------------------------------
3142  * byteaoctetlen
3143  *
3144  * get the number of bytes contained in an instance of type 'bytea'
3145  *-------------------------------------------------------------
3146  */
3147 Datum
byteaoctetlen(PG_FUNCTION_ARGS)3148 byteaoctetlen(PG_FUNCTION_ARGS)
3149 {
3150 	Datum		str = PG_GETARG_DATUM(0);
3151 
3152 	/* We need not detoast the input at all */
3153 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3154 }
3155 
3156 /*
3157  * byteacat -
3158  *	  takes two bytea* and returns a bytea* that is the concatenation of
3159  *	  the two.
3160  *
3161  * Cloned from textcat and modified as required.
3162  */
3163 Datum
byteacat(PG_FUNCTION_ARGS)3164 byteacat(PG_FUNCTION_ARGS)
3165 {
3166 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3167 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3168 
3169 	PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3170 }
3171 
3172 /*
3173  * bytea_catenate
3174  *	Guts of byteacat(), broken out so it can be used by other functions
3175  *
3176  * Arguments can be in short-header form, but not compressed or out-of-line
3177  */
3178 static bytea *
bytea_catenate(bytea * t1,bytea * t2)3179 bytea_catenate(bytea *t1, bytea *t2)
3180 {
3181 	bytea	   *result;
3182 	int			len1,
3183 				len2,
3184 				len;
3185 	char	   *ptr;
3186 
3187 	len1 = VARSIZE_ANY_EXHDR(t1);
3188 	len2 = VARSIZE_ANY_EXHDR(t2);
3189 
3190 	/* paranoia ... probably should throw error instead? */
3191 	if (len1 < 0)
3192 		len1 = 0;
3193 	if (len2 < 0)
3194 		len2 = 0;
3195 
3196 	len = len1 + len2 + VARHDRSZ;
3197 	result = (bytea *) palloc(len);
3198 
3199 	/* Set size of result string... */
3200 	SET_VARSIZE(result, len);
3201 
3202 	/* Fill data field of result string... */
3203 	ptr = VARDATA(result);
3204 	if (len1 > 0)
3205 		memcpy(ptr, VARDATA_ANY(t1), len1);
3206 	if (len2 > 0)
3207 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3208 
3209 	return result;
3210 }
3211 
/*
 * PG_STR_GET_BYTEA
 *	  Turn a C string into a bytea pointer by passing it through byteain()
 *	  and converting the resulting Datum with DatumGetByteaPP().
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3214 
3215 /*
3216  * bytea_substr()
3217  * Return a substring starting at the specified position.
3218  * Cloned from text_substr and modified as required.
3219  *
3220  * Input:
3221  *	- string
3222  *	- starting position (is one-based)
3223  *	- string length (optional)
3224  *
3225  * If the starting position is zero or less, then return from the start of the string
3226  * adjusting the length to be consistent with the "negative start" per SQL.
3227  * If the length is less than zero, an ERROR is thrown. If no third argument
3228  * (length) is provided, the length to the end of the string is assumed.
3229  */
3230 Datum
bytea_substr(PG_FUNCTION_ARGS)3231 bytea_substr(PG_FUNCTION_ARGS)
3232 {
3233 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3234 									  PG_GETARG_INT32(1),
3235 									  PG_GETARG_INT32(2),
3236 									  false));
3237 }
3238 
3239 /*
3240  * bytea_substr_no_len -
3241  *	  Wrapper to avoid opr_sanity failure due to
3242  *	  one function accepting a different number of args.
3243  */
3244 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)3245 bytea_substr_no_len(PG_FUNCTION_ARGS)
3246 {
3247 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3248 									  PG_GETARG_INT32(1),
3249 									  -1,
3250 									  true));
3251 }
3252 
3253 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)3254 bytea_substring(Datum str,
3255 				int S,
3256 				int L,
3257 				bool length_not_specified)
3258 {
3259 	int32		S1;				/* adjusted start position */
3260 	int32		L1;				/* adjusted substring length */
3261 	int32		E;				/* end position */
3262 
3263 	/*
3264 	 * The logic here should generally match text_substring().
3265 	 */
3266 	S1 = Max(S, 1);
3267 
3268 	if (length_not_specified)
3269 	{
3270 		/*
3271 		 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3272 		 * end of the string if we pass it a negative value for length.
3273 		 */
3274 		L1 = -1;
3275 	}
3276 	else if (L < 0)
3277 	{
3278 		/* SQL99 says to throw an error for E < S, i.e., negative length */
3279 		ereport(ERROR,
3280 				(errcode(ERRCODE_SUBSTRING_ERROR),
3281 				 errmsg("negative substring length not allowed")));
3282 		L1 = -1;				/* silence stupider compilers */
3283 	}
3284 	else if (pg_add_s32_overflow(S, L, &E))
3285 	{
3286 		/*
3287 		 * L could be large enough for S + L to overflow, in which case the
3288 		 * substring must run to end of string.
3289 		 */
3290 		L1 = -1;
3291 	}
3292 	else
3293 	{
3294 		/*
3295 		 * A zero or negative value for the end position can happen if the
3296 		 * start was negative or one. SQL99 says to return a zero-length
3297 		 * string.
3298 		 */
3299 		if (E < 1)
3300 			return PG_STR_GET_BYTEA("");
3301 
3302 		L1 = E - S1;
3303 	}
3304 
3305 	/*
3306 	 * If the start position is past the end of the string, SQL99 says to
3307 	 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3308 	 * us.  We need only convert S1 to zero-based starting position.
3309 	 */
3310 	return DatumGetByteaPSlice(str, S1 - 1, L1);
3311 }
3312 
3313 /*
3314  * byteaoverlay
3315  *	Replace specified substring of first string with second
3316  *
3317  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3318  * This code is a direct implementation of what the standard says.
3319  */
3320 Datum
byteaoverlay(PG_FUNCTION_ARGS)3321 byteaoverlay(PG_FUNCTION_ARGS)
3322 {
3323 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3324 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3325 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
3326 	int			sl = PG_GETARG_INT32(3);	/* substring length */
3327 
3328 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3329 }
3330 
3331 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)3332 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3333 {
3334 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3335 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3336 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
3337 	int			sl;
3338 
3339 	sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3340 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3341 }
3342 
3343 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)3344 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3345 {
3346 	bytea	   *result;
3347 	bytea	   *s1;
3348 	bytea	   *s2;
3349 	int			sp_pl_sl;
3350 
3351 	/*
3352 	 * Check for possible integer-overflow cases.  For negative sp, throw a
3353 	 * "substring length" error because that's what should be expected
3354 	 * according to the spec's definition of OVERLAY().
3355 	 */
3356 	if (sp <= 0)
3357 		ereport(ERROR,
3358 				(errcode(ERRCODE_SUBSTRING_ERROR),
3359 				 errmsg("negative substring length not allowed")));
3360 	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3361 		ereport(ERROR,
3362 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3363 				 errmsg("integer out of range")));
3364 
3365 	s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3366 	s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3367 	result = bytea_catenate(s1, t2);
3368 	result = bytea_catenate(result, s2);
3369 
3370 	return result;
3371 }
3372 
3373 /*
3374  * byteapos -
3375  *	  Return the position of the specified substring.
3376  *	  Implements the SQL POSITION() function.
3377  * Cloned from textpos and modified as required.
3378  */
3379 Datum
byteapos(PG_FUNCTION_ARGS)3380 byteapos(PG_FUNCTION_ARGS)
3381 {
3382 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3383 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3384 	int			pos;
3385 	int			px,
3386 				p;
3387 	int			len1,
3388 				len2;
3389 	char	   *p1,
3390 			   *p2;
3391 
3392 	len1 = VARSIZE_ANY_EXHDR(t1);
3393 	len2 = VARSIZE_ANY_EXHDR(t2);
3394 
3395 	if (len2 <= 0)
3396 		PG_RETURN_INT32(1);		/* result for empty pattern */
3397 
3398 	p1 = VARDATA_ANY(t1);
3399 	p2 = VARDATA_ANY(t2);
3400 
3401 	pos = 0;
3402 	px = (len1 - len2);
3403 	for (p = 0; p <= px; p++)
3404 	{
3405 		if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3406 		{
3407 			pos = p + 1;
3408 			break;
3409 		};
3410 		p1++;
3411 	};
3412 
3413 	PG_RETURN_INT32(pos);
3414 }
3415 
3416 /*-------------------------------------------------------------
3417  * byteaGetByte
3418  *
3419  * this routine treats "bytea" as an array of bytes.
3420  * It returns the Nth byte (a number between 0 and 255).
3421  *-------------------------------------------------------------
3422  */
3423 Datum
byteaGetByte(PG_FUNCTION_ARGS)3424 byteaGetByte(PG_FUNCTION_ARGS)
3425 {
3426 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3427 	int32		n = PG_GETARG_INT32(1);
3428 	int			len;
3429 	int			byte;
3430 
3431 	len = VARSIZE_ANY_EXHDR(v);
3432 
3433 	if (n < 0 || n >= len)
3434 		ereport(ERROR,
3435 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3436 				 errmsg("index %d out of valid range, 0..%d",
3437 						n, len - 1)));
3438 
3439 	byte = ((unsigned char *) VARDATA_ANY(v))[n];
3440 
3441 	PG_RETURN_INT32(byte);
3442 }
3443 
3444 /*-------------------------------------------------------------
3445  * byteaGetBit
3446  *
3447  * This routine treats a "bytea" type like an array of bits.
3448  * It returns the value of the Nth bit (0 or 1).
3449  *
3450  *-------------------------------------------------------------
3451  */
3452 Datum
byteaGetBit(PG_FUNCTION_ARGS)3453 byteaGetBit(PG_FUNCTION_ARGS)
3454 {
3455 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3456 	int32		n = PG_GETARG_INT32(1);
3457 	int			byteNo,
3458 				bitNo;
3459 	int			len;
3460 	int			byte;
3461 
3462 	len = VARSIZE_ANY_EXHDR(v);
3463 
3464 	/* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
3465 	if (n < 0 || n >= (int64) len * 8)
3466 		ereport(ERROR,
3467 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3468 				 errmsg("index %d out of valid range, 0..%d",
3469 						n, (int) Min((int64) len * 8 - 1, INT_MAX))));
3470 
3471 	byteNo = n / 8;
3472 	bitNo = n % 8;
3473 
3474 	byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3475 
3476 	if (byte & (1 << bitNo))
3477 		PG_RETURN_INT32(1);
3478 	else
3479 		PG_RETURN_INT32(0);
3480 }
3481 
3482 /*-------------------------------------------------------------
3483  * byteaSetByte
3484  *
3485  * Given an instance of type 'bytea' creates a new one with
3486  * the Nth byte set to the given value.
3487  *
3488  *-------------------------------------------------------------
3489  */
3490 Datum
byteaSetByte(PG_FUNCTION_ARGS)3491 byteaSetByte(PG_FUNCTION_ARGS)
3492 {
3493 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3494 	int32		n = PG_GETARG_INT32(1);
3495 	int32		newByte = PG_GETARG_INT32(2);
3496 	int			len;
3497 
3498 	len = VARSIZE(res) - VARHDRSZ;
3499 
3500 	if (n < 0 || n >= len)
3501 		ereport(ERROR,
3502 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3503 				 errmsg("index %d out of valid range, 0..%d",
3504 						n, len - 1)));
3505 
3506 	/*
3507 	 * Now set the byte.
3508 	 */
3509 	((unsigned char *) VARDATA(res))[n] = newByte;
3510 
3511 	PG_RETURN_BYTEA_P(res);
3512 }
3513 
3514 /*-------------------------------------------------------------
3515  * byteaSetBit
3516  *
3517  * Given an instance of type 'bytea' creates a new one with
3518  * the Nth bit set to the given value.
3519  *
3520  *-------------------------------------------------------------
3521  */
3522 Datum
byteaSetBit(PG_FUNCTION_ARGS)3523 byteaSetBit(PG_FUNCTION_ARGS)
3524 {
3525 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3526 	int32		n = PG_GETARG_INT32(1);
3527 	int32		newBit = PG_GETARG_INT32(2);
3528 	int			len;
3529 	int			oldByte,
3530 				newByte;
3531 	int			byteNo,
3532 				bitNo;
3533 
3534 	len = VARSIZE(res) - VARHDRSZ;
3535 
3536 	/* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
3537 	if (n < 0 || n >= (int64) len * 8)
3538 		ereport(ERROR,
3539 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3540 				 errmsg("index %d out of valid range, 0..%d",
3541 						n, (int) Min((int64) len * 8 - 1, INT_MAX))));
3542 
3543 	byteNo = n / 8;
3544 	bitNo = n % 8;
3545 
3546 	/*
3547 	 * sanity check!
3548 	 */
3549 	if (newBit != 0 && newBit != 1)
3550 		ereport(ERROR,
3551 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3552 				 errmsg("new bit must be 0 or 1")));
3553 
3554 	/*
3555 	 * Update the byte.
3556 	 */
3557 	oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3558 
3559 	if (newBit == 0)
3560 		newByte = oldByte & (~(1 << bitNo));
3561 	else
3562 		newByte = oldByte | (1 << bitNo);
3563 
3564 	((unsigned char *) VARDATA(res))[byteNo] = newByte;
3565 
3566 	PG_RETURN_BYTEA_P(res);
3567 }
3568 
3569 
3570 /* text_name()
3571  * Converts a text type to a Name type.
3572  */
3573 Datum
text_name(PG_FUNCTION_ARGS)3574 text_name(PG_FUNCTION_ARGS)
3575 {
3576 	text	   *s = PG_GETARG_TEXT_PP(0);
3577 	Name		result;
3578 	int			len;
3579 
3580 	len = VARSIZE_ANY_EXHDR(s);
3581 
3582 	/* Truncate oversize input */
3583 	if (len >= NAMEDATALEN)
3584 		len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3585 
3586 	/* We use palloc0 here to ensure result is zero-padded */
3587 	result = (Name) palloc0(NAMEDATALEN);
3588 	memcpy(NameStr(*result), VARDATA_ANY(s), len);
3589 
3590 	PG_RETURN_NAME(result);
3591 }
3592 
3593 /* name_text()
3594  * Converts a Name type to a text type.
3595  */
3596 Datum
name_text(PG_FUNCTION_ARGS)3597 name_text(PG_FUNCTION_ARGS)
3598 {
3599 	Name		s = PG_GETARG_NAME(0);
3600 
3601 	PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3602 }
3603 
3604 
3605 /*
3606  * textToQualifiedNameList - convert a text object to list of names
3607  *
3608  * This implements the input parsing needed by nextval() and other
3609  * functions that take a text parameter representing a qualified name.
3610  * We split the name at dots, downcase if not double-quoted, and
3611  * truncate names if they're too long.
3612  */
3613 List *
textToQualifiedNameList(text * textval)3614 textToQualifiedNameList(text *textval)
3615 {
3616 	char	   *rawname;
3617 	List	   *result = NIL;
3618 	List	   *namelist;
3619 	ListCell   *l;
3620 
3621 	/* Convert to C string (handles possible detoasting). */
3622 	/* Note we rely on being able to modify rawname below. */
3623 	rawname = text_to_cstring(textval);
3624 
3625 	if (!SplitIdentifierString(rawname, '.', &namelist))
3626 		ereport(ERROR,
3627 				(errcode(ERRCODE_INVALID_NAME),
3628 				 errmsg("invalid name syntax")));
3629 
3630 	if (namelist == NIL)
3631 		ereport(ERROR,
3632 				(errcode(ERRCODE_INVALID_NAME),
3633 				 errmsg("invalid name syntax")));
3634 
3635 	foreach(l, namelist)
3636 	{
3637 		char	   *curname = (char *) lfirst(l);
3638 
3639 		result = lappend(result, makeString(pstrdup(curname)));
3640 	}
3641 
3642 	pfree(rawname);
3643 	list_free(namelist);
3644 
3645 	return result;
3646 }
3647 
3648 /*
3649  * SplitIdentifierString --- parse a string containing identifiers
3650  *
3651  * This is the guts of textToQualifiedNameList, and is exported for use in
3652  * other situations such as parsing GUC variables.  In the GUC case, it's
3653  * important to avoid memory leaks, so the API is designed to minimize the
3654  * amount of stuff that needs to be allocated and freed.
3655  *
3656  * Inputs:
3657  *	rawstring: the input string; must be overwritable!	On return, it's
3658  *			   been modified to contain the separated identifiers.
3659  *	separator: the separator punctuation expected between identifiers
3660  *			   (typically '.' or ',').  Whitespace may also appear around
3661  *			   identifiers.
3662  * Outputs:
3663  *	namelist: filled with a palloc'd list of pointers to identifiers within
3664  *			  rawstring.  Caller should list_free() this even on error return.
3665  *
3666  * Returns true if okay, false if there is a syntax error in the string.
3667  *
3668  * Note that an empty string is considered okay here, though not in
3669  * textToQualifiedNameList.
3670  */
bool
SplitIdentifierString(char *rawstring, char separator,
					  List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;	/* set once the end of string is reached */

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;	/* start of the current identifier */
		char	   *endp;		/* byte that will become its terminator */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs, no downcasing */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				/* (strlen(endp) bytes from endp + 1 include the trailing NUL) */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search after the quote that was kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			char	   *downname;
			int			len;

			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */

			/*
			 * Downcase the identifier, using same code as main lexer does.
			 *
			 * XXX because we want to overwrite the input in-place, we cannot
			 * support a downcasing transformation that increases the string
			 * length.  This is not a problem given the current implementation
			 * of downcase_truncate_identifier, but we'll probably have to do
			 * something about this someday.
			 */
			len = endp - curname;
			downname = downcase_truncate_identifier(curname, len, false);
			Assert(strlen(downname) <= len);

			/*
			 * strncpy writes exactly len bytes: it NUL-pads a shorter
			 * downname and never touches the byte at endp, which is still
			 * examined below.
			 */
			strncpy(curname, downname, len);	/* strncpy is required here */
			pfree(downname);
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		/* (for a quoted name, endp is the closing quote instead) */
		*endp = '\0';

		/* Truncate name if it's overlength */
		truncate_identifier(curname, strlen(curname), false);

		/*
		 * Finished isolating current name --- add it to list
		 */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3771 
3772 
3773 /*
3774  * SplitDirectoriesString --- parse a string containing file/directory names
3775  *
3776  * This works fine on file names too; the function name is historical.
3777  *
3778  * This is similar to SplitIdentifierString, except that the parsing
3779  * rules are meant to handle pathnames instead of identifiers: there is
3780  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3781  * and we apply canonicalize_path() to each extracted string.  Because of the
3782  * last, the returned strings are separately palloc'd rather than being
3783  * pointers into rawstring --- but we still scribble on rawstring.
3784  *
3785  * Inputs:
3786  *	rawstring: the input string; must be modifiable!
3787  *	separator: the separator punctuation expected between directories
3788  *			   (typically ',' or ';').  Whitespace may also appear around
3789  *			   directories.
3790  * Outputs:
3791  *	namelist: filled with a palloc'd list of directory names.
3792  *			  Caller should list_free_deep() this even on error return.
3793  *
3794  * Returns true if okay, false if there is a syntax error in the string.
3795  *
3796  * Note that an empty string is considered okay here.
3797  */
bool
SplitDirectoriesString(char *rawstring, char separator,
					   List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;	/* set once the end of string is reached */

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new directory. */
	do
	{
		char	   *curname;	/* start of the current name */
		char	   *endp;		/* byte that will become its terminator */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				/* (strlen(endp) bytes from endp + 1 include the trailing NUL) */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search after the quote that was kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or end of string */
			curname = endp = nextp;
			while (*nextp && *nextp != separator)
			{
				/* trailing whitespace should not be included in name */
				/* (endp stays one past the last non-whitespace byte seen) */
				if (!scanner_isspace(*nextp))
					endp = nextp + 1;
				nextp++;
			}
			if (curname == endp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		/* (endp may instead be a closing quote or trailing whitespace) */
		*endp = '\0';

		/* Truncate path if it's overlength */
		/* (this leaves at most MAXPGPATH - 1 bytes before the terminator) */
		if (strlen(curname) >= MAXPGPATH)
			curname[MAXPGPATH - 1] = '\0';

		/*
		 * Finished isolating current name --- add it to list
		 */
		/* canonicalize a palloc'd copy, so the list owns separate strings */
		curname = pstrdup(curname);
		canonicalize_path(curname);
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3886 
3887 
3888 /*
3889  * SplitGUCList --- parse a string containing identifiers or file names
3890  *
3891  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3892  * presuming whether the elements will be taken as identifiers or file names.
3893  * We assume the input has already been through flatten_set_variable_args(),
3894  * so that we need never downcase (if appropriate, that was done already).
3895  * Nor do we ever truncate, since we don't know the correct max length.
3896  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3897  * because any embedded whitespace should have led to double-quoting).
3898  * Otherwise the API is identical to SplitIdentifierString.
3899  *
3900  * XXX it's annoying to have so many copies of this string-splitting logic.
3901  * However, it's not clear that having one function with a bunch of option
3902  * flags would be much better.
3903  *
3904  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3905  * Be sure to update that if you have to change this.
3906  *
3907  * Inputs:
3908  *	rawstring: the input string; must be overwritable!	On return, it's
3909  *			   been modified to contain the separated identifiers.
3910  *	separator: the separator punctuation expected between identifiers
3911  *			   (typically '.' or ',').  Whitespace may also appear around
3912  *			   identifiers.
3913  * Outputs:
3914  *	namelist: filled with a palloc'd list of pointers to identifiers within
3915  *			  rawstring.  Caller should list_free() this even on error return.
3916  *
3917  * Returns true if okay, false if there is a syntax error in the string.
3918  */
bool
SplitGUCList(char *rawstring, char separator,
			 List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;	/* set once the end of string is reached */

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;	/* start of the current element */
		char	   *endp;		/* byte that will become its terminator */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				/* (strlen(endp) bytes from endp + 1 include the trailing NUL) */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search after the quote that was kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		/* (for a quoted name, endp is the closing quote instead) */
		*endp = '\0';

		/*
		 * Finished isolating current name --- add it to list
		 */
		/* (the entry points into rawstring; no downcasing or truncation) */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3998 
3999 
4000 /*****************************************************************************
4001  *	Comparison Functions used for bytea
4002  *
4003  * Note: btree indexes need these routines not to leak memory; therefore,
4004  * be careful to free working copies of toasted datums.  Most places don't
4005  * need to be so careful.
4006  *****************************************************************************/
4007 
4008 Datum
byteaeq(PG_FUNCTION_ARGS)4009 byteaeq(PG_FUNCTION_ARGS)
4010 {
4011 	Datum		arg1 = PG_GETARG_DATUM(0);
4012 	Datum		arg2 = PG_GETARG_DATUM(1);
4013 	bool		result;
4014 	Size		len1,
4015 				len2;
4016 
4017 	/*
4018 	 * We can use a fast path for unequal lengths, which might save us from
4019 	 * having to detoast one or both values.
4020 	 */
4021 	len1 = toast_raw_datum_size(arg1);
4022 	len2 = toast_raw_datum_size(arg2);
4023 	if (len1 != len2)
4024 		result = false;
4025 	else
4026 	{
4027 		bytea	   *barg1 = DatumGetByteaPP(arg1);
4028 		bytea	   *barg2 = DatumGetByteaPP(arg2);
4029 
4030 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4031 						 len1 - VARHDRSZ) == 0);
4032 
4033 		PG_FREE_IF_COPY(barg1, 0);
4034 		PG_FREE_IF_COPY(barg2, 1);
4035 	}
4036 
4037 	PG_RETURN_BOOL(result);
4038 }
4039 
4040 Datum
byteane(PG_FUNCTION_ARGS)4041 byteane(PG_FUNCTION_ARGS)
4042 {
4043 	Datum		arg1 = PG_GETARG_DATUM(0);
4044 	Datum		arg2 = PG_GETARG_DATUM(1);
4045 	bool		result;
4046 	Size		len1,
4047 				len2;
4048 
4049 	/*
4050 	 * We can use a fast path for unequal lengths, which might save us from
4051 	 * having to detoast one or both values.
4052 	 */
4053 	len1 = toast_raw_datum_size(arg1);
4054 	len2 = toast_raw_datum_size(arg2);
4055 	if (len1 != len2)
4056 		result = true;
4057 	else
4058 	{
4059 		bytea	   *barg1 = DatumGetByteaPP(arg1);
4060 		bytea	   *barg2 = DatumGetByteaPP(arg2);
4061 
4062 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4063 						 len1 - VARHDRSZ) != 0);
4064 
4065 		PG_FREE_IF_COPY(barg1, 0);
4066 		PG_FREE_IF_COPY(barg2, 1);
4067 	}
4068 
4069 	PG_RETURN_BOOL(result);
4070 }
4071 
4072 Datum
bytealt(PG_FUNCTION_ARGS)4073 bytealt(PG_FUNCTION_ARGS)
4074 {
4075 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4076 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4077 	int			len1,
4078 				len2;
4079 	int			cmp;
4080 
4081 	len1 = VARSIZE_ANY_EXHDR(arg1);
4082 	len2 = VARSIZE_ANY_EXHDR(arg2);
4083 
4084 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4085 
4086 	PG_FREE_IF_COPY(arg1, 0);
4087 	PG_FREE_IF_COPY(arg2, 1);
4088 
4089 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4090 }
4091 
4092 Datum
byteale(PG_FUNCTION_ARGS)4093 byteale(PG_FUNCTION_ARGS)
4094 {
4095 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4096 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4097 	int			len1,
4098 				len2;
4099 	int			cmp;
4100 
4101 	len1 = VARSIZE_ANY_EXHDR(arg1);
4102 	len2 = VARSIZE_ANY_EXHDR(arg2);
4103 
4104 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4105 
4106 	PG_FREE_IF_COPY(arg1, 0);
4107 	PG_FREE_IF_COPY(arg2, 1);
4108 
4109 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4110 }
4111 
4112 Datum
byteagt(PG_FUNCTION_ARGS)4113 byteagt(PG_FUNCTION_ARGS)
4114 {
4115 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4116 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4117 	int			len1,
4118 				len2;
4119 	int			cmp;
4120 
4121 	len1 = VARSIZE_ANY_EXHDR(arg1);
4122 	len2 = VARSIZE_ANY_EXHDR(arg2);
4123 
4124 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4125 
4126 	PG_FREE_IF_COPY(arg1, 0);
4127 	PG_FREE_IF_COPY(arg2, 1);
4128 
4129 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4130 }
4131 
4132 Datum
byteage(PG_FUNCTION_ARGS)4133 byteage(PG_FUNCTION_ARGS)
4134 {
4135 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4136 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4137 	int			len1,
4138 				len2;
4139 	int			cmp;
4140 
4141 	len1 = VARSIZE_ANY_EXHDR(arg1);
4142 	len2 = VARSIZE_ANY_EXHDR(arg2);
4143 
4144 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4145 
4146 	PG_FREE_IF_COPY(arg1, 0);
4147 	PG_FREE_IF_COPY(arg2, 1);
4148 
4149 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4150 }
4151 
4152 Datum
byteacmp(PG_FUNCTION_ARGS)4153 byteacmp(PG_FUNCTION_ARGS)
4154 {
4155 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4156 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4157 	int			len1,
4158 				len2;
4159 	int			cmp;
4160 
4161 	len1 = VARSIZE_ANY_EXHDR(arg1);
4162 	len2 = VARSIZE_ANY_EXHDR(arg2);
4163 
4164 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4165 	if ((cmp == 0) && (len1 != len2))
4166 		cmp = (len1 < len2) ? -1 : 1;
4167 
4168 	PG_FREE_IF_COPY(arg1, 0);
4169 	PG_FREE_IF_COPY(arg2, 1);
4170 
4171 	PG_RETURN_INT32(cmp);
4172 }
4173 
4174 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)4175 bytea_sortsupport(PG_FUNCTION_ARGS)
4176 {
4177 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4178 	MemoryContext oldcontext;
4179 
4180 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4181 
4182 	/* Use generic string SortSupport, forcing "C" collation */
4183 	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4184 
4185 	MemoryContextSwitchTo(oldcontext);
4186 
4187 	PG_RETURN_VOID();
4188 }
4189 
4190 /*
4191  * appendStringInfoText
4192  *
4193  * Append a text to str.
4194  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4195  */
4196 static void
appendStringInfoText(StringInfo str,const text * t)4197 appendStringInfoText(StringInfo str, const text *t)
4198 {
4199 	appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4200 }
4201 
4202 /*
4203  * replace_text
4204  * replace all occurrences of 'old_sub_str' in 'orig_str'
4205  * with 'new_sub_str' to form 'new_str'
4206  *
4207  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4208  * otherwise returns 'new_str'
4209  */
4210 Datum
replace_text(PG_FUNCTION_ARGS)4211 replace_text(PG_FUNCTION_ARGS)
4212 {
4213 	text	   *src_text = PG_GETARG_TEXT_PP(0);
4214 	text	   *from_sub_text = PG_GETARG_TEXT_PP(1);
4215 	text	   *to_sub_text = PG_GETARG_TEXT_PP(2);
4216 	int			src_text_len;
4217 	int			from_sub_text_len;
4218 	TextPositionState state;
4219 	text	   *ret_text;
4220 	int			chunk_len;
4221 	char	   *curr_ptr;
4222 	char	   *start_ptr;
4223 	StringInfoData str;
4224 	bool		found;
4225 
4226 	src_text_len = VARSIZE_ANY_EXHDR(src_text);
4227 	from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4228 
4229 	/* Return unmodified source string if empty source or pattern */
4230 	if (src_text_len < 1 || from_sub_text_len < 1)
4231 	{
4232 		PG_RETURN_TEXT_P(src_text);
4233 	}
4234 
4235 	text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4236 
4237 	found = text_position_next(&state);
4238 
4239 	/* When the from_sub_text is not found, there is nothing to do. */
4240 	if (!found)
4241 	{
4242 		text_position_cleanup(&state);
4243 		PG_RETURN_TEXT_P(src_text);
4244 	}
4245 	curr_ptr = text_position_get_match_ptr(&state);
4246 	start_ptr = VARDATA_ANY(src_text);
4247 
4248 	initStringInfo(&str);
4249 
4250 	do
4251 	{
4252 		CHECK_FOR_INTERRUPTS();
4253 
4254 		/* copy the data skipped over by last text_position_next() */
4255 		chunk_len = curr_ptr - start_ptr;
4256 		appendBinaryStringInfo(&str, start_ptr, chunk_len);
4257 
4258 		appendStringInfoText(&str, to_sub_text);
4259 
4260 		start_ptr = curr_ptr + from_sub_text_len;
4261 
4262 		found = text_position_next(&state);
4263 		if (found)
4264 			curr_ptr = text_position_get_match_ptr(&state);
4265 	}
4266 	while (found);
4267 
4268 	/* copy trailing data */
4269 	chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4270 	appendBinaryStringInfo(&str, start_ptr, chunk_len);
4271 
4272 	text_position_cleanup(&state);
4273 
4274 	ret_text = cstring_to_text_with_len(str.data, str.len);
4275 	pfree(str.data);
4276 
4277 	PG_RETURN_TEXT_P(ret_text);
4278 }
4279 
4280 /*
4281  * check_replace_text_has_escape_char
4282  *
4283  * check whether replace_text contains escape char.
4284  */
4285 static bool
check_replace_text_has_escape_char(const text * replace_text)4286 check_replace_text_has_escape_char(const text *replace_text)
4287 {
4288 	const char *p = VARDATA_ANY(replace_text);
4289 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4290 
4291 	if (pg_database_encoding_max_length() == 1)
4292 	{
4293 		for (; p < p_end; p++)
4294 		{
4295 			if (*p == '\\')
4296 				return true;
4297 		}
4298 	}
4299 	else
4300 	{
4301 		for (; p < p_end; p += pg_mblen(p))
4302 		{
4303 			if (*p == '\\')
4304 				return true;
4305 		}
4306 	}
4307 
4308 	return false;
4309 }
4310 
4311 /*
4312  * appendStringInfoRegexpSubstr
4313  *
4314  * Append replace_text to str, substituting regexp back references for
4315  * \n escapes.  start_ptr is the start of the match in the source string,
4316  * at logical character position data_pos.
4317  */
4318 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)4319 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4320 							 regmatch_t *pmatch,
4321 							 char *start_ptr, int data_pos)
4322 {
4323 	const char *p = VARDATA_ANY(replace_text);
4324 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4325 	int			eml = pg_database_encoding_max_length();
4326 
4327 	for (;;)
4328 	{
4329 		const char *chunk_start = p;
4330 		int			so;
4331 		int			eo;
4332 
4333 		/* Find next escape char. */
4334 		if (eml == 1)
4335 		{
4336 			for (; p < p_end && *p != '\\'; p++)
4337 				 /* nothing */ ;
4338 		}
4339 		else
4340 		{
4341 			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4342 				 /* nothing */ ;
4343 		}
4344 
4345 		/* Copy the text we just scanned over, if any. */
4346 		if (p > chunk_start)
4347 			appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4348 
4349 		/* Done if at end of string, else advance over escape char. */
4350 		if (p >= p_end)
4351 			break;
4352 		p++;
4353 
4354 		if (p >= p_end)
4355 		{
4356 			/* Escape at very end of input.  Treat same as unexpected char */
4357 			appendStringInfoChar(str, '\\');
4358 			break;
4359 		}
4360 
4361 		if (*p >= '1' && *p <= '9')
4362 		{
4363 			/* Use the back reference of regexp. */
4364 			int			idx = *p - '0';
4365 
4366 			so = pmatch[idx].rm_so;
4367 			eo = pmatch[idx].rm_eo;
4368 			p++;
4369 		}
4370 		else if (*p == '&')
4371 		{
4372 			/* Use the entire matched string. */
4373 			so = pmatch[0].rm_so;
4374 			eo = pmatch[0].rm_eo;
4375 			p++;
4376 		}
4377 		else if (*p == '\\')
4378 		{
4379 			/* \\ means transfer one \ to output. */
4380 			appendStringInfoChar(str, '\\');
4381 			p++;
4382 			continue;
4383 		}
4384 		else
4385 		{
4386 			/*
4387 			 * If escape char is not followed by any expected char, just treat
4388 			 * it as ordinary data to copy.  (XXX would it be better to throw
4389 			 * an error?)
4390 			 */
4391 			appendStringInfoChar(str, '\\');
4392 			continue;
4393 		}
4394 
4395 		if (so != -1 && eo != -1)
4396 		{
4397 			/*
4398 			 * Copy the text that is back reference of regexp.  Note so and eo
4399 			 * are counted in characters not bytes.
4400 			 */
4401 			char	   *chunk_start;
4402 			int			chunk_len;
4403 
4404 			Assert(so >= data_pos);
4405 			chunk_start = start_ptr;
4406 			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4407 			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4408 			appendBinaryStringInfo(str, chunk_start, chunk_len);
4409 		}
4410 	}
4411 }
4412 
4413 #define REGEXP_REPLACE_BACKREF_CNT		10
4414 
4415 /*
4416  * replace_text_regexp
4417  *
4418  * replace text that matches to regexp in src_text to replace_text.
4419  *
4420  * Note: to avoid having to include regex.h in builtins.h, we declare
4421  * the regexp argument as void *, but really it's regex_t *.
4422  */
4423 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)4424 replace_text_regexp(text *src_text, void *regexp,
4425 					text *replace_text, bool glob)
4426 {
4427 	text	   *ret_text;
4428 	regex_t    *re = (regex_t *) regexp;
4429 	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
4430 	StringInfoData buf;
4431 	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
4432 	pg_wchar   *data;
4433 	size_t		data_len;
4434 	int			search_start;
4435 	int			data_pos;
4436 	char	   *start_ptr;
4437 	bool		have_escape;
4438 
4439 	initStringInfo(&buf);
4440 
4441 	/* Convert data string to wide characters. */
4442 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4443 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4444 
4445 	/* Check whether replace_text has escape char. */
4446 	have_escape = check_replace_text_has_escape_char(replace_text);
4447 
4448 	/* start_ptr points to the data_pos'th character of src_text */
4449 	start_ptr = (char *) VARDATA_ANY(src_text);
4450 	data_pos = 0;
4451 
4452 	search_start = 0;
4453 	while (search_start <= data_len)
4454 	{
4455 		int			regexec_result;
4456 
4457 		CHECK_FOR_INTERRUPTS();
4458 
4459 		regexec_result = pg_regexec(re,
4460 									data,
4461 									data_len,
4462 									search_start,
4463 									NULL,	/* no details */
4464 									REGEXP_REPLACE_BACKREF_CNT,
4465 									pmatch,
4466 									0);
4467 
4468 		if (regexec_result == REG_NOMATCH)
4469 			break;
4470 
4471 		if (regexec_result != REG_OKAY)
4472 		{
4473 			char		errMsg[100];
4474 
4475 			CHECK_FOR_INTERRUPTS();
4476 			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4477 			ereport(ERROR,
4478 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4479 					 errmsg("regular expression failed: %s", errMsg)));
4480 		}
4481 
4482 		/*
4483 		 * Copy the text to the left of the match position.  Note we are given
4484 		 * character not byte indexes.
4485 		 */
4486 		if (pmatch[0].rm_so - data_pos > 0)
4487 		{
4488 			int			chunk_len;
4489 
4490 			chunk_len = charlen_to_bytelen(start_ptr,
4491 										   pmatch[0].rm_so - data_pos);
4492 			appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4493 
4494 			/*
4495 			 * Advance start_ptr over that text, to avoid multiple rescans of
4496 			 * it if the replace_text contains multiple back-references.
4497 			 */
4498 			start_ptr += chunk_len;
4499 			data_pos = pmatch[0].rm_so;
4500 		}
4501 
4502 		/*
4503 		 * Copy the replace_text. Process back references when the
4504 		 * replace_text has escape characters.
4505 		 */
4506 		if (have_escape)
4507 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4508 										 start_ptr, data_pos);
4509 		else
4510 			appendStringInfoText(&buf, replace_text);
4511 
4512 		/* Advance start_ptr and data_pos over the matched text. */
4513 		start_ptr += charlen_to_bytelen(start_ptr,
4514 										pmatch[0].rm_eo - data_pos);
4515 		data_pos = pmatch[0].rm_eo;
4516 
4517 		/*
4518 		 * When global option is off, replace the first instance only.
4519 		 */
4520 		if (!glob)
4521 			break;
4522 
4523 		/*
4524 		 * Advance search position.  Normally we start the next search at the
4525 		 * end of the previous match; but if the match was of zero length, we
4526 		 * have to advance by one character, or we'd just find the same match
4527 		 * again.
4528 		 */
4529 		search_start = data_pos;
4530 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
4531 			search_start++;
4532 	}
4533 
4534 	/*
4535 	 * Copy the text to the right of the last match.
4536 	 */
4537 	if (data_pos < data_len)
4538 	{
4539 		int			chunk_len;
4540 
4541 		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4542 		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4543 	}
4544 
4545 	ret_text = cstring_to_text_with_len(buf.data, buf.len);
4546 	pfree(buf.data);
4547 	pfree(data);
4548 
4549 	return ret_text;
4550 }
4551 
4552 /*
4553  * split_text
4554  * parse input string
4555  * return ord item (1 based)
4556  * based on provided field separator
4557  */
4558 Datum
split_text(PG_FUNCTION_ARGS)4559 split_text(PG_FUNCTION_ARGS)
4560 {
4561 	text	   *inputstring = PG_GETARG_TEXT_PP(0);
4562 	text	   *fldsep = PG_GETARG_TEXT_PP(1);
4563 	int			fldnum = PG_GETARG_INT32(2);
4564 	int			inputstring_len;
4565 	int			fldsep_len;
4566 	TextPositionState state;
4567 	char	   *start_ptr;
4568 	char	   *end_ptr;
4569 	text	   *result_text;
4570 	bool		found;
4571 
4572 	/* field number is 1 based */
4573 	if (fldnum < 1)
4574 		ereport(ERROR,
4575 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4576 				 errmsg("field position must be greater than zero")));
4577 
4578 	inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4579 	fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4580 
4581 	/* return empty string for empty input string */
4582 	if (inputstring_len < 1)
4583 		PG_RETURN_TEXT_P(cstring_to_text(""));
4584 
4585 	/* empty field separator */
4586 	if (fldsep_len < 1)
4587 	{
4588 		text_position_cleanup(&state);
4589 		/* if first field, return input string, else empty string */
4590 		if (fldnum == 1)
4591 			PG_RETURN_TEXT_P(inputstring);
4592 		else
4593 			PG_RETURN_TEXT_P(cstring_to_text(""));
4594 	}
4595 
4596 	text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4597 
4598 	/* identify bounds of first field */
4599 	start_ptr = VARDATA_ANY(inputstring);
4600 	found = text_position_next(&state);
4601 
4602 	/* special case if fldsep not found at all */
4603 	if (!found)
4604 	{
4605 		text_position_cleanup(&state);
4606 		/* if field 1 requested, return input string, else empty string */
4607 		if (fldnum == 1)
4608 			PG_RETURN_TEXT_P(inputstring);
4609 		else
4610 			PG_RETURN_TEXT_P(cstring_to_text(""));
4611 	}
4612 	end_ptr = text_position_get_match_ptr(&state);
4613 
4614 	while (found && --fldnum > 0)
4615 	{
4616 		/* identify bounds of next field */
4617 		start_ptr = end_ptr + fldsep_len;
4618 		found = text_position_next(&state);
4619 		if (found)
4620 			end_ptr = text_position_get_match_ptr(&state);
4621 	}
4622 
4623 	text_position_cleanup(&state);
4624 
4625 	if (fldnum > 0)
4626 	{
4627 		/* N'th field separator not found */
4628 		/* if last field requested, return it, else empty string */
4629 		if (fldnum == 1)
4630 		{
4631 			int			last_len = start_ptr - VARDATA_ANY(inputstring);
4632 
4633 			result_text = cstring_to_text_with_len(start_ptr,
4634 												   inputstring_len - last_len);
4635 		}
4636 		else
4637 			result_text = cstring_to_text("");
4638 	}
4639 	else
4640 	{
4641 		/* non-last field requested */
4642 		result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4643 	}
4644 
4645 	PG_RETURN_TEXT_P(result_text);
4646 }
4647 
4648 /*
4649  * Convenience function to return true when two text params are equal.
4650  */
4651 static bool
text_isequal(text * txt1,text * txt2,Oid collid)4652 text_isequal(text *txt1, text *txt2, Oid collid)
4653 {
4654 	return DatumGetBool(DirectFunctionCall2Coll(texteq,
4655 												collid,
4656 												PointerGetDatum(txt1),
4657 												PointerGetDatum(txt2)));
4658 }
4659 
4660 /*
4661  * text_to_array
4662  * parse input string and return text array of elements,
4663  * based on provided field separator
4664  */
4665 Datum
text_to_array(PG_FUNCTION_ARGS)4666 text_to_array(PG_FUNCTION_ARGS)
4667 {
4668 	return text_to_array_internal(fcinfo);
4669 }
4670 
4671 /*
4672  * text_to_array_null
4673  * parse input string and return text array of elements,
4674  * based on provided field separator and null string
4675  *
4676  * This is a separate entry point only to prevent the regression tests from
4677  * complaining about different argument sets for the same internal function.
4678  */
4679 Datum
text_to_array_null(PG_FUNCTION_ARGS)4680 text_to_array_null(PG_FUNCTION_ARGS)
4681 {
4682 	return text_to_array_internal(fcinfo);
4683 }
4684 
4685 /*
4686  * common code for text_to_array and text_to_array_null functions
4687  *
4688  * These are not strict so we have to test for null inputs explicitly.
4689  */
4690 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4691 text_to_array_internal(PG_FUNCTION_ARGS)
4692 {
4693 	text	   *inputstring;
4694 	text	   *fldsep;
4695 	text	   *null_string;
4696 	int			inputstring_len;
4697 	int			fldsep_len;
4698 	char	   *start_ptr;
4699 	text	   *result_text;
4700 	bool		is_null;
4701 	ArrayBuildState *astate = NULL;
4702 
4703 	/* when input string is NULL, then result is NULL too */
4704 	if (PG_ARGISNULL(0))
4705 		PG_RETURN_NULL();
4706 
4707 	inputstring = PG_GETARG_TEXT_PP(0);
4708 
4709 	/* fldsep can be NULL */
4710 	if (!PG_ARGISNULL(1))
4711 		fldsep = PG_GETARG_TEXT_PP(1);
4712 	else
4713 		fldsep = NULL;
4714 
4715 	/* null_string can be NULL or omitted */
4716 	if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4717 		null_string = PG_GETARG_TEXT_PP(2);
4718 	else
4719 		null_string = NULL;
4720 
4721 	if (fldsep != NULL)
4722 	{
4723 		/*
4724 		 * Normal case with non-null fldsep.  Use the text_position machinery
4725 		 * to search for occurrences of fldsep.
4726 		 */
4727 		TextPositionState state;
4728 
4729 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4730 		fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4731 
4732 		/* return empty array for empty input string */
4733 		if (inputstring_len < 1)
4734 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4735 
4736 		/*
4737 		 * empty field separator: return the input string as a one-element
4738 		 * array
4739 		 */
4740 		if (fldsep_len < 1)
4741 		{
4742 			Datum		elems[1];
4743 			bool		nulls[1];
4744 			int			dims[1];
4745 			int			lbs[1];
4746 
4747 			/* single element can be a NULL too */
4748 			is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
4749 
4750 			elems[0] = PointerGetDatum(inputstring);
4751 			nulls[0] = is_null;
4752 			dims[0] = 1;
4753 			lbs[0] = 1;
4754 			/* XXX: this hardcodes assumptions about the text type */
4755 			PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
4756 													 1, dims, lbs,
4757 													 TEXTOID, -1, false, 'i'));
4758 		}
4759 
4760 		text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4761 
4762 		start_ptr = VARDATA_ANY(inputstring);
4763 
4764 		for (;;)
4765 		{
4766 			bool		found;
4767 			char	   *end_ptr;
4768 			int			chunk_len;
4769 
4770 			CHECK_FOR_INTERRUPTS();
4771 
4772 			found = text_position_next(&state);
4773 			if (!found)
4774 			{
4775 				/* fetch last field */
4776 				chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4777 				end_ptr = NULL; /* not used, but some compilers complain */
4778 			}
4779 			else
4780 			{
4781 				/* fetch non-last field */
4782 				end_ptr = text_position_get_match_ptr(&state);
4783 				chunk_len = end_ptr - start_ptr;
4784 			}
4785 
4786 			/* must build a temp text datum to pass to accumArrayResult */
4787 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4788 			is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4789 
4790 			/* stash away this field */
4791 			astate = accumArrayResult(astate,
4792 									  PointerGetDatum(result_text),
4793 									  is_null,
4794 									  TEXTOID,
4795 									  CurrentMemoryContext);
4796 
4797 			pfree(result_text);
4798 
4799 			if (!found)
4800 				break;
4801 
4802 			start_ptr = end_ptr + fldsep_len;
4803 		}
4804 
4805 		text_position_cleanup(&state);
4806 	}
4807 	else
4808 	{
4809 		/*
4810 		 * When fldsep is NULL, each character in the inputstring becomes an
4811 		 * element in the result array.  The separator is effectively the
4812 		 * space between characters.
4813 		 */
4814 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4815 
4816 		/* return empty array for empty input string */
4817 		if (inputstring_len < 1)
4818 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4819 
4820 		start_ptr = VARDATA_ANY(inputstring);
4821 
4822 		while (inputstring_len > 0)
4823 		{
4824 			int			chunk_len = pg_mblen(start_ptr);
4825 
4826 			CHECK_FOR_INTERRUPTS();
4827 
4828 			/* must build a temp text datum to pass to accumArrayResult */
4829 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4830 			is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4831 
4832 			/* stash away this field */
4833 			astate = accumArrayResult(astate,
4834 									  PointerGetDatum(result_text),
4835 									  is_null,
4836 									  TEXTOID,
4837 									  CurrentMemoryContext);
4838 
4839 			pfree(result_text);
4840 
4841 			start_ptr += chunk_len;
4842 			inputstring_len -= chunk_len;
4843 		}
4844 	}
4845 
4846 	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4847 										  CurrentMemoryContext));
4848 }
4849 
4850 /*
4851  * array_to_text
4852  * concatenate Cstring representation of input array elements
4853  * using provided field separator
4854  */
4855 Datum
array_to_text(PG_FUNCTION_ARGS)4856 array_to_text(PG_FUNCTION_ARGS)
4857 {
4858 	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4859 	char	   *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4860 
4861 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4862 }
4863 
4864 /*
4865  * array_to_text_null
4866  * concatenate Cstring representation of input array elements
4867  * using provided field separator and null string
4868  *
4869  * This version is not strict so we have to test for null inputs explicitly.
4870  */
4871 Datum
array_to_text_null(PG_FUNCTION_ARGS)4872 array_to_text_null(PG_FUNCTION_ARGS)
4873 {
4874 	ArrayType  *v;
4875 	char	   *fldsep;
4876 	char	   *null_string;
4877 
4878 	/* returns NULL when first or second parameter is NULL */
4879 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4880 		PG_RETURN_NULL();
4881 
4882 	v = PG_GETARG_ARRAYTYPE_P(0);
4883 	fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4884 
4885 	/* NULL null string is passed through as a null pointer */
4886 	if (!PG_ARGISNULL(2))
4887 		null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4888 	else
4889 		null_string = NULL;
4890 
4891 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4892 }
4893 
4894 /*
4895  * common code for array_to_text and array_to_text_null functions
4896  */
4897 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4898 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4899 					   const char *fldsep, const char *null_string)
4900 {
4901 	text	   *result;
4902 	int			nitems,
4903 			   *dims,
4904 				ndims;
4905 	Oid			element_type;
4906 	int			typlen;
4907 	bool		typbyval;
4908 	char		typalign;
4909 	StringInfoData buf;
4910 	bool		printed = false;
4911 	char	   *p;
4912 	bits8	   *bitmap;
4913 	int			bitmask;
4914 	int			i;
4915 	ArrayMetaState *my_extra;
4916 
4917 	ndims = ARR_NDIM(v);
4918 	dims = ARR_DIMS(v);
4919 	nitems = ArrayGetNItems(ndims, dims);
4920 
4921 	/* if there are no elements, return an empty string */
4922 	if (nitems == 0)
4923 		return cstring_to_text_with_len("", 0);
4924 
4925 	element_type = ARR_ELEMTYPE(v);
4926 	initStringInfo(&buf);
4927 
4928 	/*
4929 	 * We arrange to look up info about element type, including its output
4930 	 * conversion proc, only once per series of calls, assuming the element
4931 	 * type doesn't change underneath us.
4932 	 */
4933 	my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4934 	if (my_extra == NULL)
4935 	{
4936 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4937 													  sizeof(ArrayMetaState));
4938 		my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4939 		my_extra->element_type = ~element_type;
4940 	}
4941 
4942 	if (my_extra->element_type != element_type)
4943 	{
4944 		/*
4945 		 * Get info about element type, including its output conversion proc
4946 		 */
4947 		get_type_io_data(element_type, IOFunc_output,
4948 						 &my_extra->typlen, &my_extra->typbyval,
4949 						 &my_extra->typalign, &my_extra->typdelim,
4950 						 &my_extra->typioparam, &my_extra->typiofunc);
4951 		fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4952 					  fcinfo->flinfo->fn_mcxt);
4953 		my_extra->element_type = element_type;
4954 	}
4955 	typlen = my_extra->typlen;
4956 	typbyval = my_extra->typbyval;
4957 	typalign = my_extra->typalign;
4958 
4959 	p = ARR_DATA_PTR(v);
4960 	bitmap = ARR_NULLBITMAP(v);
4961 	bitmask = 1;
4962 
4963 	for (i = 0; i < nitems; i++)
4964 	{
4965 		Datum		itemvalue;
4966 		char	   *value;
4967 
4968 		/* Get source element, checking for NULL */
4969 		if (bitmap && (*bitmap & bitmask) == 0)
4970 		{
4971 			/* if null_string is NULL, we just ignore null elements */
4972 			if (null_string != NULL)
4973 			{
4974 				if (printed)
4975 					appendStringInfo(&buf, "%s%s", fldsep, null_string);
4976 				else
4977 					appendStringInfoString(&buf, null_string);
4978 				printed = true;
4979 			}
4980 		}
4981 		else
4982 		{
4983 			itemvalue = fetch_att(p, typbyval, typlen);
4984 
4985 			value = OutputFunctionCall(&my_extra->proc, itemvalue);
4986 
4987 			if (printed)
4988 				appendStringInfo(&buf, "%s%s", fldsep, value);
4989 			else
4990 				appendStringInfoString(&buf, value);
4991 			printed = true;
4992 
4993 			p = att_addlength_pointer(p, typlen, p);
4994 			p = (char *) att_align_nominal(p, typalign);
4995 		}
4996 
4997 		/* advance bitmap pointer if any */
4998 		if (bitmap)
4999 		{
5000 			bitmask <<= 1;
5001 			if (bitmask == 0x100)
5002 			{
5003 				bitmap++;
5004 				bitmask = 1;
5005 			}
5006 		}
5007 	}
5008 
5009 	result = cstring_to_text_with_len(buf.data, buf.len);
5010 	pfree(buf.data);
5011 
5012 	return result;
5013 }
5014 
5015 #define HEXBASE 16
5016 /*
5017  * Convert an int32 to a string containing a base 16 (hex) representation of
5018  * the number.
5019  */
5020 Datum
to_hex32(PG_FUNCTION_ARGS)5021 to_hex32(PG_FUNCTION_ARGS)
5022 {
5023 	uint32		value = (uint32) PG_GETARG_INT32(0);
5024 	char	   *ptr;
5025 	const char *digits = "0123456789abcdef";
5026 	char		buf[32];		/* bigger than needed, but reasonable */
5027 
5028 	ptr = buf + sizeof(buf) - 1;
5029 	*ptr = '\0';
5030 
5031 	do
5032 	{
5033 		*--ptr = digits[value % HEXBASE];
5034 		value /= HEXBASE;
5035 	} while (ptr > buf && value);
5036 
5037 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
5038 }
5039 
5040 /*
5041  * Convert an int64 to a string containing a base 16 (hex) representation of
5042  * the number.
5043  */
5044 Datum
to_hex64(PG_FUNCTION_ARGS)5045 to_hex64(PG_FUNCTION_ARGS)
5046 {
5047 	uint64		value = (uint64) PG_GETARG_INT64(0);
5048 	char	   *ptr;
5049 	const char *digits = "0123456789abcdef";
5050 	char		buf[32];		/* bigger than needed, but reasonable */
5051 
5052 	ptr = buf + sizeof(buf) - 1;
5053 	*ptr = '\0';
5054 
5055 	do
5056 	{
5057 		*--ptr = digits[value % HEXBASE];
5058 		value /= HEXBASE;
5059 	} while (ptr > buf && value);
5060 
5061 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
5062 }
5063 
5064 /*
5065  * Return the size of a datum, possibly compressed
5066  *
5067  * Works on any data type
5068  */
5069 Datum
pg_column_size(PG_FUNCTION_ARGS)5070 pg_column_size(PG_FUNCTION_ARGS)
5071 {
5072 	Datum		value = PG_GETARG_DATUM(0);
5073 	int32		result;
5074 	int			typlen;
5075 
5076 	/* On first call, get the input type's typlen, and save at *fn_extra */
5077 	if (fcinfo->flinfo->fn_extra == NULL)
5078 	{
5079 		/* Lookup the datatype of the supplied argument */
5080 		Oid			argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5081 
5082 		typlen = get_typlen(argtypeid);
5083 		if (typlen == 0)		/* should not happen */
5084 			elog(ERROR, "cache lookup failed for type %u", argtypeid);
5085 
5086 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5087 													  sizeof(int));
5088 		*((int *) fcinfo->flinfo->fn_extra) = typlen;
5089 	}
5090 	else
5091 		typlen = *((int *) fcinfo->flinfo->fn_extra);
5092 
5093 	if (typlen == -1)
5094 	{
5095 		/* varlena type, possibly toasted */
5096 		result = toast_datum_size(value);
5097 	}
5098 	else if (typlen == -2)
5099 	{
5100 		/* cstring */
5101 		result = strlen(DatumGetCString(value)) + 1;
5102 	}
5103 	else
5104 	{
5105 		/* ordinary fixed-width type */
5106 		result = typlen;
5107 	}
5108 
5109 	PG_RETURN_INT32(result);
5110 }
5111 
5112 /*
5113  * string_agg - Concatenates values and returns string.
5114  *
5115  * Syntax: string_agg(value text, delimiter text) RETURNS text
5116  *
5117  * Note: Any NULL values are ignored. The first-call delimiter isn't
5118  * actually used at all, and on subsequent calls the delimiter precedes
5119  * the associated value.
5120  */
5121 
5122 /* subroutine to initialize state */
5123 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)5124 makeStringAggState(FunctionCallInfo fcinfo)
5125 {
5126 	StringInfo	state;
5127 	MemoryContext aggcontext;
5128 	MemoryContext oldcontext;
5129 
5130 	if (!AggCheckCallContext(fcinfo, &aggcontext))
5131 	{
5132 		/* cannot be called directly because of internal-type argument */
5133 		elog(ERROR, "string_agg_transfn called in non-aggregate context");
5134 	}
5135 
5136 	/*
5137 	 * Create state in aggregate context.  It'll stay there across subsequent
5138 	 * calls.
5139 	 */
5140 	oldcontext = MemoryContextSwitchTo(aggcontext);
5141 	state = makeStringInfo();
5142 	MemoryContextSwitchTo(oldcontext);
5143 
5144 	return state;
5145 }
5146 
5147 Datum
string_agg_transfn(PG_FUNCTION_ARGS)5148 string_agg_transfn(PG_FUNCTION_ARGS)
5149 {
5150 	StringInfo	state;
5151 
5152 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5153 
5154 	/* Append the value unless null. */
5155 	if (!PG_ARGISNULL(1))
5156 	{
5157 		/* On the first time through, we ignore the delimiter. */
5158 		if (state == NULL)
5159 			state = makeStringAggState(fcinfo);
5160 		else if (!PG_ARGISNULL(2))
5161 			appendStringInfoText(state, PG_GETARG_TEXT_PP(2));	/* delimiter */
5162 
5163 		appendStringInfoText(state, PG_GETARG_TEXT_PP(1));	/* value */
5164 	}
5165 
5166 	/*
5167 	 * The transition type for string_agg() is declared to be "internal",
5168 	 * which is a pass-by-value type the same size as a pointer.
5169 	 */
5170 	PG_RETURN_POINTER(state);
5171 }
5172 
5173 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)5174 string_agg_finalfn(PG_FUNCTION_ARGS)
5175 {
5176 	StringInfo	state;
5177 
5178 	/* cannot be called directly because of internal-type argument */
5179 	Assert(AggCheckCallContext(fcinfo, NULL));
5180 
5181 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5182 
5183 	if (state != NULL)
5184 		PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5185 	else
5186 		PG_RETURN_NULL();
5187 }
5188 
5189 /*
5190  * Prepare cache with fmgr info for the output functions of the datatypes of
5191  * the arguments of a concat-like function, beginning with argument "argidx".
5192  * (Arguments before that will have corresponding slots in the resulting
5193  * FmgrInfo array, but we don't fill those slots.)
5194  */
5195 static FmgrInfo *
build_concat_foutcache(FunctionCallInfo fcinfo,int argidx)5196 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5197 {
5198 	FmgrInfo   *foutcache;
5199 	int			i;
5200 
5201 	/* We keep the info in fn_mcxt so it survives across calls */
5202 	foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5203 												PG_NARGS() * sizeof(FmgrInfo));
5204 
5205 	for (i = argidx; i < PG_NARGS(); i++)
5206 	{
5207 		Oid			valtype;
5208 		Oid			typOutput;
5209 		bool		typIsVarlena;
5210 
5211 		valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5212 		if (!OidIsValid(valtype))
5213 			elog(ERROR, "could not determine data type of concat() input");
5214 
5215 		getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5216 		fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5217 	}
5218 
5219 	fcinfo->flinfo->fn_extra = foutcache;
5220 
5221 	return foutcache;
5222 }
5223 
5224 /*
5225  * Implementation of both concat() and concat_ws().
5226  *
5227  * sepstr is the separator string to place between values.
5228  * argidx identifies the first argument to concatenate (counting from zero);
5229  * note that this must be constant across any one series of calls.
5230  *
5231  * Returns NULL if result should be NULL, else text value.
5232  */
5233 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)5234 concat_internal(const char *sepstr, int argidx,
5235 				FunctionCallInfo fcinfo)
5236 {
5237 	text	   *result;
5238 	StringInfoData str;
5239 	FmgrInfo   *foutcache;
5240 	bool		first_arg = true;
5241 	int			i;
5242 
5243 	/*
5244 	 * concat(VARIADIC some-array) is essentially equivalent to
5245 	 * array_to_text(), ie concat the array elements with the given separator.
5246 	 * So we just pass the case off to that code.
5247 	 */
5248 	if (get_fn_expr_variadic(fcinfo->flinfo))
5249 	{
5250 		ArrayType  *arr;
5251 
5252 		/* Should have just the one argument */
5253 		Assert(argidx == PG_NARGS() - 1);
5254 
5255 		/* concat(VARIADIC NULL) is defined as NULL */
5256 		if (PG_ARGISNULL(argidx))
5257 			return NULL;
5258 
5259 		/*
5260 		 * Non-null argument had better be an array.  We assume that any call
5261 		 * context that could let get_fn_expr_variadic return true will have
5262 		 * checked that a VARIADIC-labeled parameter actually is an array.  So
5263 		 * it should be okay to just Assert that it's an array rather than
5264 		 * doing a full-fledged error check.
5265 		 */
5266 		Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5267 
5268 		/* OK, safe to fetch the array value */
5269 		arr = PG_GETARG_ARRAYTYPE_P(argidx);
5270 
5271 		/*
5272 		 * And serialize the array.  We tell array_to_text to ignore null
5273 		 * elements, which matches the behavior of the loop below.
5274 		 */
5275 		return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5276 	}
5277 
5278 	/* Normal case without explicit VARIADIC marker */
5279 	initStringInfo(&str);
5280 
5281 	/* Get output function info, building it if first time through */
5282 	foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5283 	if (foutcache == NULL)
5284 		foutcache = build_concat_foutcache(fcinfo, argidx);
5285 
5286 	for (i = argidx; i < PG_NARGS(); i++)
5287 	{
5288 		if (!PG_ARGISNULL(i))
5289 		{
5290 			Datum		value = PG_GETARG_DATUM(i);
5291 
5292 			/* add separator if appropriate */
5293 			if (first_arg)
5294 				first_arg = false;
5295 			else
5296 				appendStringInfoString(&str, sepstr);
5297 
5298 			/* call the appropriate type output function, append the result */
5299 			appendStringInfoString(&str,
5300 								   OutputFunctionCall(&foutcache[i], value));
5301 		}
5302 	}
5303 
5304 	result = cstring_to_text_with_len(str.data, str.len);
5305 	pfree(str.data);
5306 
5307 	return result;
5308 }
5309 
5310 /*
5311  * Concatenate all arguments. NULL arguments are ignored.
5312  */
5313 Datum
text_concat(PG_FUNCTION_ARGS)5314 text_concat(PG_FUNCTION_ARGS)
5315 {
5316 	text	   *result;
5317 
5318 	result = concat_internal("", 0, fcinfo);
5319 	if (result == NULL)
5320 		PG_RETURN_NULL();
5321 	PG_RETURN_TEXT_P(result);
5322 }
5323 
5324 /*
5325  * Concatenate all but first argument value with separators. The first
5326  * parameter is used as the separator. NULL arguments are ignored.
5327  */
5328 Datum
text_concat_ws(PG_FUNCTION_ARGS)5329 text_concat_ws(PG_FUNCTION_ARGS)
5330 {
5331 	char	   *sep;
5332 	text	   *result;
5333 
5334 	/* return NULL when separator is NULL */
5335 	if (PG_ARGISNULL(0))
5336 		PG_RETURN_NULL();
5337 	sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5338 
5339 	result = concat_internal(sep, 1, fcinfo);
5340 	if (result == NULL)
5341 		PG_RETURN_NULL();
5342 	PG_RETURN_TEXT_P(result);
5343 }
5344 
5345 /*
5346  * Return first n characters in the string. When n is negative,
5347  * return all but last |n| characters.
5348  */
5349 Datum
text_left(PG_FUNCTION_ARGS)5350 text_left(PG_FUNCTION_ARGS)
5351 {
5352 	int			n = PG_GETARG_INT32(1);
5353 
5354 	if (n < 0)
5355 	{
5356 		text	   *str = PG_GETARG_TEXT_PP(0);
5357 		const char *p = VARDATA_ANY(str);
5358 		int			len = VARSIZE_ANY_EXHDR(str);
5359 		int			rlen;
5360 
5361 		n = pg_mbstrlen_with_len(p, len) + n;
5362 		rlen = pg_mbcharcliplen(p, len, n);
5363 		PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5364 	}
5365 	else
5366 		PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5367 }
5368 
5369 /*
5370  * Return last n characters in the string. When n is negative,
5371  * return all but first |n| characters.
5372  */
5373 Datum
text_right(PG_FUNCTION_ARGS)5374 text_right(PG_FUNCTION_ARGS)
5375 {
5376 	text	   *str = PG_GETARG_TEXT_PP(0);
5377 	const char *p = VARDATA_ANY(str);
5378 	int			len = VARSIZE_ANY_EXHDR(str);
5379 	int			n = PG_GETARG_INT32(1);
5380 	int			off;
5381 
5382 	if (n < 0)
5383 		n = -n;
5384 	else
5385 		n = pg_mbstrlen_with_len(p, len) - n;
5386 	off = pg_mbcharcliplen(p, len, n);
5387 
5388 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5389 }
5390 
5391 /*
5392  * Return reversed string
5393  */
5394 Datum
text_reverse(PG_FUNCTION_ARGS)5395 text_reverse(PG_FUNCTION_ARGS)
5396 {
5397 	text	   *str = PG_GETARG_TEXT_PP(0);
5398 	const char *p = VARDATA_ANY(str);
5399 	int			len = VARSIZE_ANY_EXHDR(str);
5400 	const char *endp = p + len;
5401 	text	   *result;
5402 	char	   *dst;
5403 
5404 	result = palloc(len + VARHDRSZ);
5405 	dst = (char *) VARDATA(result) + len;
5406 	SET_VARSIZE(result, len + VARHDRSZ);
5407 
5408 	if (pg_database_encoding_max_length() > 1)
5409 	{
5410 		/* multibyte version */
5411 		while (p < endp)
5412 		{
5413 			int			sz;
5414 
5415 			sz = pg_mblen(p);
5416 			dst -= sz;
5417 			memcpy(dst, p, sz);
5418 			p += sz;
5419 		}
5420 	}
5421 	else
5422 	{
5423 		/* single byte version */
5424 		while (p < endp)
5425 			*(--dst) = *p++;
5426 	}
5427 
5428 	PG_RETURN_TEXT_P(result);
5429 }
5430 
5431 
/*
 * Support macros for text_format()
 */

/* Bit in the "flags" bitmask: set when the '-' flag is present */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" forward by one character while parsing a format specifier.
 * If that brings it to (or past) "end_ptr", the specifier has been cut off
 * by the end of the format string, so report an error.  Note that "ptr" is
 * modified in place (it is pre-incremented), so it must be an lvalue.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5445 
/*
 * text_format - returns a formatted string
 *
 * Argument 0 is the format string; a NULL format string yields a NULL
 * result.  The remaining arguments are the values to substitute, passed
 * either individually or, when the call is marked VARIADIC, as one array
 * argument whose elements are expanded here (a NULL array is treated as
 * having no elements).
 *
 * Text outside conversion specifiers is copied to the output verbatim and
 * "%%" produces a single "%".  Every other specifier is parsed by
 * text_format_parse_format() and must end in one of the type characters
 * 's', 'I' or 'L'; the value is then formatted by
 * text_format_string_conversion().  A width given as '*' is taken from an
 * argument, either the one at the stated position or the next one.
 *
 * Errors are raised for an unterminated or unrecognized specifier, and when
 * a specifier refers to an argument that was not supplied.
 */
Datum
text_format(PG_FUNCTION_ARGS)
{
	text	   *fmt;
	StringInfoData str;
	const char *cp;
	const char *start_ptr;
	const char *end_ptr;
	text	   *result;
	int			arg;
	bool		funcvariadic;
	int			nargs;
	Datum	   *elements = NULL;
	bool	   *nulls = NULL;
	Oid			element_type = InvalidOid;
	Oid			prev_type = InvalidOid;
	Oid			prev_width_type = InvalidOid;
	FmgrInfo	typoutputfinfo;
	FmgrInfo	typoutputinfo_width;

	/* When format string is null, immediately return null */
	if (PG_ARGISNULL(0))
		PG_RETURN_NULL();

	/* If argument is marked VARIADIC, expand array into elements */
	if (get_fn_expr_variadic(fcinfo->flinfo))
	{
		ArrayType  *arr;
		int16		elmlen;
		bool		elmbyval;
		char		elmalign;
		int			nitems;

		/* Should have just the one argument */
		Assert(PG_NARGS() == 2);

		/* If argument is NULL, we treat it as zero-length array */
		if (PG_ARGISNULL(1))
			nitems = 0;
		else
		{
			/*
			 * Non-null argument had better be an array.  We assume that any
			 * call context that could let get_fn_expr_variadic return true
			 * will have checked that a VARIADIC-labeled parameter actually is
			 * an array.  So it should be okay to just Assert that it's an
			 * array rather than doing a full-fledged error check.
			 */
			Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));

			/* OK, safe to fetch the array value */
			arr = PG_GETARG_ARRAYTYPE_P(1);

			/* Get info about array element type */
			element_type = ARR_ELEMTYPE(arr);
			get_typlenbyvalalign(element_type,
								 &elmlen, &elmbyval, &elmalign);

			/* Extract all array elements */
			deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
							  &elements, &nulls, &nitems);
		}

		/* Count the format string too, so "arg" indexes work the same way */
		nargs = nitems + 1;
		funcvariadic = true;
	}
	else
	{
		/* Non-variadic case, we'll process the arguments individually */
		nargs = PG_NARGS();
		funcvariadic = false;
	}

	/* Setup for main loop. */
	fmt = PG_GETARG_TEXT_PP(0);
	start_ptr = VARDATA_ANY(fmt);
	end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
	initStringInfo(&str);
	arg = 1;					/* next argument position to print */

	/* Scan format string, looking for conversion specifiers. */
	for (cp = start_ptr; cp < end_ptr; cp++)
	{
		int			argpos;
		int			widthpos;
		int			flags;
		int			width;
		Datum		value;
		bool		isNull;
		Oid			typid;

		/*
		 * If it's not the start of a conversion specifier, just copy it to
		 * the output buffer.
		 */
		if (*cp != '%')
		{
			appendStringInfoCharMacro(&str, *cp);
			continue;
		}

		/* Step past the '%'; errors out if the format string ends here */
		ADVANCE_PARSE_POINTER(cp, end_ptr);

		/* Easy case: %% outputs a single % */
		if (*cp == '%')
		{
			appendStringInfoCharMacro(&str, *cp);
			continue;
		}

		/* Parse the optional portions of the format specifier */
		cp = text_format_parse_format(cp, end_ptr,
									  &argpos, &widthpos,
									  &flags, &width);

		/*
		 * Next we should see the main conversion specifier.  Whether or not
		 * an argument position was present, it's known that at least one
		 * character remains in the string at this point.  Experience suggests
		 * that it's worth checking that that character is one of the expected
		 * ones before we try to fetch arguments, so as to produce the least
		 * confusing response to a mis-formatted specifier.
		 */
		if (strchr("sIL", *cp) == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized format() type specifier \"%c\"",
							*cp),
					 errhint("For a single \"%%\" use \"%%%%\".")));

		/* If indirect width was specified, get its value */
		if (widthpos >= 0)
		{
			/* Collect the specified or next argument position */
			if (widthpos > 0)
				arg = widthpos;
			if (arg >= nargs)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("too few arguments for format()")));

			/* Get the value and type of the selected argument */
			if (!funcvariadic)
			{
				value = PG_GETARG_DATUM(arg);
				isNull = PG_ARGISNULL(arg);
				typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
			}
			else
			{
				/* array elements are zero-based, "arg" counts from 1 */
				value = elements[arg - 1];
				isNull = nulls[arg - 1];
				typid = element_type;
			}
			if (!OidIsValid(typid))
				elog(ERROR, "could not determine data type of format() input");

			/* The value to print defaults to the argument after the width */
			arg++;

			/* We can treat NULL width the same as zero */
			if (isNull)
				width = 0;
			else if (typid == INT4OID)
				width = DatumGetInt32(value);
			else if (typid == INT2OID)
				width = DatumGetInt16(value);
			else
			{
				/*
				 * For less-usual datatypes, convert to text then to int.
				 *
				 * Beware: within this block the local "str" below hides the
				 * function-level output buffer of the same name.
				 */
				char	   *str;

				/* Reuse the output function if same type as last width */
				if (typid != prev_width_type)
				{
					Oid			typoutputfunc;
					bool		typIsVarlena;

					getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
					fmgr_info(typoutputfunc, &typoutputinfo_width);
					prev_width_type = typid;
				}

				str = OutputFunctionCall(&typoutputinfo_width, value);

				/* pg_strtoint32 will complain about bad data or overflow */
				width = pg_strtoint32(str);

				pfree(str);
			}
		}

		/* Collect the specified or next argument position */
		if (argpos > 0)
			arg = argpos;
		if (arg >= nargs)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("too few arguments for format()")));

		/* Get the value and type of the selected argument */
		if (!funcvariadic)
		{
			value = PG_GETARG_DATUM(arg);
			isNull = PG_ARGISNULL(arg);
			typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
		}
		else
		{
			/* array elements are zero-based, "arg" counts from 1 */
			value = elements[arg - 1];
			isNull = nulls[arg - 1];
			typid = element_type;
		}
		if (!OidIsValid(typid))
			elog(ERROR, "could not determine data type of format() input");

		/* The next specifier without a position uses the following argument */
		arg++;

		/*
		 * Get the appropriate typOutput function, reusing previous one if
		 * same type as previous argument.  That's particularly useful in the
		 * variadic-array case, but often saves work even for ordinary calls.
		 */
		if (typid != prev_type)
		{
			Oid			typoutputfunc;
			bool		typIsVarlena;

			getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
			fmgr_info(typoutputfunc, &typoutputfinfo);
			prev_type = typid;
		}

		/*
		 * And now we can format the value.
		 */
		switch (*cp)
		{
			case 's':
			case 'I':
			case 'L':
				text_format_string_conversion(&str, *cp, &typoutputfinfo,
											  value, isNull,
											  flags, width);
				break;
			default:
				/* should not get here, because of previous check */
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("unrecognized format() type specifier \"%c\"",
								*cp),
						 errhint("For a single \"%%\" use \"%%%%\".")));
				break;
		}
	}

	/* Don't need deconstruct_array results anymore. */
	if (elements != NULL)
		pfree(elements);
	if (nulls != NULL)
		pfree(nulls);

	/* Generate results. */
	result = cstring_to_text_with_len(str.data, str.len);
	pfree(str.data);

	PG_RETURN_TEXT_P(result);
}
5715 
5716 /*
5717  * Parse contiguous digits as a decimal number.
5718  *
5719  * Returns true if some digits could be parsed.
5720  * The value is returned into *value, and *ptr is advanced to the next
5721  * character to be parsed.
5722  *
5723  * Note parsing invariant: at least one character is known available before
5724  * string end (end_ptr) at entry, and this is still true at exit.
5725  */
5726 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5727 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5728 {
5729 	bool		found = false;
5730 	const char *cp = *ptr;
5731 	int			val = 0;
5732 
5733 	while (*cp >= '0' && *cp <= '9')
5734 	{
5735 		int8		digit = (*cp - '0');
5736 
5737 		if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5738 			unlikely(pg_add_s32_overflow(val, digit, &val)))
5739 			ereport(ERROR,
5740 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5741 					 errmsg("number is out of range")));
5742 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5743 		found = true;
5744 	}
5745 
5746 	*ptr = cp;
5747 	*value = val;
5748 
5749 	return found;
5750 }
5751 
5752 /*
5753  * Parse a format specifier (generally following the SUS printf spec).
5754  *
5755  * We have already advanced over the initial '%', and we are looking for
5756  * [argpos][flags][width]type (but the type character is not consumed here).
5757  *
5758  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5759  * Output parameters:
5760  *	argpos: argument position for value to be printed.  -1 means unspecified.
5761  *	widthpos: argument position for width.  Zero means the argument position
5762  *			was unspecified (ie, take the next arg) and -1 means no width
5763  *			argument (width was omitted or specified as a constant).
5764  *	flags: bitmask of flags.
5765  *	width: directly-specified width value.  Zero means the width was omitted
5766  *			(note it's not necessary to distinguish this case from an explicit
5767  *			zero width value).
5768  *
5769  * The function result is the next character position to be parsed, ie, the
5770  * location where the type character is/should be.
5771  *
5772  * Note parsing invariant: at least one character is known available before
5773  * string end (end_ptr) at entry, and this is still true at exit.
5774  */
5775 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5776 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5777 						 int *argpos, int *widthpos,
5778 						 int *flags, int *width)
5779 {
5780 	const char *cp = start_ptr;
5781 	int			n;
5782 
5783 	/* set defaults for output parameters */
5784 	*argpos = -1;
5785 	*widthpos = -1;
5786 	*flags = 0;
5787 	*width = 0;
5788 
5789 	/* try to identify first number */
5790 	if (text_format_parse_digits(&cp, end_ptr, &n))
5791 	{
5792 		if (*cp != '$')
5793 		{
5794 			/* Must be just a width and a type, so we're done */
5795 			*width = n;
5796 			return cp;
5797 		}
5798 		/* The number was argument position */
5799 		*argpos = n;
5800 		/* Explicit 0 for argument index is immediately refused */
5801 		if (n == 0)
5802 			ereport(ERROR,
5803 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5804 					 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5805 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5806 	}
5807 
5808 	/* Handle flags (only minus is supported now) */
5809 	while (*cp == '-')
5810 	{
5811 		*flags |= TEXT_FORMAT_FLAG_MINUS;
5812 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5813 	}
5814 
5815 	if (*cp == '*')
5816 	{
5817 		/* Handle indirect width */
5818 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5819 		if (text_format_parse_digits(&cp, end_ptr, &n))
5820 		{
5821 			/* number in this position must be closed by $ */
5822 			if (*cp != '$')
5823 				ereport(ERROR,
5824 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5825 						 errmsg("width argument position must be ended by \"$\"")));
5826 			/* The number was width argument position */
5827 			*widthpos = n;
5828 			/* Explicit 0 for argument index is immediately refused */
5829 			if (n == 0)
5830 				ereport(ERROR,
5831 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5832 						 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5833 			ADVANCE_PARSE_POINTER(cp, end_ptr);
5834 		}
5835 		else
5836 			*widthpos = 0;		/* width's argument position is unspecified */
5837 	}
5838 	else
5839 	{
5840 		/* Check for direct width specification */
5841 		if (text_format_parse_digits(&cp, end_ptr, &n))
5842 			*width = n;
5843 	}
5844 
5845 	/* cp should now be pointing at type character */
5846 	return cp;
5847 }
5848 
5849 /*
5850  * Format a %s, %I, or %L conversion
5851  */
5852 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5853 text_format_string_conversion(StringInfo buf, char conversion,
5854 							  FmgrInfo *typOutputInfo,
5855 							  Datum value, bool isNull,
5856 							  int flags, int width)
5857 {
5858 	char	   *str;
5859 
5860 	/* Handle NULL arguments before trying to stringify the value. */
5861 	if (isNull)
5862 	{
5863 		if (conversion == 's')
5864 			text_format_append_string(buf, "", flags, width);
5865 		else if (conversion == 'L')
5866 			text_format_append_string(buf, "NULL", flags, width);
5867 		else if (conversion == 'I')
5868 			ereport(ERROR,
5869 					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5870 					 errmsg("null values cannot be formatted as an SQL identifier")));
5871 		return;
5872 	}
5873 
5874 	/* Stringify. */
5875 	str = OutputFunctionCall(typOutputInfo, value);
5876 
5877 	/* Escape. */
5878 	if (conversion == 'I')
5879 	{
5880 		/* quote_identifier may or may not allocate a new string. */
5881 		text_format_append_string(buf, quote_identifier(str), flags, width);
5882 	}
5883 	else if (conversion == 'L')
5884 	{
5885 		char	   *qstr = quote_literal_cstr(str);
5886 
5887 		text_format_append_string(buf, qstr, flags, width);
5888 		/* quote_literal_cstr() always allocates a new string */
5889 		pfree(qstr);
5890 	}
5891 	else
5892 		text_format_append_string(buf, str, flags, width);
5893 
5894 	/* Cleanup. */
5895 	pfree(str);
5896 }
5897 
5898 /*
5899  * Append str to buf, padding as directed by flags/width
5900  */
5901 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5902 text_format_append_string(StringInfo buf, const char *str,
5903 						  int flags, int width)
5904 {
5905 	bool		align_to_left = false;
5906 	int			len;
5907 
5908 	/* fast path for typical easy case */
5909 	if (width == 0)
5910 	{
5911 		appendStringInfoString(buf, str);
5912 		return;
5913 	}
5914 
5915 	if (width < 0)
5916 	{
5917 		/* Negative width: implicit '-' flag, then take absolute value */
5918 		align_to_left = true;
5919 		/* -INT_MIN is undefined */
5920 		if (width <= INT_MIN)
5921 			ereport(ERROR,
5922 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5923 					 errmsg("number is out of range")));
5924 		width = -width;
5925 	}
5926 	else if (flags & TEXT_FORMAT_FLAG_MINUS)
5927 		align_to_left = true;
5928 
5929 	len = pg_mbstrlen(str);
5930 	if (align_to_left)
5931 	{
5932 		/* left justify */
5933 		appendStringInfoString(buf, str);
5934 		if (len < width)
5935 			appendStringInfoSpaces(buf, width - len);
5936 	}
5937 	else
5938 	{
5939 		/* right justify */
5940 		if (len < width)
5941 			appendStringInfoSpaces(buf, width - len);
5942 		appendStringInfoString(buf, str);
5943 	}
5944 }
5945 
5946 /*
5947  * text_format_nv - nonvariadic wrapper for text_format function.
5948  *
5949  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5950  * which checks that all built-in functions that share the implementing C
5951  * function take the same number of arguments.
5952  */
5953 Datum
text_format_nv(PG_FUNCTION_ARGS)5954 text_format_nv(PG_FUNCTION_ARGS)
5955 {
5956 	return text_format(fcinfo);
5957 }
5958 
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first "len" bytes of s1 and s2 are identical.  The bytes are compared
 * starting from the last one.  Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			i;

	for (i = len - 1; i >= 0; i--)
	{
		if (s1[i] != s2[i])
			return false;
	}

	return true;
}
5974 
5975 /* Expand each Levenshtein distance variant */
5976 #include "levenshtein.c"
5977 #define LEVENSHTEIN_LESS_EQUAL
5978 #include "levenshtein.c"
5979