1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  *	  Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/detoast.h"
21 #include "catalog/pg_collation.h"
22 #include "catalog/pg_type.h"
23 #include "common/hashfn.h"
24 #include "common/int.h"
25 #include "common/unicode_norm.h"
26 #include "lib/hyperloglog.h"
27 #include "libpq/pqformat.h"
28 #include "miscadmin.h"
29 #include "parser/scansup.h"
30 #include "port/pg_bswap.h"
31 #include "regex/regex.h"
32 #include "utils/builtins.h"
33 #include "utils/bytea.h"
34 #include "utils/lsyscache.h"
35 #include "utils/memutils.h"
36 #include "utils/pg_locale.h"
37 #include "utils/sortsupport.h"
38 #include "utils/varlena.h"
39 
40 
41 /* GUC variable */
42 int			bytea_output = BYTEA_OUTPUT_HEX;
43 
44 typedef struct varlena unknown;
45 typedef struct varlena VarString;
46 
47 /*
48  * State for text_position_* functions.
49  */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */

	/*
	 * NOTE(review): this flag is not documented in the visible source.
	 * Judging by its name it presumably records whether one character's
	 * byte sequence can occur inside another character in the current
	 * encoding -- confirm against text_position_setup().
	 */
	bool		is_multibyte_char_in_char;

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;			/* (len1 for str1, len2 for str2) */

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
75 
/*
 * Per-sort working state for the varstr* sort support comparators declared
 * below (text, bpchar, bytea and name sorts, per the typid field).
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* presumably allocated size of buf1 -- TODO
								 * confirm */
	int			buflen2;		/* presumably allocated size of buf2 -- TODO
								 * confirm */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* presumably true when C-locale comparison
								 * rules apply -- TODO confirm */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* locale handle; NOTE(review): its use is
								 * not visible in this part of the file */
} VarStringSortSupport;
94 
95 /*
96  * This should be large enough that most strings will fit, but small enough
97  * that we feel comfortable putting it on the stack
98  */
99 #define TEXTBUFLEN		1024
100 
/*
 * Datum accessors for the "unknown" pseudo-type.  The plain form detoasts
 * the datum with PG_DETOAST_DATUM; the Copy form uses PG_DETOAST_DATUM_COPY
 * instead.  Both cast the result to "unknown *".
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
/* Fetch function argument number n as an "unknown *" */
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Datum accessors yielding a generic VarString pointer; the PP form goes
 * through PG_DETOAST_DATUM_PACKED rather than PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
109 
110 static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
111 static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
112 static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
113 static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
114 static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
115 static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
116 static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
117 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
118 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
119 static int32 text_length(Datum str);
120 static text *text_catenate(text *t1, text *t2);
121 static text *text_substring(Datum str,
122 							int32 start,
123 							int32 length,
124 							bool length_not_specified);
125 static text *text_overlay(text *t1, text *t2, int sp, int sl);
126 static int	text_position(text *t1, text *t2, Oid collid);
127 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
128 static bool text_position_next(TextPositionState *state);
129 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
130 static char *text_position_get_match_ptr(TextPositionState *state);
131 static int	text_position_get_match_pos(TextPositionState *state);
132 static void text_position_cleanup(TextPositionState *state);
133 static void check_collation_set(Oid collid);
134 static int	text_cmp(text *arg1, text *arg2, Oid collid);
135 static bytea *bytea_catenate(bytea *t1, bytea *t2);
136 static bytea *bytea_substring(Datum str,
137 							  int S,
138 							  int L,
139 							  bool length_not_specified);
140 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
141 static void appendStringInfoText(StringInfo str, const text *t);
142 static Datum text_to_array_internal(PG_FUNCTION_ARGS);
143 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
144 									const char *fldsep, const char *null_string);
145 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
146 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
147 									 int *value);
148 static const char *text_format_parse_format(const char *start_ptr,
149 											const char *end_ptr,
150 											int *argpos, int *widthpos,
151 											int *flags, int *width);
152 static void text_format_string_conversion(StringInfo buf, char conversion,
153 										  FmgrInfo *typOutputInfo,
154 										  Datum value, bool isNull,
155 										  int flags, int width);
156 static void text_format_append_string(StringInfo buf, const char *str,
157 									  int flags, int width);
158 
159 
160 /*****************************************************************************
161  *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
162  *****************************************************************************/
163 
164 /*
165  * cstring_to_text
166  *
167  * Create a text value from a null-terminated C string.
168  *
169  * The new text value is freshly palloc'd with a full-size VARHDR.
170  */
171 text *
cstring_to_text(const char * s)172 cstring_to_text(const char *s)
173 {
174 	return cstring_to_text_with_len(s, strlen(s));
175 }
176 
177 /*
178  * cstring_to_text_with_len
179  *
180  * Same as cstring_to_text except the caller specifies the string length;
 * the string need not be null-terminated.
182  */
183 text *
cstring_to_text_with_len(const char * s,int len)184 cstring_to_text_with_len(const char *s, int len)
185 {
186 	text	   *result = (text *) palloc(len + VARHDRSZ);
187 
188 	SET_VARSIZE(result, len + VARHDRSZ);
189 	memcpy(VARDATA(result), s, len);
190 
191 	return result;
192 }
193 
194 /*
195  * text_to_cstring
196  *
197  * Create a palloc'd, null-terminated C string from a text value.
198  *
199  * We support being passed a compressed or toasted text value.
200  * This is a bit bogus since such values shouldn't really be referred to as
201  * "text *", but it seems useful for robustness.  If we didn't handle that
202  * case here, we'd need another routine that did, anyway.
203  */
204 char *
text_to_cstring(const text * t)205 text_to_cstring(const text *t)
206 {
207 	/* must cast away the const, unfortunately */
208 	text	   *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
209 	int			len = VARSIZE_ANY_EXHDR(tunpacked);
210 	char	   *result;
211 
212 	result = (char *) palloc(len + 1);
213 	memcpy(result, VARDATA_ANY(tunpacked), len);
214 	result[len] = '\0';
215 
216 	if (tunpacked != t)
217 		pfree(tunpacked);
218 
219 	return result;
220 }
221 
222 /*
223  * text_to_cstring_buffer
224  *
225  * Copy a text value into a caller-supplied buffer of size dst_len.
226  *
227  * The text string is truncated if necessary to fit.  The result is
228  * guaranteed null-terminated (unless dst_len == 0).
229  *
230  * We support being passed a compressed or toasted text value.
231  * This is a bit bogus since such values shouldn't really be referred to as
232  * "text *", but it seems useful for robustness.  If we didn't handle that
233  * case here, we'd need another routine that did, anyway.
234  */
235 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)236 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
237 {
238 	/* must cast away the const, unfortunately */
239 	text	   *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
240 	size_t		src_len = VARSIZE_ANY_EXHDR(srcunpacked);
241 
242 	if (dst_len > 0)
243 	{
244 		dst_len--;
245 		if (dst_len >= src_len)
246 			dst_len = src_len;
247 		else					/* ensure truncation is encoding-safe */
248 			dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
249 		memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
250 		dst[dst_len] = '\0';
251 	}
252 
253 	if (srcunpacked != src)
254 		pfree(srcunpacked);
255 }
256 
257 
258 /*****************************************************************************
259  *	 USER I/O ROUTINES														 *
260  *****************************************************************************/
261 
262 
/* VAL: numeric value of digit character CH, computed as its offset from '0' */
#define VAL(CH)			((CH) - '0')
/* DIG: inverse of VAL -- the character obtained by adding VAL to '0' */
#define DIG(VAL)		((VAL) + '0')
265 
266 /*
267  *		byteain			- converts from printable representation of byte array
268  *
269  *		Non-printable characters must be passed as '\nnn' (octal) and are
270  *		converted to internal form.  '\' must be passed as '\\'.
271  *		ereport(ERROR, ...) if bad form.
272  *
273  *		BUGS:
274  *				The input is scanned twice.
275  *				The error checking of input is minimal.
276  */
277 Datum
byteain(PG_FUNCTION_ARGS)278 byteain(PG_FUNCTION_ARGS)
279 {
280 	char	   *inputText = PG_GETARG_CSTRING(0);
281 	char	   *tp;
282 	char	   *rp;
283 	int			bc;
284 	bytea	   *result;
285 
286 	/* Recognize hex input */
287 	if (inputText[0] == '\\' && inputText[1] == 'x')
288 	{
289 		size_t		len = strlen(inputText);
290 
291 		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
292 		result = palloc(bc);
293 		bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
294 		SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
295 
296 		PG_RETURN_BYTEA_P(result);
297 	}
298 
299 	/* Else, it's the traditional escaped style */
300 	for (bc = 0, tp = inputText; *tp != '\0'; bc++)
301 	{
302 		if (tp[0] != '\\')
303 			tp++;
304 		else if ((tp[0] == '\\') &&
305 				 (tp[1] >= '0' && tp[1] <= '3') &&
306 				 (tp[2] >= '0' && tp[2] <= '7') &&
307 				 (tp[3] >= '0' && tp[3] <= '7'))
308 			tp += 4;
309 		else if ((tp[0] == '\\') &&
310 				 (tp[1] == '\\'))
311 			tp += 2;
312 		else
313 		{
314 			/*
315 			 * one backslash, not followed by another or ### valid octal
316 			 */
317 			ereport(ERROR,
318 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
319 					 errmsg("invalid input syntax for type %s", "bytea")));
320 		}
321 	}
322 
323 	bc += VARHDRSZ;
324 
325 	result = (bytea *) palloc(bc);
326 	SET_VARSIZE(result, bc);
327 
328 	tp = inputText;
329 	rp = VARDATA(result);
330 	while (*tp != '\0')
331 	{
332 		if (tp[0] != '\\')
333 			*rp++ = *tp++;
334 		else if ((tp[0] == '\\') &&
335 				 (tp[1] >= '0' && tp[1] <= '3') &&
336 				 (tp[2] >= '0' && tp[2] <= '7') &&
337 				 (tp[3] >= '0' && tp[3] <= '7'))
338 		{
339 			bc = VAL(tp[1]);
340 			bc <<= 3;
341 			bc += VAL(tp[2]);
342 			bc <<= 3;
343 			*rp++ = bc + VAL(tp[3]);
344 
345 			tp += 4;
346 		}
347 		else if ((tp[0] == '\\') &&
348 				 (tp[1] == '\\'))
349 		{
350 			*rp++ = '\\';
351 			tp += 2;
352 		}
353 		else
354 		{
355 			/*
356 			 * We should never get here. The first pass should not allow it.
357 			 */
358 			ereport(ERROR,
359 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
360 					 errmsg("invalid input syntax for type %s", "bytea")));
361 		}
362 	}
363 
364 	PG_RETURN_BYTEA_P(result);
365 }
366 
367 /*
368  *		byteaout		- converts to printable representation of byte array
369  *
370  *		In the traditional escaped format, non-printable characters are
371  *		printed as '\nnn' (octal) and '\' as '\\'.
372  */
373 Datum
byteaout(PG_FUNCTION_ARGS)374 byteaout(PG_FUNCTION_ARGS)
375 {
376 	bytea	   *vlena = PG_GETARG_BYTEA_PP(0);
377 	char	   *result;
378 	char	   *rp;
379 
380 	if (bytea_output == BYTEA_OUTPUT_HEX)
381 	{
382 		/* Print hex format */
383 		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
384 		*rp++ = '\\';
385 		*rp++ = 'x';
386 		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
387 	}
388 	else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
389 	{
390 		/* Print traditional escaped format */
391 		char	   *vp;
392 		uint64		len;
393 		int			i;
394 
395 		len = 1;				/* empty string has 1 char */
396 		vp = VARDATA_ANY(vlena);
397 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
398 		{
399 			if (*vp == '\\')
400 				len += 2;
401 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
402 				len += 4;
403 			else
404 				len++;
405 		}
406 
407 		/*
408 		 * In principle len can't overflow uint32 if the input fit in 1GB, but
409 		 * for safety let's check rather than relying on palloc's internal
410 		 * check.
411 		 */
412 		if (len > MaxAllocSize)
413 			ereport(ERROR,
414 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
415 					 errmsg_internal("result of bytea output conversion is too large")));
416 		rp = result = (char *) palloc(len);
417 
418 		vp = VARDATA_ANY(vlena);
419 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
420 		{
421 			if (*vp == '\\')
422 			{
423 				*rp++ = '\\';
424 				*rp++ = '\\';
425 			}
426 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
427 			{
428 				int			val;	/* holds unprintable chars */
429 
430 				val = *vp;
431 				rp[0] = '\\';
432 				rp[3] = DIG(val & 07);
433 				val >>= 3;
434 				rp[2] = DIG(val & 07);
435 				val >>= 3;
436 				rp[1] = DIG(val & 03);
437 				rp += 4;
438 			}
439 			else
440 				*rp++ = *vp;
441 		}
442 	}
443 	else
444 	{
445 		elog(ERROR, "unrecognized bytea_output setting: %d",
446 			 bytea_output);
447 		rp = result = NULL;		/* keep compiler quiet */
448 	}
449 	*rp = '\0';
450 	PG_RETURN_CSTRING(result);
451 }
452 
453 /*
454  *		bytearecv			- converts external binary format to bytea
455  */
456 Datum
bytearecv(PG_FUNCTION_ARGS)457 bytearecv(PG_FUNCTION_ARGS)
458 {
459 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
460 	bytea	   *result;
461 	int			nbytes;
462 
463 	nbytes = buf->len - buf->cursor;
464 	result = (bytea *) palloc(nbytes + VARHDRSZ);
465 	SET_VARSIZE(result, nbytes + VARHDRSZ);
466 	pq_copymsgbytes(buf, VARDATA(result), nbytes);
467 	PG_RETURN_BYTEA_P(result);
468 }
469 
470 /*
471  *		byteasend			- converts bytea to binary format
472  *
473  * This is a special case: just copy the input...
474  */
475 Datum
byteasend(PG_FUNCTION_ARGS)476 byteasend(PG_FUNCTION_ARGS)
477 {
478 	bytea	   *vlena = PG_GETARG_BYTEA_P_COPY(0);
479 
480 	PG_RETURN_BYTEA_P(vlena);
481 }
482 
483 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)484 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
485 {
486 	StringInfo	state;
487 
488 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
489 
490 	/* Append the value unless null. */
491 	if (!PG_ARGISNULL(1))
492 	{
493 		bytea	   *value = PG_GETARG_BYTEA_PP(1);
494 
495 		/* On the first time through, we ignore the delimiter. */
496 		if (state == NULL)
497 			state = makeStringAggState(fcinfo);
498 		else if (!PG_ARGISNULL(2))
499 		{
500 			bytea	   *delim = PG_GETARG_BYTEA_PP(2);
501 
502 			appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
503 		}
504 
505 		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
506 	}
507 
508 	/*
509 	 * The transition type for string_agg() is declared to be "internal",
510 	 * which is a pass-by-value type the same size as a pointer.
511 	 */
512 	PG_RETURN_POINTER(state);
513 }
514 
515 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)516 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
517 {
518 	StringInfo	state;
519 
520 	/* cannot be called directly because of internal-type argument */
521 	Assert(AggCheckCallContext(fcinfo, NULL));
522 
523 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
524 
525 	if (state != NULL)
526 	{
527 		bytea	   *result;
528 
529 		result = (bytea *) palloc(state->len + VARHDRSZ);
530 		SET_VARSIZE(result, state->len + VARHDRSZ);
531 		memcpy(VARDATA(result), state->data, state->len);
532 		PG_RETURN_BYTEA_P(result);
533 	}
534 	else
535 		PG_RETURN_NULL();
536 }
537 
538 /*
539  *		textin			- converts "..." to internal representation
540  */
541 Datum
textin(PG_FUNCTION_ARGS)542 textin(PG_FUNCTION_ARGS)
543 {
544 	char	   *inputText = PG_GETARG_CSTRING(0);
545 
546 	PG_RETURN_TEXT_P(cstring_to_text(inputText));
547 }
548 
549 /*
550  *		textout			- converts internal representation to "..."
551  */
552 Datum
textout(PG_FUNCTION_ARGS)553 textout(PG_FUNCTION_ARGS)
554 {
555 	Datum		txt = PG_GETARG_DATUM(0);
556 
557 	PG_RETURN_CSTRING(TextDatumGetCString(txt));
558 }
559 
560 /*
561  *		textrecv			- converts external binary format to text
562  */
563 Datum
textrecv(PG_FUNCTION_ARGS)564 textrecv(PG_FUNCTION_ARGS)
565 {
566 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
567 	text	   *result;
568 	char	   *str;
569 	int			nbytes;
570 
571 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
572 
573 	result = cstring_to_text_with_len(str, nbytes);
574 	pfree(str);
575 	PG_RETURN_TEXT_P(result);
576 }
577 
578 /*
579  *		textsend			- converts text to binary format
580  */
581 Datum
textsend(PG_FUNCTION_ARGS)582 textsend(PG_FUNCTION_ARGS)
583 {
584 	text	   *t = PG_GETARG_TEXT_PP(0);
585 	StringInfoData buf;
586 
587 	pq_begintypsend(&buf);
588 	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
589 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
590 }
591 
592 
593 /*
594  *		unknownin			- converts "..." to internal representation
595  */
596 Datum
unknownin(PG_FUNCTION_ARGS)597 unknownin(PG_FUNCTION_ARGS)
598 {
599 	char	   *str = PG_GETARG_CSTRING(0);
600 
601 	/* representation is same as cstring */
602 	PG_RETURN_CSTRING(pstrdup(str));
603 }
604 
605 /*
606  *		unknownout			- converts internal representation to "..."
607  */
608 Datum
unknownout(PG_FUNCTION_ARGS)609 unknownout(PG_FUNCTION_ARGS)
610 {
611 	/* representation is same as cstring */
612 	char	   *str = PG_GETARG_CSTRING(0);
613 
614 	PG_RETURN_CSTRING(pstrdup(str));
615 }
616 
617 /*
618  *		unknownrecv			- converts external binary format to unknown
619  */
620 Datum
unknownrecv(PG_FUNCTION_ARGS)621 unknownrecv(PG_FUNCTION_ARGS)
622 {
623 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
624 	char	   *str;
625 	int			nbytes;
626 
627 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
628 	/* representation is same as cstring */
629 	PG_RETURN_CSTRING(str);
630 }
631 
632 /*
633  *		unknownsend			- converts unknown to binary format
634  */
635 Datum
unknownsend(PG_FUNCTION_ARGS)636 unknownsend(PG_FUNCTION_ARGS)
637 {
638 	/* representation is same as cstring */
639 	char	   *str = PG_GETARG_CSTRING(0);
640 	StringInfoData buf;
641 
642 	pq_begintypsend(&buf);
643 	pq_sendtext(&buf, str, strlen(str));
644 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
645 }
646 
647 
648 /* ========== PUBLIC ROUTINES ========== */
649 
650 /*
651  * textlen -
652  *	  returns the logical length of a text*
653  *	   (which is less than the VARSIZE of the text*)
654  */
655 Datum
textlen(PG_FUNCTION_ARGS)656 textlen(PG_FUNCTION_ARGS)
657 {
658 	Datum		str = PG_GETARG_DATUM(0);
659 
660 	/* try to avoid decompressing argument */
661 	PG_RETURN_INT32(text_length(str));
662 }
663 
664 /*
665  * text_length -
666  *	Does the real work for textlen()
667  *
668  *	This is broken out so it can be called directly by other string processing
669  *	functions.  Note that the argument is passed as a Datum, to indicate that
670  *	it may still be in compressed form.  We can avoid decompressing it at all
671  *	in some cases.
672  */
673 static int32
text_length(Datum str)674 text_length(Datum str)
675 {
676 	/* fastpath when max encoding length is one */
677 	if (pg_database_encoding_max_length() == 1)
678 		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
679 	else
680 	{
681 		text	   *t = DatumGetTextPP(str);
682 
683 		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
684 											 VARSIZE_ANY_EXHDR(t)));
685 	}
686 }
687 
688 /*
689  * textoctetlen -
690  *	  returns the physical length of a text*
691  *	   (which is less than the VARSIZE of the text*)
692  */
693 Datum
textoctetlen(PG_FUNCTION_ARGS)694 textoctetlen(PG_FUNCTION_ARGS)
695 {
696 	Datum		str = PG_GETARG_DATUM(0);
697 
698 	/* We need not detoast the input at all */
699 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
700 }
701 
702 /*
703  * textcat -
704  *	  takes two text* and returns a text* that is the concatenation of
705  *	  the two.
706  *
707  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
708  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
709  * Allocate space for output in all cases.
710  * XXX - thomas 1997-07-10
711  */
712 Datum
textcat(PG_FUNCTION_ARGS)713 textcat(PG_FUNCTION_ARGS)
714 {
715 	text	   *t1 = PG_GETARG_TEXT_PP(0);
716 	text	   *t2 = PG_GETARG_TEXT_PP(1);
717 
718 	PG_RETURN_TEXT_P(text_catenate(t1, t2));
719 }
720 
721 /*
722  * text_catenate
723  *	Guts of textcat(), broken out so it can be used by other functions
724  *
725  * Arguments can be in short-header form, but not compressed or out-of-line
726  */
727 static text *
text_catenate(text * t1,text * t2)728 text_catenate(text *t1, text *t2)
729 {
730 	text	   *result;
731 	int			len1,
732 				len2,
733 				len;
734 	char	   *ptr;
735 
736 	len1 = VARSIZE_ANY_EXHDR(t1);
737 	len2 = VARSIZE_ANY_EXHDR(t2);
738 
739 	/* paranoia ... probably should throw error instead? */
740 	if (len1 < 0)
741 		len1 = 0;
742 	if (len2 < 0)
743 		len2 = 0;
744 
745 	len = len1 + len2 + VARHDRSZ;
746 	result = (text *) palloc(len);
747 
748 	/* Set size of result string... */
749 	SET_VARSIZE(result, len);
750 
751 	/* Fill data field of result string... */
752 	ptr = VARDATA(result);
753 	if (len1 > 0)
754 		memcpy(ptr, VARDATA_ANY(t1), len1);
755 	if (len2 > 0)
756 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
757 
758 	return result;
759 }
760 
761 /*
762  * charlen_to_bytelen()
763  *	Compute the number of bytes occupied by n characters starting at *p
764  *
765  * It is caller's responsibility that there actually are n characters;
766  * the string need not be null-terminated.
767  */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;
	int			remaining;

	/* In a single-byte encoding, characters and bytes coincide. */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over n characters, one at a time. */
	for (remaining = n; remaining > 0; remaining--)
		cur += pg_mblen(cur);

	return cur - p;
}
786 
787 /*
788  * text_substr()
789  * Return a substring starting at the specified position.
790  * - thomas 1997-12-31
791  *
792  * Input:
793  *	- string
794  *	- starting position (is one-based)
795  *	- string length
796  *
797  * If the starting position is zero or less, then return from the start of the string
798  *	adjusting the length to be consistent with the "negative start" per SQL.
 * If the length is less than zero, an error is raised (a negative substring
 *	length is not allowed).
800  *
801  * Added multibyte support.
802  * - Tatsuo Ishii 1998-4-21
803  * Changed behavior if starting position is less than one to conform to SQL behavior.
804  * Formerly returned the entire string; now returns a portion.
805  * - Thomas Lockhart 1998-12-10
806  * Now uses faster TOAST-slicing interface
807  * - John Gray 2002-02-22
808  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
809  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
810  * error; if E < 1, return '', not entire string). Fixed MB related bug when
811  * S > LC and < LC + 4 sometimes garbage characters are returned.
812  * - Joe Conway 2002-08-10
813  */
814 Datum
text_substr(PG_FUNCTION_ARGS)815 text_substr(PG_FUNCTION_ARGS)
816 {
817 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
818 									PG_GETARG_INT32(1),
819 									PG_GETARG_INT32(2),
820 									false));
821 }
822 
823 /*
824  * text_substr_no_len -
825  *	  Wrapper to avoid opr_sanity failure due to
826  *	  one function accepting a different number of args.
827  */
828 Datum
text_substr_no_len(PG_FUNCTION_ARGS)829 text_substr_no_len(PG_FUNCTION_ARGS)
830 {
831 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
832 									PG_GETARG_INT32(1),
833 									-1, true));
834 }
835 
/*
 * text_substring -
 *	Does the real work for text_substr() and text_substr_no_len()
 *
 *	This is broken out so it can be called directly by other string processing
 *	functions.  Note that the argument is passed as a Datum, to indicate that
 *	it may still be in compressed/toasted form.  We can avoid detoasting all
 *	of it in some cases.
 *
 *	Arguments:
 *		str - the source text, as a Datum
 *		start - one-based character position where the substring begins;
 *			zero and negative values are accepted
 *		length - number of characters wanted; not consulted when
 *			length_not_specified is true
 *		length_not_specified - true to take everything through end of string
 *
 *	Errors:
 *		ERRCODE_SUBSTRING_ERROR if a length is specified and is negative;
 *		a plain elog(ERROR) if the encoding's max character length is < 1.
 *
 *	The result is always a freshly palloc'd datum.
 */
static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
{
	int32		eml = pg_database_encoding_max_length();
	int32		S = start;		/* start position */
	int32		S1;				/* adjusted start position */
	int32		L1;				/* adjusted substring length */
	int32		E;				/* end position */

	/*
	 * SQL99 says S can be zero or negative, but we still must fetch from the
	 * start of the string.
	 */
	S1 = Max(S, 1);

	/* life is easy if the encoding max length is 1 */
	if (eml == 1)
	{
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			L1 = -1;			/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/* E was computed from the unadjusted S; measure from S1 */
			L1 = E - S1;
		}

		/*
		 * If the start position is past the end of the string, SQL99 says to
		 * return a zero-length string -- DatumGetTextPSlice() will do that
		 * for us.  We need only convert S1 to zero-based starting position.
		 */
		return DatumGetTextPSlice(str, S1 - 1, L1);
	}
	else if (eml > 1)
	{
		/*
		 * When encoding max length is > 1, we can't get LC without
		 * detoasting, so we'll grab a conservatively large slice now and go
		 * back later to do the right thing
		 */
		int32		slice_start;
		int32		slice_size;
		int32		slice_strlen;
		text	   *slice;
		int32		E1;
		int32		i;
		char	   *p;
		char	   *s;
		text	   *ret;

		/*
		 * We need to start at position zero because there is no way to know
		 * in advance which byte offset corresponds to the supplied start
		 * position.
		 */
		slice_start = 0;

		/*
		 * Same case analysis as in the single-byte branch, except that we
		 * also choose slice_size, the number of bytes to fetch (-1 is used
		 * wherever the substring runs to end of string).
		 */
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			slice_size = L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			slice_size = L1 = -1;	/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			slice_size = L1 = -1;
		}
		else
		{
			/*
			 * A zero or negative value for the end position can happen if the
			 * start was negative or one. SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/*
			 * if E is past the end of the string, the tuple toaster will
			 * truncate the length for us
			 */
			L1 = E - S1;

			/*
			 * Total slice size in bytes can't be any longer than the start
			 * position plus substring length times the encoding max length.
			 * If that overflows, we can just use -1.
			 */
			if (pg_mul_s32_overflow(E, eml, &slice_size))
				slice_size = -1;
		}

		/*
		 * If we're working with an untoasted source, no need to do an extra
		 * copying step.
		 */
		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
			slice = DatumGetTextPSlice(str, slice_start, slice_size);
		else
			slice = (text *) DatumGetPointer(str);

		/*
		 * From here on, "slice" is freed before returning only when it is a
		 * separate copy, i.e. not the caller's own datum.
		 */

		/* see if we got back an empty string */
		if (VARSIZE_ANY_EXHDR(slice) == 0)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/* Now we can get the actual length of the slice in MB characters */
		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
											VARSIZE_ANY_EXHDR(slice));

		/*
		 * Check that the start position wasn't > slice_strlen. If so, SQL99
		 * says to return a zero-length string.
		 */
		if (S1 > slice_strlen)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/*
		 * Adjust L1 and E1 now that we know the slice string length. Again
		 * remember that S1 is one based, and slice_start is zero based.
		 */
		if (L1 > -1)
			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
		else
			E1 = slice_start + 1 + slice_strlen;

		/*
		 * Find the start position in the slice; remember S1 is not zero based
		 */
		p = VARDATA_ANY(slice);
		for (i = 0; i < S1 - 1; i++)
			p += pg_mblen(p);

		/* hang onto a pointer to our start position */
		s = p;

		/*
		 * Count the actual bytes used by the substring of the requested
		 * length.
		 */
		for (i = S1; i < E1; i++)
			p += pg_mblen(p);

		/* (p - s) is now the byte length of the substring; copy it out */
		ret = (text *) palloc(VARHDRSZ + (p - s));
		SET_VARSIZE(ret, VARHDRSZ + (p - s));
		memcpy(VARDATA(ret), s, (p - s));

		if (slice != (text *) DatumGetPointer(str))
			pfree(slice);

		return ret;
	}
	else
		elog(ERROR, "invalid backend encoding: encoding max length < 1");

	/* not reached: suppress compiler warning */
	return NULL;
}
1046 
1047 /*
1048  * textoverlay
1049  *	Replace specified substring of first string with second
1050  *
1051  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1052  * This code is a direct implementation of what the standard says.
1053  */
1054 Datum
textoverlay(PG_FUNCTION_ARGS)1055 textoverlay(PG_FUNCTION_ARGS)
1056 {
1057 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1058 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1059 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1060 	int			sl = PG_GETARG_INT32(3);	/* substring length */
1061 
1062 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1063 }
1064 
1065 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1066 textoverlay_no_len(PG_FUNCTION_ARGS)
1067 {
1068 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1069 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1070 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1071 	int			sl;
1072 
1073 	sl = text_length(PointerGetDatum(t2));	/* defaults to length(t2) */
1074 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1075 }
1076 
1077 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1078 text_overlay(text *t1, text *t2, int sp, int sl)
1079 {
1080 	text	   *result;
1081 	text	   *s1;
1082 	text	   *s2;
1083 	int			sp_pl_sl;
1084 
1085 	/*
1086 	 * Check for possible integer-overflow cases.  For negative sp, throw a
1087 	 * "substring length" error because that's what should be expected
1088 	 * according to the spec's definition of OVERLAY().
1089 	 */
1090 	if (sp <= 0)
1091 		ereport(ERROR,
1092 				(errcode(ERRCODE_SUBSTRING_ERROR),
1093 				 errmsg("negative substring length not allowed")));
1094 	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1095 		ereport(ERROR,
1096 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1097 				 errmsg("integer out of range")));
1098 
1099 	s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1100 	s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1101 	result = text_catenate(s1, t2);
1102 	result = text_catenate(result, s2);
1103 
1104 	return result;
1105 }
1106 
1107 /*
1108  * textpos -
1109  *	  Return the position of the specified substring.
1110  *	  Implements the SQL POSITION() function.
1111  *	  Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1112  * - thomas 1997-07-27
1113  */
1114 Datum
textpos(PG_FUNCTION_ARGS)1115 textpos(PG_FUNCTION_ARGS)
1116 {
1117 	text	   *str = PG_GETARG_TEXT_PP(0);
1118 	text	   *search_str = PG_GETARG_TEXT_PP(1);
1119 
1120 	PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1121 }
1122 
1123 /*
1124  * text_position -
1125  *	Does the real work for textpos()
1126  *
1127  * Inputs:
1128  *		t1 - string to be searched
1129  *		t2 - pattern to match within t1
1130  * Result:
1131  *		Character index of the first matched char, starting from 1,
1132  *		or 0 if no match.
1133  *
1134  *	This is broken out so it can be called directly by other string processing
1135  *	functions.
1136  */
1137 static int
text_position(text * t1,text * t2,Oid collid)1138 text_position(text *t1, text *t2, Oid collid)
1139 {
1140 	TextPositionState state;
1141 	int			result;
1142 
1143 	/* Empty needle always matches at position 1 */
1144 	if (VARSIZE_ANY_EXHDR(t2) < 1)
1145 		return 1;
1146 
1147 	/* Otherwise, can't match if haystack is shorter than needle */
1148 	if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1149 		return 0;
1150 
1151 	text_position_setup(t1, t2, collid, &state);
1152 	if (!text_position_next(&state))
1153 		result = 0;
1154 	else
1155 		result = text_position_get_match_pos(&state);
1156 	text_position_cleanup(&state);
1157 	return result;
1158 }
1159 
1160 
1161 /*
1162  * text_position_setup, text_position_next, text_position_cleanup -
1163  *	Component steps of text_position()
1164  *
1165  * These are broken out so that a string can be efficiently searched for
1166  * multiple occurrences of the same pattern.  text_position_next may be
1167  * called multiple times, and it advances to the next match on each call.
1168  * text_position_get_match_ptr() and text_position_get_match_pos() return
1169  * a pointer or 1-based character position of the last match, respectively.
1170  *
1171  * The "state" variable is normally just a local variable in the caller.
1172  *
1173  * NOTE: text_position_next skips over the matched portion.  For example,
1174  * searching for "xx" in "xxx" returns only one match, not two.
1175  */
1176 
1177 static void
text_position_setup(text * t1,text * t2,Oid collid,TextPositionState * state)1178 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1179 {
1180 	int			len1 = VARSIZE_ANY_EXHDR(t1);
1181 	int			len2 = VARSIZE_ANY_EXHDR(t2);
1182 	pg_locale_t mylocale = 0;
1183 
1184 	check_collation_set(collid);
1185 
1186 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1187 		mylocale = pg_newlocale_from_collation(collid);
1188 
1189 	if (mylocale && !mylocale->deterministic)
1190 		ereport(ERROR,
1191 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1192 				 errmsg("nondeterministic collations are not supported for substring searches")));
1193 
1194 	Assert(len1 > 0);
1195 	Assert(len2 > 0);
1196 
1197 	/*
1198 	 * Even with a multi-byte encoding, we perform the search using the raw
1199 	 * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1200 	 * because in UTF-8 the byte sequence of one character cannot contain
1201 	 * another character.  For other multi-byte encodings, we do the search
1202 	 * initially as a simple byte search, ignoring multibyte issues, but
1203 	 * verify afterwards that the match we found is at a character boundary,
1204 	 * and continue the search if it was a false match.
1205 	 */
1206 	if (pg_database_encoding_max_length() == 1)
1207 	{
1208 		state->is_multibyte = false;
1209 		state->is_multibyte_char_in_char = false;
1210 	}
1211 	else if (GetDatabaseEncoding() == PG_UTF8)
1212 	{
1213 		state->is_multibyte = true;
1214 		state->is_multibyte_char_in_char = false;
1215 	}
1216 	else
1217 	{
1218 		state->is_multibyte = true;
1219 		state->is_multibyte_char_in_char = true;
1220 	}
1221 
1222 	state->str1 = VARDATA_ANY(t1);
1223 	state->str2 = VARDATA_ANY(t2);
1224 	state->len1 = len1;
1225 	state->len2 = len2;
1226 	state->last_match = NULL;
1227 	state->refpoint = state->str1;
1228 	state->refpos = 0;
1229 
1230 	/*
1231 	 * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1232 	 * notes we use the terminology that the "haystack" is the string to be
1233 	 * searched (t1) and the "needle" is the pattern being sought (t2).
1234 	 *
1235 	 * If the needle is empty or bigger than the haystack then there is no
1236 	 * point in wasting cycles initializing the table.  We also choose not to
1237 	 * use B-M-H for needles of length 1, since the skip table can't possibly
1238 	 * save anything in that case.
1239 	 */
1240 	if (len1 >= len2 && len2 > 1)
1241 	{
1242 		int			searchlength = len1 - len2;
1243 		int			skiptablemask;
1244 		int			last;
1245 		int			i;
1246 		const char *str2 = state->str2;
1247 
1248 		/*
1249 		 * First we must determine how much of the skip table to use.  The
1250 		 * declaration of TextPositionState allows up to 256 elements, but for
1251 		 * short search problems we don't really want to have to initialize so
1252 		 * many elements --- it would take too long in comparison to the
1253 		 * actual search time.  So we choose a useful skip table size based on
1254 		 * the haystack length minus the needle length.  The closer the needle
1255 		 * length is to the haystack length the less useful skipping becomes.
1256 		 *
1257 		 * Note: since we use bit-masking to select table elements, the skip
1258 		 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1259 		 */
1260 		if (searchlength < 16)
1261 			skiptablemask = 3;
1262 		else if (searchlength < 64)
1263 			skiptablemask = 7;
1264 		else if (searchlength < 128)
1265 			skiptablemask = 15;
1266 		else if (searchlength < 512)
1267 			skiptablemask = 31;
1268 		else if (searchlength < 2048)
1269 			skiptablemask = 63;
1270 		else if (searchlength < 4096)
1271 			skiptablemask = 127;
1272 		else
1273 			skiptablemask = 255;
1274 		state->skiptablemask = skiptablemask;
1275 
1276 		/*
1277 		 * Initialize the skip table.  We set all elements to the needle
1278 		 * length, since this is the correct skip distance for any character
1279 		 * not found in the needle.
1280 		 */
1281 		for (i = 0; i <= skiptablemask; i++)
1282 			state->skiptable[i] = len2;
1283 
1284 		/*
1285 		 * Now examine the needle.  For each character except the last one,
1286 		 * set the corresponding table element to the appropriate skip
1287 		 * distance.  Note that when two characters share the same skip table
1288 		 * entry, the one later in the needle must determine the skip
1289 		 * distance.
1290 		 */
1291 		last = len2 - 1;
1292 
1293 		for (i = 0; i < last; i++)
1294 			state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1295 	}
1296 }
1297 
1298 /*
1299  * Advance to the next match, starting from the end of the previous match
1300  * (or the beginning of the string, on first call).  Returns true if a match
1301  * is found.
1302  *
1303  * Note that this refuses to match an empty-string needle.  Most callers
1304  * will have handled that case specially and we'll never see it here.
1305  */
1306 static bool
text_position_next(TextPositionState * state)1307 text_position_next(TextPositionState *state)
1308 {
1309 	int			needle_len = state->len2;
1310 	char	   *start_ptr;
1311 	char	   *matchptr;
1312 
1313 	if (needle_len <= 0)
1314 		return false;			/* result for empty pattern */
1315 
1316 	/* Start from the point right after the previous match. */
1317 	if (state->last_match)
1318 		start_ptr = state->last_match + needle_len;
1319 	else
1320 		start_ptr = state->str1;
1321 
1322 retry:
1323 	matchptr = text_position_next_internal(start_ptr, state);
1324 
1325 	if (!matchptr)
1326 		return false;
1327 
1328 	/*
1329 	 * Found a match for the byte sequence.  If this is a multibyte encoding,
1330 	 * where one character's byte sequence can appear inside a longer
1331 	 * multi-byte character, we need to verify that the match was at a
1332 	 * character boundary, not in the middle of a multi-byte character.
1333 	 */
1334 	if (state->is_multibyte_char_in_char)
1335 	{
1336 		/* Walk one character at a time, until we reach the match. */
1337 
1338 		/* the search should never move backwards. */
1339 		Assert(state->refpoint <= matchptr);
1340 
1341 		while (state->refpoint < matchptr)
1342 		{
1343 			/* step to next character. */
1344 			state->refpoint += pg_mblen(state->refpoint);
1345 			state->refpos++;
1346 
1347 			/*
1348 			 * If we stepped over the match's start position, then it was a
1349 			 * false positive, where the byte sequence appeared in the middle
1350 			 * of a multi-byte character.  Skip it, and continue the search at
1351 			 * the next character boundary.
1352 			 */
1353 			if (state->refpoint > matchptr)
1354 			{
1355 				start_ptr = state->refpoint;
1356 				goto retry;
1357 			}
1358 		}
1359 	}
1360 
1361 	state->last_match = matchptr;
1362 	return true;
1363 }
1364 
1365 /*
1366  * Subroutine of text_position_next().  This searches for the raw byte
1367  * sequence, ignoring any multi-byte encoding issues.  Returns the first
1368  * match starting at 'start_ptr', or NULL if no match is found.
1369  */
1370 static char *
text_position_next_internal(char * start_ptr,TextPositionState * state)1371 text_position_next_internal(char *start_ptr, TextPositionState *state)
1372 {
1373 	int			haystack_len = state->len1;
1374 	int			needle_len = state->len2;
1375 	int			skiptablemask = state->skiptablemask;
1376 	const char *haystack = state->str1;
1377 	const char *needle = state->str2;
1378 	const char *haystack_end = &haystack[haystack_len];
1379 	const char *hptr;
1380 
1381 	Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1382 
1383 	if (needle_len == 1)
1384 	{
1385 		/* No point in using B-M-H for a one-character needle */
1386 		char		nchar = *needle;
1387 
1388 		hptr = start_ptr;
1389 		while (hptr < haystack_end)
1390 		{
1391 			if (*hptr == nchar)
1392 				return (char *) hptr;
1393 			hptr++;
1394 		}
1395 	}
1396 	else
1397 	{
1398 		const char *needle_last = &needle[needle_len - 1];
1399 
1400 		/* Start at startpos plus the length of the needle */
1401 		hptr = start_ptr + needle_len - 1;
1402 		while (hptr < haystack_end)
1403 		{
1404 			/* Match the needle scanning *backward* */
1405 			const char *nptr;
1406 			const char *p;
1407 
1408 			nptr = needle_last;
1409 			p = hptr;
1410 			while (*nptr == *p)
1411 			{
1412 				/* Matched it all?	If so, return 1-based position */
1413 				if (nptr == needle)
1414 					return (char *) p;
1415 				nptr--, p--;
1416 			}
1417 
1418 			/*
1419 			 * No match, so use the haystack char at hptr to decide how far to
1420 			 * advance.  If the needle had any occurrence of that character
1421 			 * (or more precisely, one sharing the same skiptable entry)
1422 			 * before its last character, then we advance far enough to align
1423 			 * the last such needle character with that haystack position.
1424 			 * Otherwise we can advance by the whole needle length.
1425 			 */
1426 			hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1427 		}
1428 	}
1429 
1430 	return 0;					/* not found */
1431 }
1432 
1433 /*
1434  * Return a pointer to the current match.
1435  *
1436  * The returned pointer points into correct position in the original
1437  * the haystack string.
1438  */
1439 static char *
text_position_get_match_ptr(TextPositionState * state)1440 text_position_get_match_ptr(TextPositionState *state)
1441 {
1442 	return state->last_match;
1443 }
1444 
1445 /*
1446  * Return the offset of the current match.
1447  *
1448  * The offset is in characters, 1-based.
1449  */
1450 static int
text_position_get_match_pos(TextPositionState * state)1451 text_position_get_match_pos(TextPositionState *state)
1452 {
1453 	if (!state->is_multibyte)
1454 		return state->last_match - state->str1 + 1;
1455 	else
1456 	{
1457 		/* Convert the byte position to char position. */
1458 		while (state->refpoint < state->last_match)
1459 		{
1460 			state->refpoint += pg_mblen(state->refpoint);
1461 			state->refpos++;
1462 		}
1463 		Assert(state->refpoint == state->last_match);
1464 		return state->refpos + 1;
1465 	}
1466 }
1467 
1468 static void
text_position_cleanup(TextPositionState * state)1469 text_position_cleanup(TextPositionState *state)
1470 {
1471 	/* no cleanup needed */
1472 }
1473 
1474 static void
check_collation_set(Oid collid)1475 check_collation_set(Oid collid)
1476 {
1477 	if (!OidIsValid(collid))
1478 	{
1479 		/*
1480 		 * This typically means that the parser could not resolve a conflict
1481 		 * of implicit collations, so report it that way.
1482 		 */
1483 		ereport(ERROR,
1484 				(errcode(ERRCODE_INDETERMINATE_COLLATION),
1485 				 errmsg("could not determine which collation to use for string comparison"),
1486 				 errhint("Use the COLLATE clause to set the collation explicitly.")));
1487 	}
1488 }
1489 
1490 /* varstr_cmp()
1491  * Comparison function for text strings with given lengths.
1492  * Includes locale support, but must copy strings to temporary memory
1493  *	to allow null-termination for inputs to strcoll().
1494  * Returns an integer less than, equal to, or greater than zero, indicating
1495  * whether arg1 is less than, equal to, or greater than arg2.
1496  *
1497  * Note: many functions that depend on this are marked leakproof; therefore,
1498  * avoid reporting the actual contents of the input when throwing errors.
1499  * All errors herein should be things that can't happen except on corrupt
1500  * data, anyway; otherwise we will have trouble with indexing strings that
1501  * would cause them.
1502  */
1503 int
varstr_cmp(const char * arg1,int len1,const char * arg2,int len2,Oid collid)1504 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1505 {
1506 	int			result;
1507 
1508 	check_collation_set(collid);
1509 
1510 	/*
1511 	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1512 	 * have to do some memory copying.  This turns out to be significantly
1513 	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
1514 	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1515 	 */
1516 	if (lc_collate_is_c(collid))
1517 	{
1518 		result = memcmp(arg1, arg2, Min(len1, len2));
1519 		if ((result == 0) && (len1 != len2))
1520 			result = (len1 < len2) ? -1 : 1;
1521 	}
1522 	else
1523 	{
1524 		char		a1buf[TEXTBUFLEN];
1525 		char		a2buf[TEXTBUFLEN];
1526 		char	   *a1p,
1527 				   *a2p;
1528 		pg_locale_t mylocale = 0;
1529 
1530 		if (collid != DEFAULT_COLLATION_OID)
1531 			mylocale = pg_newlocale_from_collation(collid);
1532 
1533 		/*
1534 		 * memcmp() can't tell us which of two unequal strings sorts first,
1535 		 * but it's a cheap way to tell if they're equal.  Testing shows that
1536 		 * memcmp() followed by strcoll() is only trivially slower than
1537 		 * strcoll() by itself, so we don't lose much if this doesn't work out
1538 		 * very often, and if it does - for example, because there are many
1539 		 * equal strings in the input - then we win big by avoiding expensive
1540 		 * collation-aware comparisons.
1541 		 */
1542 		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1543 			return 0;
1544 
1545 #ifdef WIN32
1546 		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
1547 		if (GetDatabaseEncoding() == PG_UTF8
1548 			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
1549 		{
1550 			int			a1len;
1551 			int			a2len;
1552 			int			r;
1553 
1554 			if (len1 >= TEXTBUFLEN / 2)
1555 			{
1556 				a1len = len1 * 2 + 2;
1557 				a1p = palloc(a1len);
1558 			}
1559 			else
1560 			{
1561 				a1len = TEXTBUFLEN;
1562 				a1p = a1buf;
1563 			}
1564 			if (len2 >= TEXTBUFLEN / 2)
1565 			{
1566 				a2len = len2 * 2 + 2;
1567 				a2p = palloc(a2len);
1568 			}
1569 			else
1570 			{
1571 				a2len = TEXTBUFLEN;
1572 				a2p = a2buf;
1573 			}
1574 
1575 			/* stupid Microsloth API does not work for zero-length input */
1576 			if (len1 == 0)
1577 				r = 0;
1578 			else
1579 			{
1580 				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1581 										(LPWSTR) a1p, a1len / 2);
1582 				if (!r)
1583 					ereport(ERROR,
1584 							(errmsg("could not convert string to UTF-16: error code %lu",
1585 									GetLastError())));
1586 			}
1587 			((LPWSTR) a1p)[r] = 0;
1588 
1589 			if (len2 == 0)
1590 				r = 0;
1591 			else
1592 			{
1593 				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1594 										(LPWSTR) a2p, a2len / 2);
1595 				if (!r)
1596 					ereport(ERROR,
1597 							(errmsg("could not convert string to UTF-16: error code %lu",
1598 									GetLastError())));
1599 			}
1600 			((LPWSTR) a2p)[r] = 0;
1601 
1602 			errno = 0;
1603 #ifdef HAVE_LOCALE_T
1604 			if (mylocale)
1605 				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
1606 			else
1607 #endif
1608 				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1609 			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
1610 										 * headers */
1611 				ereport(ERROR,
1612 						(errmsg("could not compare Unicode strings: %m")));
1613 
1614 			/* Break tie if necessary. */
1615 			if (result == 0 &&
1616 				(!mylocale || mylocale->deterministic))
1617 			{
1618 				result = memcmp(arg1, arg2, Min(len1, len2));
1619 				if ((result == 0) && (len1 != len2))
1620 					result = (len1 < len2) ? -1 : 1;
1621 			}
1622 
1623 			if (a1p != a1buf)
1624 				pfree(a1p);
1625 			if (a2p != a2buf)
1626 				pfree(a2p);
1627 
1628 			return result;
1629 		}
1630 #endif							/* WIN32 */
1631 
1632 		if (len1 >= TEXTBUFLEN)
1633 			a1p = (char *) palloc(len1 + 1);
1634 		else
1635 			a1p = a1buf;
1636 		if (len2 >= TEXTBUFLEN)
1637 			a2p = (char *) palloc(len2 + 1);
1638 		else
1639 			a2p = a2buf;
1640 
1641 		memcpy(a1p, arg1, len1);
1642 		a1p[len1] = '\0';
1643 		memcpy(a2p, arg2, len2);
1644 		a2p[len2] = '\0';
1645 
1646 		if (mylocale)
1647 		{
1648 			if (mylocale->provider == COLLPROVIDER_ICU)
1649 			{
1650 #ifdef USE_ICU
1651 #ifdef HAVE_UCOL_STRCOLLUTF8
1652 				if (GetDatabaseEncoding() == PG_UTF8)
1653 				{
1654 					UErrorCode	status;
1655 
1656 					status = U_ZERO_ERROR;
1657 					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
1658 											  arg1, len1,
1659 											  arg2, len2,
1660 											  &status);
1661 					if (U_FAILURE(status))
1662 						ereport(ERROR,
1663 								(errmsg("collation failed: %s", u_errorName(status))));
1664 				}
1665 				else
1666 #endif
1667 				{
1668 					int32_t		ulen1,
1669 								ulen2;
1670 					UChar	   *uchar1,
1671 							   *uchar2;
1672 
1673 					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
1674 					ulen2 = icu_to_uchar(&uchar2, arg2, len2);
1675 
1676 					result = ucol_strcoll(mylocale->info.icu.ucol,
1677 										  uchar1, ulen1,
1678 										  uchar2, ulen2);
1679 
1680 					pfree(uchar1);
1681 					pfree(uchar2);
1682 				}
1683 #else							/* not USE_ICU */
1684 				/* shouldn't happen */
1685 				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1686 #endif							/* not USE_ICU */
1687 			}
1688 			else
1689 			{
1690 #ifdef HAVE_LOCALE_T
1691 				result = strcoll_l(a1p, a2p, mylocale->info.lt);
1692 #else
1693 				/* shouldn't happen */
1694 				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
1695 #endif
1696 			}
1697 		}
1698 		else
1699 			result = strcoll(a1p, a2p);
1700 
1701 		/* Break tie if necessary. */
1702 		if (result == 0 &&
1703 			(!mylocale || mylocale->deterministic))
1704 			result = strcmp(a1p, a2p);
1705 
1706 		if (a1p != a1buf)
1707 			pfree(a1p);
1708 		if (a2p != a2buf)
1709 			pfree(a2p);
1710 	}
1711 
1712 	return result;
1713 }
1714 
1715 /* text_cmp()
1716  * Internal comparison function for text strings.
1717  * Returns -1, 0 or 1
1718  */
1719 static int
text_cmp(text * arg1,text * arg2,Oid collid)1720 text_cmp(text *arg1, text *arg2, Oid collid)
1721 {
1722 	char	   *a1p,
1723 			   *a2p;
1724 	int			len1,
1725 				len2;
1726 
1727 	a1p = VARDATA_ANY(arg1);
1728 	a2p = VARDATA_ANY(arg2);
1729 
1730 	len1 = VARSIZE_ANY_EXHDR(arg1);
1731 	len2 = VARSIZE_ANY_EXHDR(arg2);
1732 
1733 	return varstr_cmp(a1p, len1, a2p, len2, collid);
1734 }
1735 
1736 /*
1737  * Comparison functions for text strings.
1738  *
1739  * Note: btree indexes need these routines not to leak memory; therefore,
1740  * be careful to free working copies of toasted datums.  Most places don't
1741  * need to be so careful.
1742  */
1743 
1744 Datum
texteq(PG_FUNCTION_ARGS)1745 texteq(PG_FUNCTION_ARGS)
1746 {
1747 	Oid			collid = PG_GET_COLLATION();
1748 	bool		result;
1749 
1750 	check_collation_set(collid);
1751 
1752 	if (lc_collate_is_c(collid) ||
1753 		collid == DEFAULT_COLLATION_OID ||
1754 		pg_newlocale_from_collation(collid)->deterministic)
1755 	{
1756 		Datum		arg1 = PG_GETARG_DATUM(0);
1757 		Datum		arg2 = PG_GETARG_DATUM(1);
1758 		Size		len1,
1759 					len2;
1760 
1761 		/*
1762 		 * Since we only care about equality or not-equality, we can avoid all
1763 		 * the expense of strcoll() here, and just do bitwise comparison.  In
1764 		 * fact, we don't even have to do a bitwise comparison if we can show
1765 		 * the lengths of the strings are unequal; which might save us from
1766 		 * having to detoast one or both values.
1767 		 */
1768 		len1 = toast_raw_datum_size(arg1);
1769 		len2 = toast_raw_datum_size(arg2);
1770 		if (len1 != len2)
1771 			result = false;
1772 		else
1773 		{
1774 			text	   *targ1 = DatumGetTextPP(arg1);
1775 			text	   *targ2 = DatumGetTextPP(arg2);
1776 
1777 			result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1778 							 len1 - VARHDRSZ) == 0);
1779 
1780 			PG_FREE_IF_COPY(targ1, 0);
1781 			PG_FREE_IF_COPY(targ2, 1);
1782 		}
1783 	}
1784 	else
1785 	{
1786 		text	   *arg1 = PG_GETARG_TEXT_PP(0);
1787 		text	   *arg2 = PG_GETARG_TEXT_PP(1);
1788 
1789 		result = (text_cmp(arg1, arg2, collid) == 0);
1790 
1791 		PG_FREE_IF_COPY(arg1, 0);
1792 		PG_FREE_IF_COPY(arg2, 1);
1793 	}
1794 
1795 	PG_RETURN_BOOL(result);
1796 }
1797 
1798 Datum
textne(PG_FUNCTION_ARGS)1799 textne(PG_FUNCTION_ARGS)
1800 {
1801 	Oid			collid = PG_GET_COLLATION();
1802 	bool		result;
1803 
1804 	check_collation_set(collid);
1805 
1806 	if (lc_collate_is_c(collid) ||
1807 		collid == DEFAULT_COLLATION_OID ||
1808 		pg_newlocale_from_collation(collid)->deterministic)
1809 	{
1810 		Datum		arg1 = PG_GETARG_DATUM(0);
1811 		Datum		arg2 = PG_GETARG_DATUM(1);
1812 		Size		len1,
1813 					len2;
1814 
1815 		/* See comment in texteq() */
1816 		len1 = toast_raw_datum_size(arg1);
1817 		len2 = toast_raw_datum_size(arg2);
1818 		if (len1 != len2)
1819 			result = true;
1820 		else
1821 		{
1822 			text	   *targ1 = DatumGetTextPP(arg1);
1823 			text	   *targ2 = DatumGetTextPP(arg2);
1824 
1825 			result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1826 							 len1 - VARHDRSZ) != 0);
1827 
1828 			PG_FREE_IF_COPY(targ1, 0);
1829 			PG_FREE_IF_COPY(targ2, 1);
1830 		}
1831 	}
1832 	else
1833 	{
1834 		text	   *arg1 = PG_GETARG_TEXT_PP(0);
1835 		text	   *arg2 = PG_GETARG_TEXT_PP(1);
1836 
1837 		result = (text_cmp(arg1, arg2, collid) != 0);
1838 
1839 		PG_FREE_IF_COPY(arg1, 0);
1840 		PG_FREE_IF_COPY(arg2, 1);
1841 	}
1842 
1843 	PG_RETURN_BOOL(result);
1844 }
1845 
1846 Datum
text_lt(PG_FUNCTION_ARGS)1847 text_lt(PG_FUNCTION_ARGS)
1848 {
1849 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1850 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1851 	bool		result;
1852 
1853 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1854 
1855 	PG_FREE_IF_COPY(arg1, 0);
1856 	PG_FREE_IF_COPY(arg2, 1);
1857 
1858 	PG_RETURN_BOOL(result);
1859 }
1860 
1861 Datum
text_le(PG_FUNCTION_ARGS)1862 text_le(PG_FUNCTION_ARGS)
1863 {
1864 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1865 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1866 	bool		result;
1867 
1868 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1869 
1870 	PG_FREE_IF_COPY(arg1, 0);
1871 	PG_FREE_IF_COPY(arg2, 1);
1872 
1873 	PG_RETURN_BOOL(result);
1874 }
1875 
1876 Datum
text_gt(PG_FUNCTION_ARGS)1877 text_gt(PG_FUNCTION_ARGS)
1878 {
1879 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1880 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1881 	bool		result;
1882 
1883 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1884 
1885 	PG_FREE_IF_COPY(arg1, 0);
1886 	PG_FREE_IF_COPY(arg2, 1);
1887 
1888 	PG_RETURN_BOOL(result);
1889 }
1890 
1891 Datum
text_ge(PG_FUNCTION_ARGS)1892 text_ge(PG_FUNCTION_ARGS)
1893 {
1894 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1895 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1896 	bool		result;
1897 
1898 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1899 
1900 	PG_FREE_IF_COPY(arg1, 0);
1901 	PG_FREE_IF_COPY(arg2, 1);
1902 
1903 	PG_RETURN_BOOL(result);
1904 }
1905 
1906 Datum
text_starts_with(PG_FUNCTION_ARGS)1907 text_starts_with(PG_FUNCTION_ARGS)
1908 {
1909 	Datum		arg1 = PG_GETARG_DATUM(0);
1910 	Datum		arg2 = PG_GETARG_DATUM(1);
1911 	Oid			collid = PG_GET_COLLATION();
1912 	pg_locale_t mylocale = 0;
1913 	bool		result;
1914 	Size		len1,
1915 				len2;
1916 
1917 	check_collation_set(collid);
1918 
1919 	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
1920 		mylocale = pg_newlocale_from_collation(collid);
1921 
1922 	if (mylocale && !mylocale->deterministic)
1923 		ereport(ERROR,
1924 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1925 				 errmsg("nondeterministic collations are not supported for substring searches")));
1926 
1927 	len1 = toast_raw_datum_size(arg1);
1928 	len2 = toast_raw_datum_size(arg2);
1929 	if (len2 > len1)
1930 		result = false;
1931 	else
1932 	{
1933 		text	   *targ1 = text_substring(arg1, 1, len2, false);
1934 		text	   *targ2 = DatumGetTextPP(arg2);
1935 
1936 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1937 						 VARSIZE_ANY_EXHDR(targ2)) == 0);
1938 
1939 		PG_FREE_IF_COPY(targ1, 0);
1940 		PG_FREE_IF_COPY(targ2, 1);
1941 	}
1942 
1943 	PG_RETURN_BOOL(result);
1944 }
1945 
1946 Datum
bttextcmp(PG_FUNCTION_ARGS)1947 bttextcmp(PG_FUNCTION_ARGS)
1948 {
1949 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1950 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1951 	int32		result;
1952 
1953 	result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1954 
1955 	PG_FREE_IF_COPY(arg1, 0);
1956 	PG_FREE_IF_COPY(arg2, 1);
1957 
1958 	PG_RETURN_INT32(result);
1959 }
1960 
1961 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1962 bttextsortsupport(PG_FUNCTION_ARGS)
1963 {
1964 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1965 	Oid			collid = ssup->ssup_collation;
1966 	MemoryContext oldcontext;
1967 
1968 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1969 
1970 	/* Use generic string SortSupport */
1971 	varstr_sortsupport(ssup, TEXTOID, collid);
1972 
1973 	MemoryContextSwitchTo(oldcontext);
1974 
1975 	PG_RETURN_VOID();
1976 }
1977 
1978 /*
1979  * Generic sortsupport interface for character type's operator classes.
1980  * Includes locale support, and support for BpChar semantics (i.e. removing
1981  * trailing spaces before comparison).
1982  *
1983  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1984  * same representation.  Callers that always use the C collation (e.g.
1985  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1986  * this will not work with any other collation, though.
1987  */
1988 void
varstr_sortsupport(SortSupport ssup,Oid typid,Oid collid)1989 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1990 {
1991 	bool		abbreviate = ssup->abbreviate;
1992 	bool		collate_c = false;
1993 	VarStringSortSupport *sss;
1994 	pg_locale_t locale = 0;
1995 
1996 	check_collation_set(collid);
1997 
1998 	/*
1999 	 * If possible, set ssup->comparator to a function which can be used to
2000 	 * directly compare two datums.  If we can do this, we'll avoid the
2001 	 * overhead of a trip through the fmgr layer for every comparison, which
2002 	 * can be substantial.
2003 	 *
2004 	 * Most typically, we'll set the comparator to varlenafastcmp_locale,
2005 	 * which uses strcoll() to perform comparisons.  We use that for the
2006 	 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
2007 	 * LC_COLLATE = C, we can make things quite a bit faster with
2008 	 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
2009 	 * memcmp() rather than strcoll().
2010 	 */
2011 	if (lc_collate_is_c(collid))
2012 	{
2013 		if (typid == BPCHAROID)
2014 			ssup->comparator = bpcharfastcmp_c;
2015 		else if (typid == NAMEOID)
2016 		{
2017 			ssup->comparator = namefastcmp_c;
2018 			/* Not supporting abbreviation with type NAME, for now */
2019 			abbreviate = false;
2020 		}
2021 		else
2022 			ssup->comparator = varstrfastcmp_c;
2023 
2024 		collate_c = true;
2025 	}
2026 	else
2027 	{
2028 		/*
2029 		 * We need a collation-sensitive comparison.  To make things faster,
2030 		 * we'll figure out the collation based on the locale id and cache the
2031 		 * result.
2032 		 */
2033 		if (collid != DEFAULT_COLLATION_OID)
2034 			locale = pg_newlocale_from_collation(collid);
2035 
2036 		/*
2037 		 * There is a further exception on Windows.  When the database
2038 		 * encoding is UTF-8 and we are not using the C collation, complex
2039 		 * hacks are required.  We don't currently have a comparator that
2040 		 * handles that case, so we fall back on the slow method of having the
2041 		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
2042 		 * trampoline.  ICU locales work just the same on Windows, however.
2043 		 */
2044 #ifdef WIN32
2045 		if (GetDatabaseEncoding() == PG_UTF8 &&
2046 			!(locale && locale->provider == COLLPROVIDER_ICU))
2047 			return;
2048 #endif
2049 
2050 		/*
2051 		 * We use varlenafastcmp_locale except for type NAME.
2052 		 */
2053 		if (typid == NAMEOID)
2054 		{
2055 			ssup->comparator = namefastcmp_locale;
2056 			/* Not supporting abbreviation with type NAME, for now */
2057 			abbreviate = false;
2058 		}
2059 		else
2060 			ssup->comparator = varlenafastcmp_locale;
2061 	}
2062 
2063 	/*
2064 	 * Unfortunately, it seems that abbreviation for non-C collations is
2065 	 * broken on many common platforms; testing of multiple versions of glibc
2066 	 * reveals that, for many locales, strcoll() and strxfrm() do not return
2067 	 * consistent results, which is fatal to this optimization.  While no
2068 	 * other libc other than Cygwin has so far been shown to have a problem,
2069 	 * we take the conservative course of action for right now and disable
2070 	 * this categorically.  (Users who are certain this isn't a problem on
2071 	 * their system can define TRUST_STRXFRM.)
2072 	 *
2073 	 * Even apart from the risk of broken locales, it's possible that there
2074 	 * are platforms where the use of abbreviated keys should be disabled at
2075 	 * compile time.  Having only 4 byte datums could make worst-case
2076 	 * performance drastically more likely, for example.  Moreover, macOS's
2077 	 * strxfrm() implementation is known to not effectively concentrate a
2078 	 * significant amount of entropy from the original string in earlier
2079 	 * transformed blobs.  It's possible that other supported platforms are
2080 	 * similarly encumbered.  So, if we ever get past disabling this
2081 	 * categorically, we may still want or need to disable it for particular
2082 	 * platforms.
2083 	 */
2084 #ifndef TRUST_STRXFRM
2085 	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
2086 		abbreviate = false;
2087 #endif
2088 
2089 	/*
2090 	 * If we're using abbreviated keys, or if we're using a locale-aware
2091 	 * comparison, we need to initialize a VarStringSortSupport object. Both
2092 	 * cases will make use of the temporary buffers we initialize here for
2093 	 * scratch space (and to detect requirement for BpChar semantics from
2094 	 * caller), and the abbreviation case requires additional state.
2095 	 */
2096 	if (abbreviate || !collate_c)
2097 	{
2098 		sss = palloc(sizeof(VarStringSortSupport));
2099 		sss->buf1 = palloc(TEXTBUFLEN);
2100 		sss->buflen1 = TEXTBUFLEN;
2101 		sss->buf2 = palloc(TEXTBUFLEN);
2102 		sss->buflen2 = TEXTBUFLEN;
2103 		/* Start with invalid values */
2104 		sss->last_len1 = -1;
2105 		sss->last_len2 = -1;
2106 		/* Initialize */
2107 		sss->last_returned = 0;
2108 		sss->locale = locale;
2109 
2110 		/*
2111 		 * To avoid somehow confusing a strxfrm() blob and an original string,
2112 		 * constantly keep track of the variety of data that buf1 and buf2
2113 		 * currently contain.
2114 		 *
2115 		 * Comparisons may be interleaved with conversion calls.  Frequently,
2116 		 * conversions and comparisons are batched into two distinct phases,
2117 		 * but the correctness of caching cannot hinge upon this.  For
2118 		 * comparison caching, buffer state is only trusted if cache_blob is
2119 		 * found set to false, whereas strxfrm() caching only trusts the state
2120 		 * when cache_blob is found set to true.
2121 		 *
2122 		 * Arbitrarily initialize cache_blob to true.
2123 		 */
2124 		sss->cache_blob = true;
2125 		sss->collate_c = collate_c;
2126 		sss->typid = typid;
2127 		ssup->ssup_extra = sss;
2128 
2129 		/*
2130 		 * If possible, plan to use the abbreviated keys optimization.  The
2131 		 * core code may switch back to authoritative comparator should
2132 		 * abbreviation be aborted.
2133 		 */
2134 		if (abbreviate)
2135 		{
2136 			sss->prop_card = 0.20;
2137 			initHyperLogLog(&sss->abbr_card, 10);
2138 			initHyperLogLog(&sss->full_card, 10);
2139 			ssup->abbrev_full_comparator = ssup->comparator;
2140 			ssup->comparator = varstrcmp_abbrev;
2141 			ssup->abbrev_converter = varstr_abbrev_convert;
2142 			ssup->abbrev_abort = varstr_abbrev_abort;
2143 		}
2144 	}
2145 }
2146 
2147 /*
2148  * sortsupport comparison func (for C locale case)
2149  */
2150 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)2151 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
2152 {
2153 	VarString  *arg1 = DatumGetVarStringPP(x);
2154 	VarString  *arg2 = DatumGetVarStringPP(y);
2155 	char	   *a1p,
2156 			   *a2p;
2157 	int			len1,
2158 				len2,
2159 				result;
2160 
2161 	a1p = VARDATA_ANY(arg1);
2162 	a2p = VARDATA_ANY(arg2);
2163 
2164 	len1 = VARSIZE_ANY_EXHDR(arg1);
2165 	len2 = VARSIZE_ANY_EXHDR(arg2);
2166 
2167 	result = memcmp(a1p, a2p, Min(len1, len2));
2168 	if ((result == 0) && (len1 != len2))
2169 		result = (len1 < len2) ? -1 : 1;
2170 
2171 	/* We can't afford to leak memory here. */
2172 	if (PointerGetDatum(arg1) != x)
2173 		pfree(arg1);
2174 	if (PointerGetDatum(arg2) != y)
2175 		pfree(arg2);
2176 
2177 	return result;
2178 }
2179 
2180 /*
2181  * sortsupport comparison func (for BpChar C locale case)
2182  *
2183  * BpChar outsources its sortsupport to this module.  Specialization for the
2184  * varstr_sortsupport BpChar case, modeled on
2185  * internal_bpchar_pattern_compare().
2186  */
2187 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)2188 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2189 {
2190 	BpChar	   *arg1 = DatumGetBpCharPP(x);
2191 	BpChar	   *arg2 = DatumGetBpCharPP(y);
2192 	char	   *a1p,
2193 			   *a2p;
2194 	int			len1,
2195 				len2,
2196 				result;
2197 
2198 	a1p = VARDATA_ANY(arg1);
2199 	a2p = VARDATA_ANY(arg2);
2200 
2201 	len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2202 	len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2203 
2204 	result = memcmp(a1p, a2p, Min(len1, len2));
2205 	if ((result == 0) && (len1 != len2))
2206 		result = (len1 < len2) ? -1 : 1;
2207 
2208 	/* We can't afford to leak memory here. */
2209 	if (PointerGetDatum(arg1) != x)
2210 		pfree(arg1);
2211 	if (PointerGetDatum(arg2) != y)
2212 		pfree(arg2);
2213 
2214 	return result;
2215 }
2216 
2217 /*
2218  * sortsupport comparison func (for NAME C locale case)
2219  */
2220 static int
namefastcmp_c(Datum x,Datum y,SortSupport ssup)2221 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2222 {
2223 	Name		arg1 = DatumGetName(x);
2224 	Name		arg2 = DatumGetName(y);
2225 
2226 	return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2227 }
2228 
2229 /*
2230  * sortsupport comparison func (for locale case with all varlena types)
2231  */
2232 static int
varlenafastcmp_locale(Datum x,Datum y,SortSupport ssup)2233 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2234 {
2235 	VarString  *arg1 = DatumGetVarStringPP(x);
2236 	VarString  *arg2 = DatumGetVarStringPP(y);
2237 	char	   *a1p,
2238 			   *a2p;
2239 	int			len1,
2240 				len2,
2241 				result;
2242 
2243 	a1p = VARDATA_ANY(arg1);
2244 	a2p = VARDATA_ANY(arg2);
2245 
2246 	len1 = VARSIZE_ANY_EXHDR(arg1);
2247 	len2 = VARSIZE_ANY_EXHDR(arg2);
2248 
2249 	result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2250 
2251 	/* We can't afford to leak memory here. */
2252 	if (PointerGetDatum(arg1) != x)
2253 		pfree(arg1);
2254 	if (PointerGetDatum(arg2) != y)
2255 		pfree(arg2);
2256 
2257 	return result;
2258 }
2259 
2260 /*
2261  * sortsupport comparison func (for locale case with NAME type)
2262  */
2263 static int
namefastcmp_locale(Datum x,Datum y,SortSupport ssup)2264 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2265 {
2266 	Name		arg1 = DatumGetName(x);
2267 	Name		arg2 = DatumGetName(y);
2268 
2269 	return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2270 								NameStr(*arg2), strlen(NameStr(*arg2)),
2271 								ssup);
2272 }
2273 
/*
 * sortsupport comparison func for locale cases
 *
 * Shared worker behind varlenafastcmp_locale() and namefastcmp_locale().
 *
 * a1p/len1 and a2p/len2 are the two strings and their lengths in bytes.  The
 * inputs are copied into sss->buf1/buf2 and NUL-terminated there before being
 * handed to strcoll()/strcoll_l(); the ICU paths are passed the caller's
 * pointers and lengths directly.  ssup->ssup_extra must point to the
 * VarStringSortSupport state installed by varstr_sortsupport().
 *
 * Returns zero when the strings are considered equal, otherwise the nonzero
 * result of the collation routine or of the strcmp() tie-breaker.  The
 * buffers and the last result are kept in the state so that a repeat of the
 * previous comparison can be answered without calling the collation routine
 * again.
 */
static int
varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
{
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
	int			result;
	bool		arg1_match;

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		return 0;
	}

	if (sss->typid == BPCHAROID)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each scratch buffer can hold its string plus a terminating
	 * NUL.  The old buffer is freed and a new one allocated in ssup_cxt; the
	 * previous contents are discarded rather than copied.
	 */
	if (len1 >= sss->buflen1)
	{
		pfree(sss->buf1);
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		pfree(sss->buf2);
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		/* buf1 holds something else: refresh it with a NUL-terminated copy */
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		/* buf2 holds something else: refresh it with a NUL-terminated copy */
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/* Use result cached following last actual strcoll() call */
		return sss->last_returned;
	}

	/*
	 * Full comparison.  A non-NULL sss->locale selects the provider-specific
	 * routine; otherwise plain strcoll() is used on the buffer copies.
	 */
	if (sss->locale)
	{
		if (sss->locale->provider == COLLPROVIDER_ICU)
		{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
			/* With a UTF-8 database, ICU can compare the bytes directly */
			if (GetDatabaseEncoding() == PG_UTF8)
			{
				UErrorCode	status;

				status = U_ZERO_ERROR;
				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
										  a1p, len1,
										  a2p, len2,
										  &status);
				if (U_FAILURE(status))
					ereport(ERROR,
							(errmsg("collation failed: %s", u_errorName(status))));
			}
			else
#endif
			{
				/* Otherwise convert both strings to UChar for ucol_strcoll() */
				int32_t		ulen1,
							ulen2;
				UChar	   *uchar1,
						   *uchar2;

				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
				ulen2 = icu_to_uchar(&uchar2, a2p, len2);

				result = ucol_strcoll(sss->locale->info.icu.ucol,
									  uchar1, ulen1,
									  uchar2, ulen2);

				pfree(uchar1);
				pfree(uchar2);
			}
#else							/* not USE_ICU */
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif							/* not USE_ICU */
		}
		else
		{
#ifdef HAVE_LOCALE_T
			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
		}
	}
	else
		result = strcoll(sss->buf1, sss->buf2);

	/*
	 * Break tie if necessary.  The strcmp() tie-breaker is skipped only when
	 * a locale object is present and it is not marked deterministic.
	 */
	if (result == 0 &&
		(!sss->locale || sss->locale->deterministic))
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
	return result;
}
2425 
2426 /*
2427  * Abbreviated key comparison func
2428  */
2429 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2430 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2431 {
2432 	/*
2433 	 * When 0 is returned, the core system will call varstrfastcmp_c()
2434 	 * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale().  Even a
2435 	 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2436 	 * authoritatively, for the same reason that there is a strcoll()
2437 	 * tie-breaker call to strcmp() in varstr_cmp().
2438 	 */
2439 	if (x > y)
2440 		return 1;
2441 	else if (x == y)
2442 		return 0;
2443 	else
2444 		return -1;
2445 }
2446 
2447 /*
2448  * Conversion routine for sortsupport.  Converts original to abbreviated key
2449  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2450  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2451  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2452  * locale is used, or in case of bytea, just memcpy() from original instead.
2453  */
2454 static Datum
varstr_abbrev_convert(Datum original,SortSupport ssup)2455 varstr_abbrev_convert(Datum original, SortSupport ssup)
2456 {
2457 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2458 	VarString  *authoritative = DatumGetVarStringPP(original);
2459 	char	   *authoritative_data = VARDATA_ANY(authoritative);
2460 
2461 	/* working state */
2462 	Datum		res;
2463 	char	   *pres;
2464 	int			len;
2465 	uint32		hash;
2466 
2467 	pres = (char *) &res;
2468 	/* memset(), so any non-overwritten bytes are NUL */
2469 	memset(pres, 0, sizeof(Datum));
2470 	len = VARSIZE_ANY_EXHDR(authoritative);
2471 
2472 	/* Get number of bytes, ignoring trailing spaces */
2473 	if (sss->typid == BPCHAROID)
2474 		len = bpchartruelen(authoritative_data, len);
2475 
2476 	/*
2477 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2478 	 * abbreviate keys.  The full comparator for the C locale is always
2479 	 * memcmp().  It would be incorrect to allow bytea callers (callers that
2480 	 * always force the C collation -- bytea isn't a collatable type, but this
2481 	 * approach is convenient) to use strxfrm().  This is because bytea
2482 	 * strings may contain NUL bytes.  Besides, this should be faster, too.
2483 	 *
2484 	 * More generally, it's okay that bytea callers can have NUL bytes in
2485 	 * strings because varstrcmp_abbrev() need not make a distinction between
2486 	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2487 	 * authoritative representation.  Hopefully a comparison at or past one
2488 	 * abbreviated key's terminating NUL byte will resolve the comparison
2489 	 * without consulting the authoritative representation; specifically, some
2490 	 * later non-NUL byte in the longer string can resolve the comparison
2491 	 * against a subsequent terminating NUL in the shorter string.  There will
2492 	 * usually be what is effectively a "length-wise" resolution there and
2493 	 * then.
2494 	 *
2495 	 * If that doesn't work out -- if all bytes in the longer string
2496 	 * positioned at or past the offset of the smaller string's (first)
2497 	 * terminating NUL are actually representative of NUL bytes in the
2498 	 * authoritative binary string (perhaps with some *terminating* NUL bytes
2499 	 * towards the end of the longer string iff it happens to still be small)
2500 	 * -- then an authoritative tie-breaker will happen, and do the right
2501 	 * thing: explicitly consider string length.
2502 	 */
2503 	if (sss->collate_c)
2504 		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2505 	else
2506 	{
2507 		Size		bsize;
2508 #ifdef USE_ICU
2509 		int32_t		ulen = -1;
2510 		UChar	   *uchar = NULL;
2511 #endif
2512 
2513 		/*
2514 		 * We're not using the C collation, so fall back on strxfrm or ICU
2515 		 * analogs.
2516 		 */
2517 
2518 		/* By convention, we use buffer 1 to store and NUL-terminate */
2519 		if (len >= sss->buflen1)
2520 		{
2521 			pfree(sss->buf1);
2522 			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2523 			sss->buf1 = palloc(sss->buflen1);
2524 		}
2525 
2526 		/* Might be able to reuse strxfrm() blob from last call */
2527 		if (sss->last_len1 == len && sss->cache_blob &&
2528 			memcmp(sss->buf1, authoritative_data, len) == 0)
2529 		{
2530 			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2531 			/* No change affecting cardinality, so no hashing required */
2532 			goto done;
2533 		}
2534 
2535 		memcpy(sss->buf1, authoritative_data, len);
2536 
2537 		/*
2538 		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2539 		 * necessary for ICU, but doesn't hurt.
2540 		 */
2541 		sss->buf1[len] = '\0';
2542 		sss->last_len1 = len;
2543 
2544 #ifdef USE_ICU
2545 		/* When using ICU and not UTF8, convert string to UChar. */
2546 		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2547 			GetDatabaseEncoding() != PG_UTF8)
2548 			ulen = icu_to_uchar(&uchar, sss->buf1, len);
2549 #endif
2550 
2551 		/*
2552 		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2553 		 * and try again.  Both of these functions have the result buffer
2554 		 * content undefined if the result did not fit, so we need to retry
2555 		 * until everything fits, even though we only need the first few bytes
2556 		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
2557 		 * ask for as many bytes as we actually need.
2558 		 */
2559 		for (;;)
2560 		{
2561 #ifdef USE_ICU
2562 			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2563 			{
2564 				/*
2565 				 * When using UTF8, use the iteration interface so we only
2566 				 * need to produce as many bytes as we actually need.
2567 				 */
2568 				if (GetDatabaseEncoding() == PG_UTF8)
2569 				{
2570 					UCharIterator iter;
2571 					uint32_t	state[2];
2572 					UErrorCode	status;
2573 
2574 					uiter_setUTF8(&iter, sss->buf1, len);
2575 					state[0] = state[1] = 0;	/* won't need that again */
2576 					status = U_ZERO_ERROR;
2577 					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2578 												 &iter,
2579 												 state,
2580 												 (uint8_t *) sss->buf2,
2581 												 Min(sizeof(Datum), sss->buflen2),
2582 												 &status);
2583 					if (U_FAILURE(status))
2584 						ereport(ERROR,
2585 								(errmsg("sort key generation failed: %s",
2586 										u_errorName(status))));
2587 				}
2588 				else
2589 					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2590 											uchar, ulen,
2591 											(uint8_t *) sss->buf2, sss->buflen2);
2592 			}
2593 			else
2594 #endif
2595 #ifdef HAVE_LOCALE_T
2596 			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2597 				bsize = strxfrm_l(sss->buf2, sss->buf1,
2598 								  sss->buflen2, sss->locale->info.lt);
2599 			else
2600 #endif
2601 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2602 
2603 			sss->last_len2 = bsize;
2604 			if (bsize < sss->buflen2)
2605 				break;
2606 
2607 			/*
2608 			 * Grow buffer and retry.
2609 			 */
2610 			pfree(sss->buf2);
2611 			sss->buflen2 = Max(bsize + 1,
2612 							   Min(sss->buflen2 * 2, MaxAllocSize));
2613 			sss->buf2 = palloc(sss->buflen2);
2614 		}
2615 
2616 		/*
2617 		 * Every Datum byte is always compared.  This is safe because the
2618 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
2619 		 * misinterpreting any NUL bytes not intended to be interpreted as
2620 		 * logically representing termination.
2621 		 *
2622 		 * (Actually, even if there were NUL bytes in the blob it would be
2623 		 * okay.  See remarks on bytea case above.)
2624 		 */
2625 		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2626 
2627 #ifdef USE_ICU
2628 		if (uchar)
2629 			pfree(uchar);
2630 #endif
2631 	}
2632 
2633 	/*
2634 	 * Maintain approximate cardinality of both abbreviated keys and original,
2635 	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
2636 	 * the worst case, where we do many string transformations for no saving
2637 	 * in full strcoll()-based comparisons.  These statistics are used by
2638 	 * varstr_abbrev_abort().
2639 	 *
2640 	 * First, Hash key proper, or a significant fraction of it.  Mix in length
2641 	 * in order to compensate for cases where differences are past
2642 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2643 	 */
2644 	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2645 								   Min(len, PG_CACHE_LINE_SIZE)));
2646 
2647 	if (len > PG_CACHE_LINE_SIZE)
2648 		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2649 
2650 	addHyperLogLog(&sss->full_card, hash);
2651 
2652 	/* Hash abbreviated key */
2653 #if SIZEOF_DATUM == 8
2654 	{
2655 		uint32		lohalf,
2656 					hihalf;
2657 
2658 		lohalf = (uint32) res;
2659 		hihalf = (uint32) (res >> 32);
2660 		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2661 	}
2662 #else							/* SIZEOF_DATUM != 8 */
2663 	hash = DatumGetUInt32(hash_uint32((uint32) res));
2664 #endif
2665 
2666 	addHyperLogLog(&sss->abbr_card, hash);
2667 
2668 	/* Cache result, perhaps saving an expensive strxfrm() call next time */
2669 	sss->cache_blob = true;
2670 done:
2671 
2672 	/*
2673 	 * Byteswap on little-endian machines.
2674 	 *
2675 	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2676 	 * comparator) works correctly on all platforms.  If we didn't do this,
2677 	 * the comparator would have to call memcmp() with a pair of pointers to
2678 	 * the first byte of each abbreviated key, which is slower.
2679 	 */
2680 	res = DatumBigEndianToNative(res);
2681 
2682 	/* Don't leak memory here */
2683 	if (PointerGetDatum(authoritative) != original)
2684 		pfree(authoritative);
2685 
2686 	return res;
2687 }
2688 
2689 /*
2690  * Callback for estimating effectiveness of abbreviated key optimization, using
2691  * heuristic rules.  Returns value indicating if the abbreviation optimization
2692  * should be aborted, based on its projected effectiveness.
2693  */
2694 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2695 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2696 {
2697 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2698 	double		abbrev_distinct,
2699 				key_distinct;
2700 
2701 	Assert(ssup->abbreviate);
2702 
2703 	/* Have a little patience */
2704 	if (memtupcount < 100)
2705 		return false;
2706 
2707 	abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2708 	key_distinct = estimateHyperLogLog(&sss->full_card);
2709 
2710 	/*
2711 	 * Clamp cardinality estimates to at least one distinct value.  While
2712 	 * NULLs are generally disregarded, if only NULL values were seen so far,
2713 	 * that might misrepresent costs if we failed to clamp.
2714 	 */
2715 	if (abbrev_distinct <= 1.0)
2716 		abbrev_distinct = 1.0;
2717 
2718 	if (key_distinct <= 1.0)
2719 		key_distinct = 1.0;
2720 
2721 	/*
2722 	 * In the worst case all abbreviated keys are identical, while at the same
2723 	 * time there are differences within full key strings not captured in
2724 	 * abbreviations.
2725 	 */
2726 #ifdef TRACE_SORT
2727 	if (trace_sort)
2728 	{
2729 		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2730 
2731 		elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2732 			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2733 			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2734 			 sss->prop_card);
2735 	}
2736 #endif
2737 
2738 	/*
2739 	 * If the number of distinct abbreviated keys approximately matches the
2740 	 * number of distinct authoritative original keys, that's reason enough to
2741 	 * proceed.  We can win even with a very low cardinality set if most
2742 	 * tie-breakers only memcmp().  This is by far the most important
2743 	 * consideration.
2744 	 *
2745 	 * While comparisons that are resolved at the abbreviated key level are
2746 	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2747 	 * those two outcomes are so much cheaper than a full strcoll() once
2748 	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2749 	 * cardinality against the overall size of the set in order to more
2750 	 * accurately model costs.  Assume that an abbreviated comparison, and an
2751 	 * abbreviated comparison with a cheap memcmp()-based authoritative
2752 	 * resolution are equivalent.
2753 	 */
2754 	if (abbrev_distinct > key_distinct * sss->prop_card)
2755 	{
2756 		/*
2757 		 * When we have exceeded 10,000 tuples, decay required cardinality
2758 		 * aggressively for next call.
2759 		 *
2760 		 * This is useful because the number of comparisons required on
2761 		 * average increases at a linearithmic rate, and at roughly 10,000
2762 		 * tuples that factor will start to dominate over the linear costs of
2763 		 * string transformation (this is a conservative estimate).  The decay
2764 		 * rate is chosen to be a little less aggressive than halving -- which
2765 		 * (since we're called at points at which memtupcount has doubled)
2766 		 * would never see the cost model actually abort past the first call
2767 		 * following a decay.  This decay rate is mostly a precaution against
2768 		 * a sudden, violent swing in how well abbreviated cardinality tracks
2769 		 * full key cardinality.  The decay also serves to prevent a marginal
2770 		 * case from being aborted too late, when too much has already been
2771 		 * invested in string transformation.
2772 		 *
2773 		 * It's possible for sets of several million distinct strings with
2774 		 * mere tens of thousands of distinct abbreviated keys to still
2775 		 * benefit very significantly.  This will generally occur provided
2776 		 * each abbreviated key is a proxy for a roughly uniform number of the
2777 		 * set's full keys. If it isn't so, we hope to catch that early and
2778 		 * abort.  If it isn't caught early, by the time the problem is
2779 		 * apparent it's probably not worth aborting.
2780 		 */
2781 		if (memtupcount > 10000)
2782 			sss->prop_card *= 0.65;
2783 
2784 		return false;
2785 	}
2786 
2787 	/*
2788 	 * Abort abbreviation strategy.
2789 	 *
2790 	 * The worst case, where all abbreviated keys are identical while all
2791 	 * original strings differ will typically only see a regression of about
2792 	 * 10% in execution time for small to medium sized lists of strings.
2793 	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2794 	 * often expect very large improvements, particularly with sets of strings
2795 	 * of moderately high to high abbreviated cardinality.  There is little to
2796 	 * lose but much to gain, which our strategy reflects.
2797 	 */
2798 #ifdef TRACE_SORT
2799 	if (trace_sort)
2800 		elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2801 			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2802 			 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2803 #endif
2804 
2805 	return true;
2806 }
2807 
2808 /*
2809  * Generic equalimage support function for character type's operator classes.
2810  * Disables the use of deduplication with nondeterministic collations.
2811  */
2812 Datum
btvarstrequalimage(PG_FUNCTION_ARGS)2813 btvarstrequalimage(PG_FUNCTION_ARGS)
2814 {
2815 	/* Oid		opcintype = PG_GETARG_OID(0); */
2816 	Oid			collid = PG_GET_COLLATION();
2817 
2818 	check_collation_set(collid);
2819 
2820 	if (lc_collate_is_c(collid) ||
2821 		collid == DEFAULT_COLLATION_OID ||
2822 		get_collation_isdeterministic(collid))
2823 		PG_RETURN_BOOL(true);
2824 	else
2825 		PG_RETURN_BOOL(false);
2826 }
2827 
2828 Datum
text_larger(PG_FUNCTION_ARGS)2829 text_larger(PG_FUNCTION_ARGS)
2830 {
2831 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2832 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2833 	text	   *result;
2834 
2835 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2836 
2837 	PG_RETURN_TEXT_P(result);
2838 }
2839 
2840 Datum
text_smaller(PG_FUNCTION_ARGS)2841 text_smaller(PG_FUNCTION_ARGS)
2842 {
2843 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2844 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2845 	text	   *result;
2846 
2847 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2848 
2849 	PG_RETURN_TEXT_P(result);
2850 }
2851 
2852 
2853 /*
2854  * Cross-type comparison functions for types text and name.
2855  */
2856 
2857 Datum
nameeqtext(PG_FUNCTION_ARGS)2858 nameeqtext(PG_FUNCTION_ARGS)
2859 {
2860 	Name		arg1 = PG_GETARG_NAME(0);
2861 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2862 	size_t		len1 = strlen(NameStr(*arg1));
2863 	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
2864 	Oid			collid = PG_GET_COLLATION();
2865 	bool		result;
2866 
2867 	check_collation_set(collid);
2868 
2869 	if (collid == C_COLLATION_OID)
2870 		result = (len1 == len2 &&
2871 				  memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2872 	else
2873 		result = (varstr_cmp(NameStr(*arg1), len1,
2874 							 VARDATA_ANY(arg2), len2,
2875 							 collid) == 0);
2876 
2877 	PG_FREE_IF_COPY(arg2, 1);
2878 
2879 	PG_RETURN_BOOL(result);
2880 }
2881 
2882 Datum
texteqname(PG_FUNCTION_ARGS)2883 texteqname(PG_FUNCTION_ARGS)
2884 {
2885 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2886 	Name		arg2 = PG_GETARG_NAME(1);
2887 	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
2888 	size_t		len2 = strlen(NameStr(*arg2));
2889 	Oid			collid = PG_GET_COLLATION();
2890 	bool		result;
2891 
2892 	check_collation_set(collid);
2893 
2894 	if (collid == C_COLLATION_OID)
2895 		result = (len1 == len2 &&
2896 				  memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2897 	else
2898 		result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2899 							 NameStr(*arg2), len2,
2900 							 collid) == 0);
2901 
2902 	PG_FREE_IF_COPY(arg1, 0);
2903 
2904 	PG_RETURN_BOOL(result);
2905 }
2906 
2907 Datum
namenetext(PG_FUNCTION_ARGS)2908 namenetext(PG_FUNCTION_ARGS)
2909 {
2910 	Name		arg1 = PG_GETARG_NAME(0);
2911 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2912 	size_t		len1 = strlen(NameStr(*arg1));
2913 	size_t		len2 = VARSIZE_ANY_EXHDR(arg2);
2914 	Oid			collid = PG_GET_COLLATION();
2915 	bool		result;
2916 
2917 	check_collation_set(collid);
2918 
2919 	if (collid == C_COLLATION_OID)
2920 		result = !(len1 == len2 &&
2921 				   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2922 	else
2923 		result = !(varstr_cmp(NameStr(*arg1), len1,
2924 							  VARDATA_ANY(arg2), len2,
2925 							  collid) == 0);
2926 
2927 	PG_FREE_IF_COPY(arg2, 1);
2928 
2929 	PG_RETURN_BOOL(result);
2930 }
2931 
2932 Datum
textnename(PG_FUNCTION_ARGS)2933 textnename(PG_FUNCTION_ARGS)
2934 {
2935 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2936 	Name		arg2 = PG_GETARG_NAME(1);
2937 	size_t		len1 = VARSIZE_ANY_EXHDR(arg1);
2938 	size_t		len2 = strlen(NameStr(*arg2));
2939 	Oid			collid = PG_GET_COLLATION();
2940 	bool		result;
2941 
2942 	check_collation_set(collid);
2943 
2944 	if (collid == C_COLLATION_OID)
2945 		result = !(len1 == len2 &&
2946 				   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2947 	else
2948 		result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2949 							  NameStr(*arg2), len2,
2950 							  collid) == 0);
2951 
2952 	PG_FREE_IF_COPY(arg1, 0);
2953 
2954 	PG_RETURN_BOOL(result);
2955 }
2956 
2957 Datum
btnametextcmp(PG_FUNCTION_ARGS)2958 btnametextcmp(PG_FUNCTION_ARGS)
2959 {
2960 	Name		arg1 = PG_GETARG_NAME(0);
2961 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2962 	int32		result;
2963 
2964 	result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2965 						VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2966 						PG_GET_COLLATION());
2967 
2968 	PG_FREE_IF_COPY(arg2, 1);
2969 
2970 	PG_RETURN_INT32(result);
2971 }
2972 
2973 Datum
bttextnamecmp(PG_FUNCTION_ARGS)2974 bttextnamecmp(PG_FUNCTION_ARGS)
2975 {
2976 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2977 	Name		arg2 = PG_GETARG_NAME(1);
2978 	int32		result;
2979 
2980 	result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2981 						NameStr(*arg2), strlen(NameStr(*arg2)),
2982 						PG_GET_COLLATION());
2983 
2984 	PG_FREE_IF_COPY(arg1, 0);
2985 
2986 	PG_RETURN_INT32(result);
2987 }
2988 
/*
 * CmpCall(fn)
 *		Call comparison function "fn" through DirectFunctionCall2Coll() with
 *		the current call's collation and its arguments 0 and 1, and convert
 *		the returned Datum to an int32.  Expands to PG_GET_COLLATION() and
 *		PG_GETARG_DATUM(), so it can only be used where those are usable.
 */
#define CmpCall(fn) \
	DatumGetInt32(DirectFunctionCall2Coll(fn, \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
2994 
2995 Datum
namelttext(PG_FUNCTION_ARGS)2996 namelttext(PG_FUNCTION_ARGS)
2997 {
2998 	PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2999 }
3000 
3001 Datum
nameletext(PG_FUNCTION_ARGS)3002 nameletext(PG_FUNCTION_ARGS)
3003 {
3004 	PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
3005 }
3006 
3007 Datum
namegttext(PG_FUNCTION_ARGS)3008 namegttext(PG_FUNCTION_ARGS)
3009 {
3010 	PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
3011 }
3012 
3013 Datum
namegetext(PG_FUNCTION_ARGS)3014 namegetext(PG_FUNCTION_ARGS)
3015 {
3016 	PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
3017 }
3018 
3019 Datum
textltname(PG_FUNCTION_ARGS)3020 textltname(PG_FUNCTION_ARGS)
3021 {
3022 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
3023 }
3024 
3025 Datum
textlename(PG_FUNCTION_ARGS)3026 textlename(PG_FUNCTION_ARGS)
3027 {
3028 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
3029 }
3030 
3031 Datum
textgtname(PG_FUNCTION_ARGS)3032 textgtname(PG_FUNCTION_ARGS)
3033 {
3034 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
3035 }
3036 
3037 Datum
textgename(PG_FUNCTION_ARGS)3038 textgename(PG_FUNCTION_ARGS)
3039 {
3040 	PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
3041 }
3042 
3043 #undef CmpCall
3044 
3045 
3046 /*
3047  * The following operators support character-by-character comparison
3048  * of text datums, to allow building indexes suitable for LIKE clauses.
3049  * Note that the regular texteq/textne comparison operators, and regular
3050  * support functions 1 and 2 with "C" collation are assumed to be
3051  * compatible with these!
3052  */
3053 
3054 static int
internal_text_pattern_compare(text * arg1,text * arg2)3055 internal_text_pattern_compare(text *arg1, text *arg2)
3056 {
3057 	int			result;
3058 	int			len1,
3059 				len2;
3060 
3061 	len1 = VARSIZE_ANY_EXHDR(arg1);
3062 	len2 = VARSIZE_ANY_EXHDR(arg2);
3063 
3064 	result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3065 	if (result != 0)
3066 		return result;
3067 	else if (len1 < len2)
3068 		return -1;
3069 	else if (len1 > len2)
3070 		return 1;
3071 	else
3072 		return 0;
3073 }
3074 
3075 
3076 Datum
text_pattern_lt(PG_FUNCTION_ARGS)3077 text_pattern_lt(PG_FUNCTION_ARGS)
3078 {
3079 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3080 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3081 	int			result;
3082 
3083 	result = internal_text_pattern_compare(arg1, arg2);
3084 
3085 	PG_FREE_IF_COPY(arg1, 0);
3086 	PG_FREE_IF_COPY(arg2, 1);
3087 
3088 	PG_RETURN_BOOL(result < 0);
3089 }
3090 
3091 
3092 Datum
text_pattern_le(PG_FUNCTION_ARGS)3093 text_pattern_le(PG_FUNCTION_ARGS)
3094 {
3095 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3096 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3097 	int			result;
3098 
3099 	result = internal_text_pattern_compare(arg1, arg2);
3100 
3101 	PG_FREE_IF_COPY(arg1, 0);
3102 	PG_FREE_IF_COPY(arg2, 1);
3103 
3104 	PG_RETURN_BOOL(result <= 0);
3105 }
3106 
3107 
3108 Datum
text_pattern_ge(PG_FUNCTION_ARGS)3109 text_pattern_ge(PG_FUNCTION_ARGS)
3110 {
3111 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3112 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3113 	int			result;
3114 
3115 	result = internal_text_pattern_compare(arg1, arg2);
3116 
3117 	PG_FREE_IF_COPY(arg1, 0);
3118 	PG_FREE_IF_COPY(arg2, 1);
3119 
3120 	PG_RETURN_BOOL(result >= 0);
3121 }
3122 
3123 
3124 Datum
text_pattern_gt(PG_FUNCTION_ARGS)3125 text_pattern_gt(PG_FUNCTION_ARGS)
3126 {
3127 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3128 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3129 	int			result;
3130 
3131 	result = internal_text_pattern_compare(arg1, arg2);
3132 
3133 	PG_FREE_IF_COPY(arg1, 0);
3134 	PG_FREE_IF_COPY(arg2, 1);
3135 
3136 	PG_RETURN_BOOL(result > 0);
3137 }
3138 
3139 
3140 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)3141 bttext_pattern_cmp(PG_FUNCTION_ARGS)
3142 {
3143 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
3144 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
3145 	int			result;
3146 
3147 	result = internal_text_pattern_compare(arg1, arg2);
3148 
3149 	PG_FREE_IF_COPY(arg1, 0);
3150 	PG_FREE_IF_COPY(arg2, 1);
3151 
3152 	PG_RETURN_INT32(result);
3153 }
3154 
3155 
3156 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)3157 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
3158 {
3159 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3160 	MemoryContext oldcontext;
3161 
3162 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3163 
3164 	/* Use generic string SortSupport, forcing "C" collation */
3165 	varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
3166 
3167 	MemoryContextSwitchTo(oldcontext);
3168 
3169 	PG_RETURN_VOID();
3170 }
3171 
3172 
3173 /*-------------------------------------------------------------
3174  * byteaoctetlen
3175  *
3176  * get the number of bytes contained in an instance of type 'bytea'
3177  *-------------------------------------------------------------
3178  */
3179 Datum
byteaoctetlen(PG_FUNCTION_ARGS)3180 byteaoctetlen(PG_FUNCTION_ARGS)
3181 {
3182 	Datum		str = PG_GETARG_DATUM(0);
3183 
3184 	/* We need not detoast the input at all */
3185 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
3186 }
3187 
3188 /*
3189  * byteacat -
3190  *	  takes two bytea* and returns a bytea* that is the concatenation of
3191  *	  the two.
3192  *
3193  * Cloned from textcat and modified as required.
3194  */
3195 Datum
byteacat(PG_FUNCTION_ARGS)3196 byteacat(PG_FUNCTION_ARGS)
3197 {
3198 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3199 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3200 
3201 	PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
3202 }
3203 
3204 /*
3205  * bytea_catenate
3206  *	Guts of byteacat(), broken out so it can be used by other functions
3207  *
3208  * Arguments can be in short-header form, but not compressed or out-of-line
3209  */
3210 static bytea *
bytea_catenate(bytea * t1,bytea * t2)3211 bytea_catenate(bytea *t1, bytea *t2)
3212 {
3213 	bytea	   *result;
3214 	int			len1,
3215 				len2,
3216 				len;
3217 	char	   *ptr;
3218 
3219 	len1 = VARSIZE_ANY_EXHDR(t1);
3220 	len2 = VARSIZE_ANY_EXHDR(t2);
3221 
3222 	/* paranoia ... probably should throw error instead? */
3223 	if (len1 < 0)
3224 		len1 = 0;
3225 	if (len2 < 0)
3226 		len2 = 0;
3227 
3228 	len = len1 + len2 + VARHDRSZ;
3229 	result = (bytea *) palloc(len);
3230 
3231 	/* Set size of result string... */
3232 	SET_VARSIZE(result, len);
3233 
3234 	/* Fill data field of result string... */
3235 	ptr = VARDATA(result);
3236 	if (len1 > 0)
3237 		memcpy(ptr, VARDATA_ANY(t1), len1);
3238 	if (len2 > 0)
3239 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
3240 
3241 	return result;
3242 }
3243 
/* Build a bytea from a C string by passing it through byteain() */
#define PG_STR_GET_BYTEA(cstr) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(cstr)))
3246 
3247 /*
3248  * bytea_substr()
3249  * Return a substring starting at the specified position.
3250  * Cloned from text_substr and modified as required.
3251  *
3252  * Input:
3253  *	- string
3254  *	- starting position (is one-based)
3255  *	- string length (optional)
3256  *
3257  * If the starting position is zero or less, then return from the start of the string
3258  * adjusting the length to be consistent with the "negative start" per SQL.
3259  * If the length is less than zero, an ERROR is thrown. If no third argument
3260  * (length) is provided, the length to the end of the string is assumed.
3261  */
3262 Datum
bytea_substr(PG_FUNCTION_ARGS)3263 bytea_substr(PG_FUNCTION_ARGS)
3264 {
3265 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3266 									  PG_GETARG_INT32(1),
3267 									  PG_GETARG_INT32(2),
3268 									  false));
3269 }
3270 
3271 /*
3272  * bytea_substr_no_len -
3273  *	  Wrapper to avoid opr_sanity failure due to
3274  *	  one function accepting a different number of args.
3275  */
3276 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)3277 bytea_substr_no_len(PG_FUNCTION_ARGS)
3278 {
3279 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3280 									  PG_GETARG_INT32(1),
3281 									  -1,
3282 									  true));
3283 }
3284 
3285 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)3286 bytea_substring(Datum str,
3287 				int S,
3288 				int L,
3289 				bool length_not_specified)
3290 {
3291 	int32		S1;				/* adjusted start position */
3292 	int32		L1;				/* adjusted substring length */
3293 	int32		E;				/* end position */
3294 
3295 	/*
3296 	 * The logic here should generally match text_substring().
3297 	 */
3298 	S1 = Max(S, 1);
3299 
3300 	if (length_not_specified)
3301 	{
3302 		/*
3303 		 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3304 		 * end of the string if we pass it a negative value for length.
3305 		 */
3306 		L1 = -1;
3307 	}
3308 	else if (L < 0)
3309 	{
3310 		/* SQL99 says to throw an error for E < S, i.e., negative length */
3311 		ereport(ERROR,
3312 				(errcode(ERRCODE_SUBSTRING_ERROR),
3313 				 errmsg("negative substring length not allowed")));
3314 		L1 = -1;				/* silence stupider compilers */
3315 	}
3316 	else if (pg_add_s32_overflow(S, L, &E))
3317 	{
3318 		/*
3319 		 * L could be large enough for S + L to overflow, in which case the
3320 		 * substring must run to end of string.
3321 		 */
3322 		L1 = -1;
3323 	}
3324 	else
3325 	{
3326 		/*
3327 		 * A zero or negative value for the end position can happen if the
3328 		 * start was negative or one. SQL99 says to return a zero-length
3329 		 * string.
3330 		 */
3331 		if (E < 1)
3332 			return PG_STR_GET_BYTEA("");
3333 
3334 		L1 = E - S1;
3335 	}
3336 
3337 	/*
3338 	 * If the start position is past the end of the string, SQL99 says to
3339 	 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3340 	 * us.  We need only convert S1 to zero-based starting position.
3341 	 */
3342 	return DatumGetByteaPSlice(str, S1 - 1, L1);
3343 }
3344 
3345 /*
3346  * byteaoverlay
3347  *	Replace specified substring of first string with second
3348  *
3349  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3350  * This code is a direct implementation of what the standard says.
3351  */
3352 Datum
byteaoverlay(PG_FUNCTION_ARGS)3353 byteaoverlay(PG_FUNCTION_ARGS)
3354 {
3355 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3356 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3357 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
3358 	int			sl = PG_GETARG_INT32(3);	/* substring length */
3359 
3360 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3361 }
3362 
3363 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)3364 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3365 {
3366 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3367 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3368 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
3369 	int			sl;
3370 
3371 	sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3372 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3373 }
3374 
3375 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)3376 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3377 {
3378 	bytea	   *result;
3379 	bytea	   *s1;
3380 	bytea	   *s2;
3381 	int			sp_pl_sl;
3382 
3383 	/*
3384 	 * Check for possible integer-overflow cases.  For negative sp, throw a
3385 	 * "substring length" error because that's what should be expected
3386 	 * according to the spec's definition of OVERLAY().
3387 	 */
3388 	if (sp <= 0)
3389 		ereport(ERROR,
3390 				(errcode(ERRCODE_SUBSTRING_ERROR),
3391 				 errmsg("negative substring length not allowed")));
3392 	if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3393 		ereport(ERROR,
3394 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3395 				 errmsg("integer out of range")));
3396 
3397 	s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3398 	s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3399 	result = bytea_catenate(s1, t2);
3400 	result = bytea_catenate(result, s2);
3401 
3402 	return result;
3403 }
3404 
3405 /*
3406  * byteapos -
3407  *	  Return the position of the specified substring.
3408  *	  Implements the SQL POSITION() function.
3409  * Cloned from textpos and modified as required.
3410  */
3411 Datum
byteapos(PG_FUNCTION_ARGS)3412 byteapos(PG_FUNCTION_ARGS)
3413 {
3414 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
3415 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
3416 	int			pos;
3417 	int			px,
3418 				p;
3419 	int			len1,
3420 				len2;
3421 	char	   *p1,
3422 			   *p2;
3423 
3424 	len1 = VARSIZE_ANY_EXHDR(t1);
3425 	len2 = VARSIZE_ANY_EXHDR(t2);
3426 
3427 	if (len2 <= 0)
3428 		PG_RETURN_INT32(1);		/* result for empty pattern */
3429 
3430 	p1 = VARDATA_ANY(t1);
3431 	p2 = VARDATA_ANY(t2);
3432 
3433 	pos = 0;
3434 	px = (len1 - len2);
3435 	for (p = 0; p <= px; p++)
3436 	{
3437 		if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3438 		{
3439 			pos = p + 1;
3440 			break;
3441 		};
3442 		p1++;
3443 	};
3444 
3445 	PG_RETURN_INT32(pos);
3446 }
3447 
3448 /*-------------------------------------------------------------
3449  * byteaGetByte
3450  *
3451  * this routine treats "bytea" as an array of bytes.
3452  * It returns the Nth byte (a number between 0 and 255).
3453  *-------------------------------------------------------------
3454  */
3455 Datum
byteaGetByte(PG_FUNCTION_ARGS)3456 byteaGetByte(PG_FUNCTION_ARGS)
3457 {
3458 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3459 	int32		n = PG_GETARG_INT32(1);
3460 	int			len;
3461 	int			byte;
3462 
3463 	len = VARSIZE_ANY_EXHDR(v);
3464 
3465 	if (n < 0 || n >= len)
3466 		ereport(ERROR,
3467 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3468 				 errmsg("index %d out of valid range, 0..%d",
3469 						n, len - 1)));
3470 
3471 	byte = ((unsigned char *) VARDATA_ANY(v))[n];
3472 
3473 	PG_RETURN_INT32(byte);
3474 }
3475 
3476 /*-------------------------------------------------------------
3477  * byteaGetBit
3478  *
3479  * This routine treats a "bytea" type like an array of bits.
3480  * It returns the value of the Nth bit (0 or 1).
3481  *
3482  *-------------------------------------------------------------
3483  */
3484 Datum
byteaGetBit(PG_FUNCTION_ARGS)3485 byteaGetBit(PG_FUNCTION_ARGS)
3486 {
3487 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3488 	int64		n = PG_GETARG_INT64(1);
3489 	int			byteNo,
3490 				bitNo;
3491 	int			len;
3492 	int			byte;
3493 
3494 	len = VARSIZE_ANY_EXHDR(v);
3495 
3496 	if (n < 0 || n >= (int64) len * 8)
3497 		ereport(ERROR,
3498 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3499 				 errmsg("index %lld out of valid range, 0..%lld",
3500 						(long long) n, (long long) len * 8 - 1)));
3501 
3502 	/* n/8 is now known < len, so safe to cast to int */
3503 	byteNo = (int) (n / 8);
3504 	bitNo = (int) (n % 8);
3505 
3506 	byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3507 
3508 	if (byte & (1 << bitNo))
3509 		PG_RETURN_INT32(1);
3510 	else
3511 		PG_RETURN_INT32(0);
3512 }
3513 
3514 /*-------------------------------------------------------------
3515  * byteaSetByte
3516  *
3517  * Given an instance of type 'bytea' creates a new one with
3518  * the Nth byte set to the given value.
3519  *
3520  *-------------------------------------------------------------
3521  */
3522 Datum
byteaSetByte(PG_FUNCTION_ARGS)3523 byteaSetByte(PG_FUNCTION_ARGS)
3524 {
3525 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3526 	int32		n = PG_GETARG_INT32(1);
3527 	int32		newByte = PG_GETARG_INT32(2);
3528 	int			len;
3529 
3530 	len = VARSIZE(res) - VARHDRSZ;
3531 
3532 	if (n < 0 || n >= len)
3533 		ereport(ERROR,
3534 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3535 				 errmsg("index %d out of valid range, 0..%d",
3536 						n, len - 1)));
3537 
3538 	/*
3539 	 * Now set the byte.
3540 	 */
3541 	((unsigned char *) VARDATA(res))[n] = newByte;
3542 
3543 	PG_RETURN_BYTEA_P(res);
3544 }
3545 
3546 /*-------------------------------------------------------------
3547  * byteaSetBit
3548  *
3549  * Given an instance of type 'bytea' creates a new one with
3550  * the Nth bit set to the given value.
3551  *
3552  *-------------------------------------------------------------
3553  */
3554 Datum
byteaSetBit(PG_FUNCTION_ARGS)3555 byteaSetBit(PG_FUNCTION_ARGS)
3556 {
3557 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3558 	int64		n = PG_GETARG_INT64(1);
3559 	int32		newBit = PG_GETARG_INT32(2);
3560 	int			len;
3561 	int			oldByte,
3562 				newByte;
3563 	int			byteNo,
3564 				bitNo;
3565 
3566 	len = VARSIZE(res) - VARHDRSZ;
3567 
3568 	if (n < 0 || n >= (int64) len * 8)
3569 		ereport(ERROR,
3570 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3571 				 errmsg("index %lld out of valid range, 0..%lld",
3572 						(long long) n, (long long) len * 8 - 1)));
3573 
3574 	/* n/8 is now known < len, so safe to cast to int */
3575 	byteNo = (int) (n / 8);
3576 	bitNo = (int) (n % 8);
3577 
3578 	/*
3579 	 * sanity check!
3580 	 */
3581 	if (newBit != 0 && newBit != 1)
3582 		ereport(ERROR,
3583 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3584 				 errmsg("new bit must be 0 or 1")));
3585 
3586 	/*
3587 	 * Update the byte.
3588 	 */
3589 	oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3590 
3591 	if (newBit == 0)
3592 		newByte = oldByte & (~(1 << bitNo));
3593 	else
3594 		newByte = oldByte | (1 << bitNo);
3595 
3596 	((unsigned char *) VARDATA(res))[byteNo] = newByte;
3597 
3598 	PG_RETURN_BYTEA_P(res);
3599 }
3600 
3601 
3602 /* text_name()
3603  * Converts a text type to a Name type.
3604  */
3605 Datum
text_name(PG_FUNCTION_ARGS)3606 text_name(PG_FUNCTION_ARGS)
3607 {
3608 	text	   *s = PG_GETARG_TEXT_PP(0);
3609 	Name		result;
3610 	int			len;
3611 
3612 	len = VARSIZE_ANY_EXHDR(s);
3613 
3614 	/* Truncate oversize input */
3615 	if (len >= NAMEDATALEN)
3616 		len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3617 
3618 	/* We use palloc0 here to ensure result is zero-padded */
3619 	result = (Name) palloc0(NAMEDATALEN);
3620 	memcpy(NameStr(*result), VARDATA_ANY(s), len);
3621 
3622 	PG_RETURN_NAME(result);
3623 }
3624 
3625 /* name_text()
3626  * Converts a Name type to a text type.
3627  */
3628 Datum
name_text(PG_FUNCTION_ARGS)3629 name_text(PG_FUNCTION_ARGS)
3630 {
3631 	Name		s = PG_GETARG_NAME(0);
3632 
3633 	PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3634 }
3635 
3636 
3637 /*
3638  * textToQualifiedNameList - convert a text object to list of names
3639  *
3640  * This implements the input parsing needed by nextval() and other
3641  * functions that take a text parameter representing a qualified name.
3642  * We split the name at dots, downcase if not double-quoted, and
3643  * truncate names if they're too long.
3644  */
3645 List *
textToQualifiedNameList(text * textval)3646 textToQualifiedNameList(text *textval)
3647 {
3648 	char	   *rawname;
3649 	List	   *result = NIL;
3650 	List	   *namelist;
3651 	ListCell   *l;
3652 
3653 	/* Convert to C string (handles possible detoasting). */
3654 	/* Note we rely on being able to modify rawname below. */
3655 	rawname = text_to_cstring(textval);
3656 
3657 	if (!SplitIdentifierString(rawname, '.', &namelist))
3658 		ereport(ERROR,
3659 				(errcode(ERRCODE_INVALID_NAME),
3660 				 errmsg("invalid name syntax")));
3661 
3662 	if (namelist == NIL)
3663 		ereport(ERROR,
3664 				(errcode(ERRCODE_INVALID_NAME),
3665 				 errmsg("invalid name syntax")));
3666 
3667 	foreach(l, namelist)
3668 	{
3669 		char	   *curname = (char *) lfirst(l);
3670 
3671 		result = lappend(result, makeString(pstrdup(curname)));
3672 	}
3673 
3674 	pfree(rawname);
3675 	list_free(namelist);
3676 
3677 	return result;
3678 }
3679 
3680 /*
3681  * SplitIdentifierString --- parse a string containing identifiers
3682  *
3683  * This is the guts of textToQualifiedNameList, and is exported for use in
3684  * other situations such as parsing GUC variables.  In the GUC case, it's
3685  * important to avoid memory leaks, so the API is designed to minimize the
3686  * amount of stuff that needs to be allocated and freed.
3687  *
3688  * Inputs:
3689  *	rawstring: the input string; must be overwritable!	On return, it's
3690  *			   been modified to contain the separated identifiers.
3691  *	separator: the separator punctuation expected between identifiers
3692  *			   (typically '.' or ',').  Whitespace may also appear around
3693  *			   identifiers.
3694  * Outputs:
3695  *	namelist: filled with a palloc'd list of pointers to identifiers within
3696  *			  rawstring.  Caller should list_free() this even on error return.
3697  *
3698  * Returns true if okay, false if there is a syntax error in the string.
3699  *
3700  * Note that an empty string is considered okay here, though not in
3701  * textToQualifiedNameList.
3702  */
bool
SplitIdentifierString(char *rawstring, char separator,
					  List **namelist)
{
	char	   *nextp = rawstring;	/* scan position within rawstring */
	bool		done = false;	/* set once the end of string is reached */

	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;	/* start of the current identifier */
		char	   *endp;		/* where its terminating null will be written */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs, no downcasing */
			curname = nextp + 1;
			for (;;)
			{
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				/* (strlen(endp) bytes from endp + 1 include the trailing null) */
				memmove(endp, endp + 1, strlen(endp));
				/* resume the search just past the quote we kept */
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			char	   *downname;
			int			len;

			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */

			/*
			 * Downcase the identifier, using same code as main lexer does.
			 *
			 * XXX because we want to overwrite the input in-place, we cannot
			 * support a downcasing transformation that increases the string
			 * length.  This is not a problem given the current implementation
			 * of downcase_truncate_identifier, but we'll probably have to do
			 * something about this someday.
			 */
			len = endp - curname;
			downname = downcase_truncate_identifier(curname, len, false);
			Assert(strlen(downname) <= len);
			/*
			 * Copy exactly len bytes back over the identifier.  *endp, which
			 * may be the separator the syntax check below still needs to
			 * see, is left untouched.
			 */
			strncpy(curname, downname, len);	/* strncpy is required here */
			pfree(downname);
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/* Now safe to overwrite separator with a null */
		*endp = '\0';

		/* Truncate name if it's overlength */
		truncate_identifier(curname, strlen(curname), false);

		/*
		 * Finished isolating current name --- add it to list
		 */
		/* (the list entry points into rawstring; nothing is copied) */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3803 
3804 
3805 /*
3806  * SplitDirectoriesString --- parse a string containing file/directory names
3807  *
3808  * This works fine on file names too; the function name is historical.
3809  *
3810  * This is similar to SplitIdentifierString, except that the parsing
3811  * rules are meant to handle pathnames instead of identifiers: there is
3812  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3813  * and we apply canonicalize_path() to each extracted string.  Because of the
3814  * last, the returned strings are separately palloc'd rather than being
3815  * pointers into rawstring --- but we still scribble on rawstring.
3816  *
3817  * Inputs:
3818  *	rawstring: the input string; must be modifiable!
3819  *	separator: the separator punctuation expected between directories
3820  *			   (typically ',' or ';').  Whitespace may also appear around
3821  *			   directories.
3822  * Outputs:
3823  *	namelist: filled with a palloc'd list of directory names.
3824  *			  Caller should list_free_deep() this even on error return.
3825  *
3826  * Returns true if okay, false if there is a syntax error in the string.
3827  *
3828  * Note that an empty string is considered okay here.
3829  */
3830 bool
SplitDirectoriesString(char * rawstring,char separator,List ** namelist)3831 SplitDirectoriesString(char *rawstring, char separator,
3832 					   List **namelist)
3833 {
3834 	char	   *nextp = rawstring;
3835 	bool		done = false;
3836 
3837 	*namelist = NIL;
3838 
3839 	while (scanner_isspace(*nextp))
3840 		nextp++;				/* skip leading whitespace */
3841 
3842 	if (*nextp == '\0')
3843 		return true;			/* allow empty string */
3844 
3845 	/* At the top of the loop, we are at start of a new directory. */
3846 	do
3847 	{
3848 		char	   *curname;
3849 		char	   *endp;
3850 
3851 		if (*nextp == '"')
3852 		{
3853 			/* Quoted name --- collapse quote-quote pairs */
3854 			curname = nextp + 1;
3855 			for (;;)
3856 			{
3857 				endp = strchr(nextp + 1, '"');
3858 				if (endp == NULL)
3859 					return false;	/* mismatched quotes */
3860 				if (endp[1] != '"')
3861 					break;		/* found end of quoted name */
3862 				/* Collapse adjacent quotes into one quote, and look again */
3863 				memmove(endp, endp + 1, strlen(endp));
3864 				nextp = endp;
3865 			}
3866 			/* endp now points at the terminating quote */
3867 			nextp = endp + 1;
3868 		}
3869 		else
3870 		{
3871 			/* Unquoted name --- extends to separator or end of string */
3872 			curname = endp = nextp;
3873 			while (*nextp && *nextp != separator)
3874 			{
3875 				/* trailing whitespace should not be included in name */
3876 				if (!scanner_isspace(*nextp))
3877 					endp = nextp + 1;
3878 				nextp++;
3879 			}
3880 			if (curname == endp)
3881 				return false;	/* empty unquoted name not allowed */
3882 		}
3883 
3884 		while (scanner_isspace(*nextp))
3885 			nextp++;			/* skip trailing whitespace */
3886 
3887 		if (*nextp == separator)
3888 		{
3889 			nextp++;
3890 			while (scanner_isspace(*nextp))
3891 				nextp++;		/* skip leading whitespace for next */
3892 			/* we expect another name, so done remains false */
3893 		}
3894 		else if (*nextp == '\0')
3895 			done = true;
3896 		else
3897 			return false;		/* invalid syntax */
3898 
3899 		/* Now safe to overwrite separator with a null */
3900 		*endp = '\0';
3901 
3902 		/* Truncate path if it's overlength */
3903 		if (strlen(curname) >= MAXPGPATH)
3904 			curname[MAXPGPATH - 1] = '\0';
3905 
3906 		/*
3907 		 * Finished isolating current name --- add it to list
3908 		 */
3909 		curname = pstrdup(curname);
3910 		canonicalize_path(curname);
3911 		*namelist = lappend(*namelist, curname);
3912 
3913 		/* Loop back if we didn't reach end of string */
3914 	} while (!done);
3915 
3916 	return true;
3917 }
3918 
3919 
3920 /*
3921  * SplitGUCList --- parse a string containing identifiers or file names
3922  *
3923  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3924  * presuming whether the elements will be taken as identifiers or file names.
3925  * We assume the input has already been through flatten_set_variable_args(),
3926  * so that we need never downcase (if appropriate, that was done already).
3927  * Nor do we ever truncate, since we don't know the correct max length.
3928  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3929  * because any embedded whitespace should have led to double-quoting).
3930  * Otherwise the API is identical to SplitIdentifierString.
3931  *
3932  * XXX it's annoying to have so many copies of this string-splitting logic.
3933  * However, it's not clear that having one function with a bunch of option
3934  * flags would be much better.
3935  *
3936  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3937  * Be sure to update that if you have to change this.
3938  *
3939  * Inputs:
3940  *	rawstring: the input string; must be overwritable!	On return, it's
3941  *			   been modified to contain the separated identifiers.
3942  *	separator: the separator punctuation expected between identifiers
3943  *			   (typically '.' or ',').  Whitespace may also appear around
3944  *			   identifiers.
3945  * Outputs:
3946  *	namelist: filled with a palloc'd list of pointers to identifiers within
3947  *			  rawstring.  Caller should list_free() this even on error return.
3948  *
3949  * Returns true if okay, false if there is a syntax error in the string.
3950  */
3951 bool
SplitGUCList(char * rawstring,char separator,List ** namelist)3952 SplitGUCList(char *rawstring, char separator,
3953 			 List **namelist)
3954 {
3955 	char	   *nextp = rawstring;
3956 	bool		done = false;
3957 
3958 	*namelist = NIL;
3959 
3960 	while (scanner_isspace(*nextp))
3961 		nextp++;				/* skip leading whitespace */
3962 
3963 	if (*nextp == '\0')
3964 		return true;			/* allow empty string */
3965 
3966 	/* At the top of the loop, we are at start of a new identifier. */
3967 	do
3968 	{
3969 		char	   *curname;
3970 		char	   *endp;
3971 
3972 		if (*nextp == '"')
3973 		{
3974 			/* Quoted name --- collapse quote-quote pairs */
3975 			curname = nextp + 1;
3976 			for (;;)
3977 			{
3978 				endp = strchr(nextp + 1, '"');
3979 				if (endp == NULL)
3980 					return false;	/* mismatched quotes */
3981 				if (endp[1] != '"')
3982 					break;		/* found end of quoted name */
3983 				/* Collapse adjacent quotes into one quote, and look again */
3984 				memmove(endp, endp + 1, strlen(endp));
3985 				nextp = endp;
3986 			}
3987 			/* endp now points at the terminating quote */
3988 			nextp = endp + 1;
3989 		}
3990 		else
3991 		{
3992 			/* Unquoted name --- extends to separator or whitespace */
3993 			curname = nextp;
3994 			while (*nextp && *nextp != separator &&
3995 				   !scanner_isspace(*nextp))
3996 				nextp++;
3997 			endp = nextp;
3998 			if (curname == nextp)
3999 				return false;	/* empty unquoted name not allowed */
4000 		}
4001 
4002 		while (scanner_isspace(*nextp))
4003 			nextp++;			/* skip trailing whitespace */
4004 
4005 		if (*nextp == separator)
4006 		{
4007 			nextp++;
4008 			while (scanner_isspace(*nextp))
4009 				nextp++;		/* skip leading whitespace for next */
4010 			/* we expect another name, so done remains false */
4011 		}
4012 		else if (*nextp == '\0')
4013 			done = true;
4014 		else
4015 			return false;		/* invalid syntax */
4016 
4017 		/* Now safe to overwrite separator with a null */
4018 		*endp = '\0';
4019 
4020 		/*
4021 		 * Finished isolating current name --- add it to list
4022 		 */
4023 		*namelist = lappend(*namelist, curname);
4024 
4025 		/* Loop back if we didn't reach end of string */
4026 	} while (!done);
4027 
4028 	return true;
4029 }
4030 
4031 
4032 /*****************************************************************************
4033  *	Comparison Functions used for bytea
4034  *
4035  * Note: btree indexes need these routines not to leak memory; therefore,
4036  * be careful to free working copies of toasted datums.  Most places don't
4037  * need to be so careful.
4038  *****************************************************************************/
4039 
4040 Datum
byteaeq(PG_FUNCTION_ARGS)4041 byteaeq(PG_FUNCTION_ARGS)
4042 {
4043 	Datum		arg1 = PG_GETARG_DATUM(0);
4044 	Datum		arg2 = PG_GETARG_DATUM(1);
4045 	bool		result;
4046 	Size		len1,
4047 				len2;
4048 
4049 	/*
4050 	 * We can use a fast path for unequal lengths, which might save us from
4051 	 * having to detoast one or both values.
4052 	 */
4053 	len1 = toast_raw_datum_size(arg1);
4054 	len2 = toast_raw_datum_size(arg2);
4055 	if (len1 != len2)
4056 		result = false;
4057 	else
4058 	{
4059 		bytea	   *barg1 = DatumGetByteaPP(arg1);
4060 		bytea	   *barg2 = DatumGetByteaPP(arg2);
4061 
4062 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4063 						 len1 - VARHDRSZ) == 0);
4064 
4065 		PG_FREE_IF_COPY(barg1, 0);
4066 		PG_FREE_IF_COPY(barg2, 1);
4067 	}
4068 
4069 	PG_RETURN_BOOL(result);
4070 }
4071 
4072 Datum
byteane(PG_FUNCTION_ARGS)4073 byteane(PG_FUNCTION_ARGS)
4074 {
4075 	Datum		arg1 = PG_GETARG_DATUM(0);
4076 	Datum		arg2 = PG_GETARG_DATUM(1);
4077 	bool		result;
4078 	Size		len1,
4079 				len2;
4080 
4081 	/*
4082 	 * We can use a fast path for unequal lengths, which might save us from
4083 	 * having to detoast one or both values.
4084 	 */
4085 	len1 = toast_raw_datum_size(arg1);
4086 	len2 = toast_raw_datum_size(arg2);
4087 	if (len1 != len2)
4088 		result = true;
4089 	else
4090 	{
4091 		bytea	   *barg1 = DatumGetByteaPP(arg1);
4092 		bytea	   *barg2 = DatumGetByteaPP(arg2);
4093 
4094 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
4095 						 len1 - VARHDRSZ) != 0);
4096 
4097 		PG_FREE_IF_COPY(barg1, 0);
4098 		PG_FREE_IF_COPY(barg2, 1);
4099 	}
4100 
4101 	PG_RETURN_BOOL(result);
4102 }
4103 
4104 Datum
bytealt(PG_FUNCTION_ARGS)4105 bytealt(PG_FUNCTION_ARGS)
4106 {
4107 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4108 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4109 	int			len1,
4110 				len2;
4111 	int			cmp;
4112 
4113 	len1 = VARSIZE_ANY_EXHDR(arg1);
4114 	len2 = VARSIZE_ANY_EXHDR(arg2);
4115 
4116 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4117 
4118 	PG_FREE_IF_COPY(arg1, 0);
4119 	PG_FREE_IF_COPY(arg2, 1);
4120 
4121 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
4122 }
4123 
4124 Datum
byteale(PG_FUNCTION_ARGS)4125 byteale(PG_FUNCTION_ARGS)
4126 {
4127 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4128 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4129 	int			len1,
4130 				len2;
4131 	int			cmp;
4132 
4133 	len1 = VARSIZE_ANY_EXHDR(arg1);
4134 	len2 = VARSIZE_ANY_EXHDR(arg2);
4135 
4136 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4137 
4138 	PG_FREE_IF_COPY(arg1, 0);
4139 	PG_FREE_IF_COPY(arg2, 1);
4140 
4141 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
4142 }
4143 
4144 Datum
byteagt(PG_FUNCTION_ARGS)4145 byteagt(PG_FUNCTION_ARGS)
4146 {
4147 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4148 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4149 	int			len1,
4150 				len2;
4151 	int			cmp;
4152 
4153 	len1 = VARSIZE_ANY_EXHDR(arg1);
4154 	len2 = VARSIZE_ANY_EXHDR(arg2);
4155 
4156 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4157 
4158 	PG_FREE_IF_COPY(arg1, 0);
4159 	PG_FREE_IF_COPY(arg2, 1);
4160 
4161 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
4162 }
4163 
4164 Datum
byteage(PG_FUNCTION_ARGS)4165 byteage(PG_FUNCTION_ARGS)
4166 {
4167 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4168 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4169 	int			len1,
4170 				len2;
4171 	int			cmp;
4172 
4173 	len1 = VARSIZE_ANY_EXHDR(arg1);
4174 	len2 = VARSIZE_ANY_EXHDR(arg2);
4175 
4176 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4177 
4178 	PG_FREE_IF_COPY(arg1, 0);
4179 	PG_FREE_IF_COPY(arg2, 1);
4180 
4181 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
4182 }
4183 
4184 Datum
byteacmp(PG_FUNCTION_ARGS)4185 byteacmp(PG_FUNCTION_ARGS)
4186 {
4187 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
4188 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
4189 	int			len1,
4190 				len2;
4191 	int			cmp;
4192 
4193 	len1 = VARSIZE_ANY_EXHDR(arg1);
4194 	len2 = VARSIZE_ANY_EXHDR(arg2);
4195 
4196 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
4197 	if ((cmp == 0) && (len1 != len2))
4198 		cmp = (len1 < len2) ? -1 : 1;
4199 
4200 	PG_FREE_IF_COPY(arg1, 0);
4201 	PG_FREE_IF_COPY(arg2, 1);
4202 
4203 	PG_RETURN_INT32(cmp);
4204 }
4205 
4206 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)4207 bytea_sortsupport(PG_FUNCTION_ARGS)
4208 {
4209 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
4210 	MemoryContext oldcontext;
4211 
4212 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
4213 
4214 	/* Use generic string SortSupport, forcing "C" collation */
4215 	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
4216 
4217 	MemoryContextSwitchTo(oldcontext);
4218 
4219 	PG_RETURN_VOID();
4220 }
4221 
4222 /*
4223  * appendStringInfoText
4224  *
4225  * Append a text to str.
4226  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
4227  */
4228 static void
appendStringInfoText(StringInfo str,const text * t)4229 appendStringInfoText(StringInfo str, const text *t)
4230 {
4231 	appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
4232 }
4233 
4234 /*
4235  * replace_text
4236  * replace all occurrences of 'old_sub_str' in 'orig_str'
4237  * with 'new_sub_str' to form 'new_str'
4238  *
4239  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4240  * otherwise returns 'new_str'
4241  */
4242 Datum
replace_text(PG_FUNCTION_ARGS)4243 replace_text(PG_FUNCTION_ARGS)
4244 {
4245 	text	   *src_text = PG_GETARG_TEXT_PP(0);
4246 	text	   *from_sub_text = PG_GETARG_TEXT_PP(1);
4247 	text	   *to_sub_text = PG_GETARG_TEXT_PP(2);
4248 	int			src_text_len;
4249 	int			from_sub_text_len;
4250 	TextPositionState state;
4251 	text	   *ret_text;
4252 	int			chunk_len;
4253 	char	   *curr_ptr;
4254 	char	   *start_ptr;
4255 	StringInfoData str;
4256 	bool		found;
4257 
4258 	src_text_len = VARSIZE_ANY_EXHDR(src_text);
4259 	from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4260 
4261 	/* Return unmodified source string if empty source or pattern */
4262 	if (src_text_len < 1 || from_sub_text_len < 1)
4263 	{
4264 		PG_RETURN_TEXT_P(src_text);
4265 	}
4266 
4267 	text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4268 
4269 	found = text_position_next(&state);
4270 
4271 	/* When the from_sub_text is not found, there is nothing to do. */
4272 	if (!found)
4273 	{
4274 		text_position_cleanup(&state);
4275 		PG_RETURN_TEXT_P(src_text);
4276 	}
4277 	curr_ptr = text_position_get_match_ptr(&state);
4278 	start_ptr = VARDATA_ANY(src_text);
4279 
4280 	initStringInfo(&str);
4281 
4282 	do
4283 	{
4284 		CHECK_FOR_INTERRUPTS();
4285 
4286 		/* copy the data skipped over by last text_position_next() */
4287 		chunk_len = curr_ptr - start_ptr;
4288 		appendBinaryStringInfo(&str, start_ptr, chunk_len);
4289 
4290 		appendStringInfoText(&str, to_sub_text);
4291 
4292 		start_ptr = curr_ptr + from_sub_text_len;
4293 
4294 		found = text_position_next(&state);
4295 		if (found)
4296 			curr_ptr = text_position_get_match_ptr(&state);
4297 	}
4298 	while (found);
4299 
4300 	/* copy trailing data */
4301 	chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4302 	appendBinaryStringInfo(&str, start_ptr, chunk_len);
4303 
4304 	text_position_cleanup(&state);
4305 
4306 	ret_text = cstring_to_text_with_len(str.data, str.len);
4307 	pfree(str.data);
4308 
4309 	PG_RETURN_TEXT_P(ret_text);
4310 }
4311 
4312 /*
4313  * check_replace_text_has_escape_char
4314  *
4315  * check whether replace_text contains escape char.
4316  */
4317 static bool
check_replace_text_has_escape_char(const text * replace_text)4318 check_replace_text_has_escape_char(const text *replace_text)
4319 {
4320 	const char *p = VARDATA_ANY(replace_text);
4321 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4322 
4323 	if (pg_database_encoding_max_length() == 1)
4324 	{
4325 		for (; p < p_end; p++)
4326 		{
4327 			if (*p == '\\')
4328 				return true;
4329 		}
4330 	}
4331 	else
4332 	{
4333 		for (; p < p_end; p += pg_mblen(p))
4334 		{
4335 			if (*p == '\\')
4336 				return true;
4337 		}
4338 	}
4339 
4340 	return false;
4341 }
4342 
4343 /*
4344  * appendStringInfoRegexpSubstr
4345  *
4346  * Append replace_text to str, substituting regexp back references for
4347  * \n escapes.  start_ptr is the start of the match in the source string,
4348  * at logical character position data_pos.
4349  */
4350 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)4351 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4352 							 regmatch_t *pmatch,
4353 							 char *start_ptr, int data_pos)
4354 {
4355 	const char *p = VARDATA_ANY(replace_text);
4356 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4357 	int			eml = pg_database_encoding_max_length();
4358 
4359 	for (;;)
4360 	{
4361 		const char *chunk_start = p;
4362 		int			so;
4363 		int			eo;
4364 
4365 		/* Find next escape char. */
4366 		if (eml == 1)
4367 		{
4368 			for (; p < p_end && *p != '\\'; p++)
4369 				 /* nothing */ ;
4370 		}
4371 		else
4372 		{
4373 			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4374 				 /* nothing */ ;
4375 		}
4376 
4377 		/* Copy the text we just scanned over, if any. */
4378 		if (p > chunk_start)
4379 			appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4380 
4381 		/* Done if at end of string, else advance over escape char. */
4382 		if (p >= p_end)
4383 			break;
4384 		p++;
4385 
4386 		if (p >= p_end)
4387 		{
4388 			/* Escape at very end of input.  Treat same as unexpected char */
4389 			appendStringInfoChar(str, '\\');
4390 			break;
4391 		}
4392 
4393 		if (*p >= '1' && *p <= '9')
4394 		{
4395 			/* Use the back reference of regexp. */
4396 			int			idx = *p - '0';
4397 
4398 			so = pmatch[idx].rm_so;
4399 			eo = pmatch[idx].rm_eo;
4400 			p++;
4401 		}
4402 		else if (*p == '&')
4403 		{
4404 			/* Use the entire matched string. */
4405 			so = pmatch[0].rm_so;
4406 			eo = pmatch[0].rm_eo;
4407 			p++;
4408 		}
4409 		else if (*p == '\\')
4410 		{
4411 			/* \\ means transfer one \ to output. */
4412 			appendStringInfoChar(str, '\\');
4413 			p++;
4414 			continue;
4415 		}
4416 		else
4417 		{
4418 			/*
4419 			 * If escape char is not followed by any expected char, just treat
4420 			 * it as ordinary data to copy.  (XXX would it be better to throw
4421 			 * an error?)
4422 			 */
4423 			appendStringInfoChar(str, '\\');
4424 			continue;
4425 		}
4426 
4427 		if (so != -1 && eo != -1)
4428 		{
4429 			/*
4430 			 * Copy the text that is back reference of regexp.  Note so and eo
4431 			 * are counted in characters not bytes.
4432 			 */
4433 			char	   *chunk_start;
4434 			int			chunk_len;
4435 
4436 			Assert(so >= data_pos);
4437 			chunk_start = start_ptr;
4438 			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4439 			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4440 			appendBinaryStringInfo(str, chunk_start, chunk_len);
4441 		}
4442 	}
4443 }
4444 
4445 #define REGEXP_REPLACE_BACKREF_CNT		10
4446 
4447 /*
4448  * replace_text_regexp
4449  *
4450  * replace text that matches to regexp in src_text to replace_text.
4451  *
4452  * Note: to avoid having to include regex.h in builtins.h, we declare
4453  * the regexp argument as void *, but really it's regex_t *.
4454  */
4455 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)4456 replace_text_regexp(text *src_text, void *regexp,
4457 					text *replace_text, bool glob)
4458 {
4459 	text	   *ret_text;
4460 	regex_t    *re = (regex_t *) regexp;
4461 	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
4462 	StringInfoData buf;
4463 	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
4464 	pg_wchar   *data;
4465 	size_t		data_len;
4466 	int			search_start;
4467 	int			data_pos;
4468 	char	   *start_ptr;
4469 	bool		have_escape;
4470 
4471 	initStringInfo(&buf);
4472 
4473 	/* Convert data string to wide characters. */
4474 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4475 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4476 
4477 	/* Check whether replace_text has escape char. */
4478 	have_escape = check_replace_text_has_escape_char(replace_text);
4479 
4480 	/* start_ptr points to the data_pos'th character of src_text */
4481 	start_ptr = (char *) VARDATA_ANY(src_text);
4482 	data_pos = 0;
4483 
4484 	search_start = 0;
4485 	while (search_start <= data_len)
4486 	{
4487 		int			regexec_result;
4488 
4489 		CHECK_FOR_INTERRUPTS();
4490 
4491 		regexec_result = pg_regexec(re,
4492 									data,
4493 									data_len,
4494 									search_start,
4495 									NULL,	/* no details */
4496 									REGEXP_REPLACE_BACKREF_CNT,
4497 									pmatch,
4498 									0);
4499 
4500 		if (regexec_result == REG_NOMATCH)
4501 			break;
4502 
4503 		if (regexec_result != REG_OKAY)
4504 		{
4505 			char		errMsg[100];
4506 
4507 			CHECK_FOR_INTERRUPTS();
4508 			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4509 			ereport(ERROR,
4510 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4511 					 errmsg("regular expression failed: %s", errMsg)));
4512 		}
4513 
4514 		/*
4515 		 * Copy the text to the left of the match position.  Note we are given
4516 		 * character not byte indexes.
4517 		 */
4518 		if (pmatch[0].rm_so - data_pos > 0)
4519 		{
4520 			int			chunk_len;
4521 
4522 			chunk_len = charlen_to_bytelen(start_ptr,
4523 										   pmatch[0].rm_so - data_pos);
4524 			appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4525 
4526 			/*
4527 			 * Advance start_ptr over that text, to avoid multiple rescans of
4528 			 * it if the replace_text contains multiple back-references.
4529 			 */
4530 			start_ptr += chunk_len;
4531 			data_pos = pmatch[0].rm_so;
4532 		}
4533 
4534 		/*
4535 		 * Copy the replace_text. Process back references when the
4536 		 * replace_text has escape characters.
4537 		 */
4538 		if (have_escape)
4539 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4540 										 start_ptr, data_pos);
4541 		else
4542 			appendStringInfoText(&buf, replace_text);
4543 
4544 		/* Advance start_ptr and data_pos over the matched text. */
4545 		start_ptr += charlen_to_bytelen(start_ptr,
4546 										pmatch[0].rm_eo - data_pos);
4547 		data_pos = pmatch[0].rm_eo;
4548 
4549 		/*
4550 		 * When global option is off, replace the first instance only.
4551 		 */
4552 		if (!glob)
4553 			break;
4554 
4555 		/*
4556 		 * Advance search position.  Normally we start the next search at the
4557 		 * end of the previous match; but if the match was of zero length, we
4558 		 * have to advance by one character, or we'd just find the same match
4559 		 * again.
4560 		 */
4561 		search_start = data_pos;
4562 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
4563 			search_start++;
4564 	}
4565 
4566 	/*
4567 	 * Copy the text to the right of the last match.
4568 	 */
4569 	if (data_pos < data_len)
4570 	{
4571 		int			chunk_len;
4572 
4573 		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4574 		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4575 	}
4576 
4577 	ret_text = cstring_to_text_with_len(buf.data, buf.len);
4578 	pfree(buf.data);
4579 	pfree(data);
4580 
4581 	return ret_text;
4582 }
4583 
4584 /*
4585  * split_text
4586  * parse input string
4587  * return ord item (1 based)
4588  * based on provided field separator
4589  */
4590 Datum
split_text(PG_FUNCTION_ARGS)4591 split_text(PG_FUNCTION_ARGS)
4592 {
4593 	text	   *inputstring = PG_GETARG_TEXT_PP(0);
4594 	text	   *fldsep = PG_GETARG_TEXT_PP(1);
4595 	int			fldnum = PG_GETARG_INT32(2);
4596 	int			inputstring_len;
4597 	int			fldsep_len;
4598 	TextPositionState state;
4599 	char	   *start_ptr;
4600 	char	   *end_ptr;
4601 	text	   *result_text;
4602 	bool		found;
4603 
4604 	/* field number is 1 based */
4605 	if (fldnum < 1)
4606 		ereport(ERROR,
4607 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4608 				 errmsg("field position must be greater than zero")));
4609 
4610 	inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4611 	fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4612 
4613 	/* return empty string for empty input string */
4614 	if (inputstring_len < 1)
4615 		PG_RETURN_TEXT_P(cstring_to_text(""));
4616 
4617 	/* empty field separator */
4618 	if (fldsep_len < 1)
4619 	{
4620 		text_position_cleanup(&state);
4621 		/* if first field, return input string, else empty string */
4622 		if (fldnum == 1)
4623 			PG_RETURN_TEXT_P(inputstring);
4624 		else
4625 			PG_RETURN_TEXT_P(cstring_to_text(""));
4626 	}
4627 
4628 	text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4629 
4630 	/* identify bounds of first field */
4631 	start_ptr = VARDATA_ANY(inputstring);
4632 	found = text_position_next(&state);
4633 
4634 	/* special case if fldsep not found at all */
4635 	if (!found)
4636 	{
4637 		text_position_cleanup(&state);
4638 		/* if field 1 requested, return input string, else empty string */
4639 		if (fldnum == 1)
4640 			PG_RETURN_TEXT_P(inputstring);
4641 		else
4642 			PG_RETURN_TEXT_P(cstring_to_text(""));
4643 	}
4644 	end_ptr = text_position_get_match_ptr(&state);
4645 
4646 	while (found && --fldnum > 0)
4647 	{
4648 		/* identify bounds of next field */
4649 		start_ptr = end_ptr + fldsep_len;
4650 		found = text_position_next(&state);
4651 		if (found)
4652 			end_ptr = text_position_get_match_ptr(&state);
4653 	}
4654 
4655 	text_position_cleanup(&state);
4656 
4657 	if (fldnum > 0)
4658 	{
4659 		/* N'th field separator not found */
4660 		/* if last field requested, return it, else empty string */
4661 		if (fldnum == 1)
4662 		{
4663 			int			last_len = start_ptr - VARDATA_ANY(inputstring);
4664 
4665 			result_text = cstring_to_text_with_len(start_ptr,
4666 												   inputstring_len - last_len);
4667 		}
4668 		else
4669 			result_text = cstring_to_text("");
4670 	}
4671 	else
4672 	{
4673 		/* non-last field requested */
4674 		result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4675 	}
4676 
4677 	PG_RETURN_TEXT_P(result_text);
4678 }
4679 
4680 /*
4681  * Convenience function to return true when two text params are equal.
4682  */
4683 static bool
text_isequal(text * txt1,text * txt2,Oid collid)4684 text_isequal(text *txt1, text *txt2, Oid collid)
4685 {
4686 	return DatumGetBool(DirectFunctionCall2Coll(texteq,
4687 												collid,
4688 												PointerGetDatum(txt1),
4689 												PointerGetDatum(txt2)));
4690 }
4691 
4692 /*
4693  * text_to_array
4694  * parse input string and return text array of elements,
4695  * based on provided field separator
4696  */
4697 Datum
text_to_array(PG_FUNCTION_ARGS)4698 text_to_array(PG_FUNCTION_ARGS)
4699 {
4700 	return text_to_array_internal(fcinfo);
4701 }
4702 
4703 /*
4704  * text_to_array_null
4705  * parse input string and return text array of elements,
4706  * based on provided field separator and null string
4707  *
4708  * This is a separate entry point only to prevent the regression tests from
4709  * complaining about different argument sets for the same internal function.
4710  */
4711 Datum
text_to_array_null(PG_FUNCTION_ARGS)4712 text_to_array_null(PG_FUNCTION_ARGS)
4713 {
4714 	return text_to_array_internal(fcinfo);
4715 }
4716 
4717 /*
4718  * common code for text_to_array and text_to_array_null functions
4719  *
4720  * These are not strict so we have to test for null inputs explicitly.
4721  */
4722 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4723 text_to_array_internal(PG_FUNCTION_ARGS)
4724 {
4725 	text	   *inputstring;
4726 	text	   *fldsep;
4727 	text	   *null_string;
4728 	int			inputstring_len;
4729 	int			fldsep_len;
4730 	char	   *start_ptr;
4731 	text	   *result_text;
4732 	bool		is_null;
4733 	ArrayBuildState *astate = NULL;
4734 
4735 	/* when input string is NULL, then result is NULL too */
4736 	if (PG_ARGISNULL(0))
4737 		PG_RETURN_NULL();
4738 
4739 	inputstring = PG_GETARG_TEXT_PP(0);
4740 
4741 	/* fldsep can be NULL */
4742 	if (!PG_ARGISNULL(1))
4743 		fldsep = PG_GETARG_TEXT_PP(1);
4744 	else
4745 		fldsep = NULL;
4746 
4747 	/* null_string can be NULL or omitted */
4748 	if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4749 		null_string = PG_GETARG_TEXT_PP(2);
4750 	else
4751 		null_string = NULL;
4752 
4753 	if (fldsep != NULL)
4754 	{
4755 		/*
4756 		 * Normal case with non-null fldsep.  Use the text_position machinery
4757 		 * to search for occurrences of fldsep.
4758 		 */
4759 		TextPositionState state;
4760 
4761 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4762 		fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4763 
4764 		/* return empty array for empty input string */
4765 		if (inputstring_len < 1)
4766 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4767 
4768 		/*
4769 		 * empty field separator: return the input string as a one-element
4770 		 * array
4771 		 */
4772 		if (fldsep_len < 1)
4773 		{
4774 			Datum		elems[1];
4775 			bool		nulls[1];
4776 			int			dims[1];
4777 			int			lbs[1];
4778 
4779 			/* single element can be a NULL too */
4780 			is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false;
4781 
4782 			elems[0] = PointerGetDatum(inputstring);
4783 			nulls[0] = is_null;
4784 			dims[0] = 1;
4785 			lbs[0] = 1;
4786 			/* XXX: this hardcodes assumptions about the text type */
4787 			PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
4788 													 1, dims, lbs,
4789 													 TEXTOID, -1, false, TYPALIGN_INT));
4790 		}
4791 
4792 		text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4793 
4794 		start_ptr = VARDATA_ANY(inputstring);
4795 
4796 		for (;;)
4797 		{
4798 			bool		found;
4799 			char	   *end_ptr;
4800 			int			chunk_len;
4801 
4802 			CHECK_FOR_INTERRUPTS();
4803 
4804 			found = text_position_next(&state);
4805 			if (!found)
4806 			{
4807 				/* fetch last field */
4808 				chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4809 				end_ptr = NULL; /* not used, but some compilers complain */
4810 			}
4811 			else
4812 			{
4813 				/* fetch non-last field */
4814 				end_ptr = text_position_get_match_ptr(&state);
4815 				chunk_len = end_ptr - start_ptr;
4816 			}
4817 
4818 			/* must build a temp text datum to pass to accumArrayResult */
4819 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4820 			is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4821 
4822 			/* stash away this field */
4823 			astate = accumArrayResult(astate,
4824 									  PointerGetDatum(result_text),
4825 									  is_null,
4826 									  TEXTOID,
4827 									  CurrentMemoryContext);
4828 
4829 			pfree(result_text);
4830 
4831 			if (!found)
4832 				break;
4833 
4834 			start_ptr = end_ptr + fldsep_len;
4835 		}
4836 
4837 		text_position_cleanup(&state);
4838 	}
4839 	else
4840 	{
4841 		/*
4842 		 * When fldsep is NULL, each character in the inputstring becomes an
4843 		 * element in the result array.  The separator is effectively the
4844 		 * space between characters.
4845 		 */
4846 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4847 
4848 		/* return empty array for empty input string */
4849 		if (inputstring_len < 1)
4850 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4851 
4852 		start_ptr = VARDATA_ANY(inputstring);
4853 
4854 		while (inputstring_len > 0)
4855 		{
4856 			int			chunk_len = pg_mblen(start_ptr);
4857 
4858 			CHECK_FOR_INTERRUPTS();
4859 
4860 			/* must build a temp text datum to pass to accumArrayResult */
4861 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4862 			is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false;
4863 
4864 			/* stash away this field */
4865 			astate = accumArrayResult(astate,
4866 									  PointerGetDatum(result_text),
4867 									  is_null,
4868 									  TEXTOID,
4869 									  CurrentMemoryContext);
4870 
4871 			pfree(result_text);
4872 
4873 			start_ptr += chunk_len;
4874 			inputstring_len -= chunk_len;
4875 		}
4876 	}
4877 
4878 	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4879 										  CurrentMemoryContext));
4880 }
4881 
4882 /*
4883  * array_to_text
4884  * concatenate Cstring representation of input array elements
4885  * using provided field separator
4886  */
4887 Datum
array_to_text(PG_FUNCTION_ARGS)4888 array_to_text(PG_FUNCTION_ARGS)
4889 {
4890 	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4891 	char	   *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4892 
4893 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4894 }
4895 
4896 /*
4897  * array_to_text_null
4898  * concatenate Cstring representation of input array elements
4899  * using provided field separator and null string
4900  *
4901  * This version is not strict so we have to test for null inputs explicitly.
4902  */
4903 Datum
array_to_text_null(PG_FUNCTION_ARGS)4904 array_to_text_null(PG_FUNCTION_ARGS)
4905 {
4906 	ArrayType  *v;
4907 	char	   *fldsep;
4908 	char	   *null_string;
4909 
4910 	/* returns NULL when first or second parameter is NULL */
4911 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4912 		PG_RETURN_NULL();
4913 
4914 	v = PG_GETARG_ARRAYTYPE_P(0);
4915 	fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4916 
4917 	/* NULL null string is passed through as a null pointer */
4918 	if (!PG_ARGISNULL(2))
4919 		null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4920 	else
4921 		null_string = NULL;
4922 
4923 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4924 }
4925 
4926 /*
4927  * common code for array_to_text and array_to_text_null functions
4928  */
4929 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4930 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4931 					   const char *fldsep, const char *null_string)
4932 {
4933 	text	   *result;
4934 	int			nitems,
4935 			   *dims,
4936 				ndims;
4937 	Oid			element_type;
4938 	int			typlen;
4939 	bool		typbyval;
4940 	char		typalign;
4941 	StringInfoData buf;
4942 	bool		printed = false;
4943 	char	   *p;
4944 	bits8	   *bitmap;
4945 	int			bitmask;
4946 	int			i;
4947 	ArrayMetaState *my_extra;
4948 
4949 	ndims = ARR_NDIM(v);
4950 	dims = ARR_DIMS(v);
4951 	nitems = ArrayGetNItems(ndims, dims);
4952 
4953 	/* if there are no elements, return an empty string */
4954 	if (nitems == 0)
4955 		return cstring_to_text_with_len("", 0);
4956 
4957 	element_type = ARR_ELEMTYPE(v);
4958 	initStringInfo(&buf);
4959 
4960 	/*
4961 	 * We arrange to look up info about element type, including its output
4962 	 * conversion proc, only once per series of calls, assuming the element
4963 	 * type doesn't change underneath us.
4964 	 */
4965 	my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4966 	if (my_extra == NULL)
4967 	{
4968 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4969 													  sizeof(ArrayMetaState));
4970 		my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4971 		my_extra->element_type = ~element_type;
4972 	}
4973 
4974 	if (my_extra->element_type != element_type)
4975 	{
4976 		/*
4977 		 * Get info about element type, including its output conversion proc
4978 		 */
4979 		get_type_io_data(element_type, IOFunc_output,
4980 						 &my_extra->typlen, &my_extra->typbyval,
4981 						 &my_extra->typalign, &my_extra->typdelim,
4982 						 &my_extra->typioparam, &my_extra->typiofunc);
4983 		fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4984 					  fcinfo->flinfo->fn_mcxt);
4985 		my_extra->element_type = element_type;
4986 	}
4987 	typlen = my_extra->typlen;
4988 	typbyval = my_extra->typbyval;
4989 	typalign = my_extra->typalign;
4990 
4991 	p = ARR_DATA_PTR(v);
4992 	bitmap = ARR_NULLBITMAP(v);
4993 	bitmask = 1;
4994 
4995 	for (i = 0; i < nitems; i++)
4996 	{
4997 		Datum		itemvalue;
4998 		char	   *value;
4999 
5000 		/* Get source element, checking for NULL */
5001 		if (bitmap && (*bitmap & bitmask) == 0)
5002 		{
5003 			/* if null_string is NULL, we just ignore null elements */
5004 			if (null_string != NULL)
5005 			{
5006 				if (printed)
5007 					appendStringInfo(&buf, "%s%s", fldsep, null_string);
5008 				else
5009 					appendStringInfoString(&buf, null_string);
5010 				printed = true;
5011 			}
5012 		}
5013 		else
5014 		{
5015 			itemvalue = fetch_att(p, typbyval, typlen);
5016 
5017 			value = OutputFunctionCall(&my_extra->proc, itemvalue);
5018 
5019 			if (printed)
5020 				appendStringInfo(&buf, "%s%s", fldsep, value);
5021 			else
5022 				appendStringInfoString(&buf, value);
5023 			printed = true;
5024 
5025 			p = att_addlength_pointer(p, typlen, p);
5026 			p = (char *) att_align_nominal(p, typalign);
5027 		}
5028 
5029 		/* advance bitmap pointer if any */
5030 		if (bitmap)
5031 		{
5032 			bitmask <<= 1;
5033 			if (bitmask == 0x100)
5034 			{
5035 				bitmap++;
5036 				bitmask = 1;
5037 			}
5038 		}
5039 	}
5040 
5041 	result = cstring_to_text_with_len(buf.data, buf.len);
5042 	pfree(buf.data);
5043 
5044 	return result;
5045 }
5046 
5047 #define HEXBASE 16
5048 /*
5049  * Convert an int32 to a string containing a base 16 (hex) representation of
5050  * the number.
5051  */
5052 Datum
to_hex32(PG_FUNCTION_ARGS)5053 to_hex32(PG_FUNCTION_ARGS)
5054 {
5055 	uint32		value = (uint32) PG_GETARG_INT32(0);
5056 	char	   *ptr;
5057 	const char *digits = "0123456789abcdef";
5058 	char		buf[32];		/* bigger than needed, but reasonable */
5059 
5060 	ptr = buf + sizeof(buf) - 1;
5061 	*ptr = '\0';
5062 
5063 	do
5064 	{
5065 		*--ptr = digits[value % HEXBASE];
5066 		value /= HEXBASE;
5067 	} while (ptr > buf && value);
5068 
5069 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
5070 }
5071 
5072 /*
5073  * Convert an int64 to a string containing a base 16 (hex) representation of
5074  * the number.
5075  */
5076 Datum
to_hex64(PG_FUNCTION_ARGS)5077 to_hex64(PG_FUNCTION_ARGS)
5078 {
5079 	uint64		value = (uint64) PG_GETARG_INT64(0);
5080 	char	   *ptr;
5081 	const char *digits = "0123456789abcdef";
5082 	char		buf[32];		/* bigger than needed, but reasonable */
5083 
5084 	ptr = buf + sizeof(buf) - 1;
5085 	*ptr = '\0';
5086 
5087 	do
5088 	{
5089 		*--ptr = digits[value % HEXBASE];
5090 		value /= HEXBASE;
5091 	} while (ptr > buf && value);
5092 
5093 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
5094 }
5095 
5096 /*
5097  * Return the size of a datum, possibly compressed
5098  *
5099  * Works on any data type
5100  */
5101 Datum
pg_column_size(PG_FUNCTION_ARGS)5102 pg_column_size(PG_FUNCTION_ARGS)
5103 {
5104 	Datum		value = PG_GETARG_DATUM(0);
5105 	int32		result;
5106 	int			typlen;
5107 
5108 	/* On first call, get the input type's typlen, and save at *fn_extra */
5109 	if (fcinfo->flinfo->fn_extra == NULL)
5110 	{
5111 		/* Lookup the datatype of the supplied argument */
5112 		Oid			argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5113 
5114 		typlen = get_typlen(argtypeid);
5115 		if (typlen == 0)		/* should not happen */
5116 			elog(ERROR, "cache lookup failed for type %u", argtypeid);
5117 
5118 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5119 													  sizeof(int));
5120 		*((int *) fcinfo->flinfo->fn_extra) = typlen;
5121 	}
5122 	else
5123 		typlen = *((int *) fcinfo->flinfo->fn_extra);
5124 
5125 	if (typlen == -1)
5126 	{
5127 		/* varlena type, possibly toasted */
5128 		result = toast_datum_size(value);
5129 	}
5130 	else if (typlen == -2)
5131 	{
5132 		/* cstring */
5133 		result = strlen(DatumGetCString(value)) + 1;
5134 	}
5135 	else
5136 	{
5137 		/* ordinary fixed-width type */
5138 		result = typlen;
5139 	}
5140 
5141 	PG_RETURN_INT32(result);
5142 }
5143 
5144 /*
5145  * string_agg - Concatenates values and returns string.
5146  *
5147  * Syntax: string_agg(value text, delimiter text) RETURNS text
5148  *
5149  * Note: Any NULL values are ignored. The first-call delimiter isn't
5150  * actually used at all, and on subsequent calls the delimiter precedes
5151  * the associated value.
5152  */
5153 
5154 /* subroutine to initialize state */
5155 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)5156 makeStringAggState(FunctionCallInfo fcinfo)
5157 {
5158 	StringInfo	state;
5159 	MemoryContext aggcontext;
5160 	MemoryContext oldcontext;
5161 
5162 	if (!AggCheckCallContext(fcinfo, &aggcontext))
5163 	{
5164 		/* cannot be called directly because of internal-type argument */
5165 		elog(ERROR, "string_agg_transfn called in non-aggregate context");
5166 	}
5167 
5168 	/*
5169 	 * Create state in aggregate context.  It'll stay there across subsequent
5170 	 * calls.
5171 	 */
5172 	oldcontext = MemoryContextSwitchTo(aggcontext);
5173 	state = makeStringInfo();
5174 	MemoryContextSwitchTo(oldcontext);
5175 
5176 	return state;
5177 }
5178 
5179 Datum
string_agg_transfn(PG_FUNCTION_ARGS)5180 string_agg_transfn(PG_FUNCTION_ARGS)
5181 {
5182 	StringInfo	state;
5183 
5184 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5185 
5186 	/* Append the value unless null. */
5187 	if (!PG_ARGISNULL(1))
5188 	{
5189 		/* On the first time through, we ignore the delimiter. */
5190 		if (state == NULL)
5191 			state = makeStringAggState(fcinfo);
5192 		else if (!PG_ARGISNULL(2))
5193 			appendStringInfoText(state, PG_GETARG_TEXT_PP(2));	/* delimiter */
5194 
5195 		appendStringInfoText(state, PG_GETARG_TEXT_PP(1));	/* value */
5196 	}
5197 
5198 	/*
5199 	 * The transition type for string_agg() is declared to be "internal",
5200 	 * which is a pass-by-value type the same size as a pointer.
5201 	 */
5202 	PG_RETURN_POINTER(state);
5203 }
5204 
5205 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)5206 string_agg_finalfn(PG_FUNCTION_ARGS)
5207 {
5208 	StringInfo	state;
5209 
5210 	/* cannot be called directly because of internal-type argument */
5211 	Assert(AggCheckCallContext(fcinfo, NULL));
5212 
5213 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5214 
5215 	if (state != NULL)
5216 		PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
5217 	else
5218 		PG_RETURN_NULL();
5219 }
5220 
5221 /*
5222  * Prepare cache with fmgr info for the output functions of the datatypes of
5223  * the arguments of a concat-like function, beginning with argument "argidx".
5224  * (Arguments before that will have corresponding slots in the resulting
5225  * FmgrInfo array, but we don't fill those slots.)
5226  */
5227 static FmgrInfo *
build_concat_foutcache(FunctionCallInfo fcinfo,int argidx)5228 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5229 {
5230 	FmgrInfo   *foutcache;
5231 	int			i;
5232 
5233 	/* We keep the info in fn_mcxt so it survives across calls */
5234 	foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5235 												PG_NARGS() * sizeof(FmgrInfo));
5236 
5237 	for (i = argidx; i < PG_NARGS(); i++)
5238 	{
5239 		Oid			valtype;
5240 		Oid			typOutput;
5241 		bool		typIsVarlena;
5242 
5243 		valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5244 		if (!OidIsValid(valtype))
5245 			elog(ERROR, "could not determine data type of concat() input");
5246 
5247 		getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5248 		fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5249 	}
5250 
5251 	fcinfo->flinfo->fn_extra = foutcache;
5252 
5253 	return foutcache;
5254 }
5255 
5256 /*
5257  * Implementation of both concat() and concat_ws().
5258  *
5259  * sepstr is the separator string to place between values.
5260  * argidx identifies the first argument to concatenate (counting from zero);
5261  * note that this must be constant across any one series of calls.
5262  *
5263  * Returns NULL if result should be NULL, else text value.
5264  */
5265 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)5266 concat_internal(const char *sepstr, int argidx,
5267 				FunctionCallInfo fcinfo)
5268 {
5269 	text	   *result;
5270 	StringInfoData str;
5271 	FmgrInfo   *foutcache;
5272 	bool		first_arg = true;
5273 	int			i;
5274 
5275 	/*
5276 	 * concat(VARIADIC some-array) is essentially equivalent to
5277 	 * array_to_text(), ie concat the array elements with the given separator.
5278 	 * So we just pass the case off to that code.
5279 	 */
5280 	if (get_fn_expr_variadic(fcinfo->flinfo))
5281 	{
5282 		ArrayType  *arr;
5283 
5284 		/* Should have just the one argument */
5285 		Assert(argidx == PG_NARGS() - 1);
5286 
5287 		/* concat(VARIADIC NULL) is defined as NULL */
5288 		if (PG_ARGISNULL(argidx))
5289 			return NULL;
5290 
5291 		/*
5292 		 * Non-null argument had better be an array.  We assume that any call
5293 		 * context that could let get_fn_expr_variadic return true will have
5294 		 * checked that a VARIADIC-labeled parameter actually is an array.  So
5295 		 * it should be okay to just Assert that it's an array rather than
5296 		 * doing a full-fledged error check.
5297 		 */
5298 		Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5299 
5300 		/* OK, safe to fetch the array value */
5301 		arr = PG_GETARG_ARRAYTYPE_P(argidx);
5302 
5303 		/*
5304 		 * And serialize the array.  We tell array_to_text to ignore null
5305 		 * elements, which matches the behavior of the loop below.
5306 		 */
5307 		return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5308 	}
5309 
5310 	/* Normal case without explicit VARIADIC marker */
5311 	initStringInfo(&str);
5312 
5313 	/* Get output function info, building it if first time through */
5314 	foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5315 	if (foutcache == NULL)
5316 		foutcache = build_concat_foutcache(fcinfo, argidx);
5317 
5318 	for (i = argidx; i < PG_NARGS(); i++)
5319 	{
5320 		if (!PG_ARGISNULL(i))
5321 		{
5322 			Datum		value = PG_GETARG_DATUM(i);
5323 
5324 			/* add separator if appropriate */
5325 			if (first_arg)
5326 				first_arg = false;
5327 			else
5328 				appendStringInfoString(&str, sepstr);
5329 
5330 			/* call the appropriate type output function, append the result */
5331 			appendStringInfoString(&str,
5332 								   OutputFunctionCall(&foutcache[i], value));
5333 		}
5334 	}
5335 
5336 	result = cstring_to_text_with_len(str.data, str.len);
5337 	pfree(str.data);
5338 
5339 	return result;
5340 }
5341 
5342 /*
5343  * Concatenate all arguments. NULL arguments are ignored.
5344  */
5345 Datum
text_concat(PG_FUNCTION_ARGS)5346 text_concat(PG_FUNCTION_ARGS)
5347 {
5348 	text	   *result;
5349 
5350 	result = concat_internal("", 0, fcinfo);
5351 	if (result == NULL)
5352 		PG_RETURN_NULL();
5353 	PG_RETURN_TEXT_P(result);
5354 }
5355 
5356 /*
5357  * Concatenate all but first argument value with separators. The first
5358  * parameter is used as the separator. NULL arguments are ignored.
5359  */
5360 Datum
text_concat_ws(PG_FUNCTION_ARGS)5361 text_concat_ws(PG_FUNCTION_ARGS)
5362 {
5363 	char	   *sep;
5364 	text	   *result;
5365 
5366 	/* return NULL when separator is NULL */
5367 	if (PG_ARGISNULL(0))
5368 		PG_RETURN_NULL();
5369 	sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5370 
5371 	result = concat_internal(sep, 1, fcinfo);
5372 	if (result == NULL)
5373 		PG_RETURN_NULL();
5374 	PG_RETURN_TEXT_P(result);
5375 }
5376 
5377 /*
5378  * Return first n characters in the string. When n is negative,
5379  * return all but last |n| characters.
5380  */
5381 Datum
text_left(PG_FUNCTION_ARGS)5382 text_left(PG_FUNCTION_ARGS)
5383 {
5384 	int			n = PG_GETARG_INT32(1);
5385 
5386 	if (n < 0)
5387 	{
5388 		text	   *str = PG_GETARG_TEXT_PP(0);
5389 		const char *p = VARDATA_ANY(str);
5390 		int			len = VARSIZE_ANY_EXHDR(str);
5391 		int			rlen;
5392 
5393 		n = pg_mbstrlen_with_len(p, len) + n;
5394 		rlen = pg_mbcharcliplen(p, len, n);
5395 		PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5396 	}
5397 	else
5398 		PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5399 }
5400 
5401 /*
5402  * Return last n characters in the string. When n is negative,
5403  * return all but first |n| characters.
5404  */
5405 Datum
text_right(PG_FUNCTION_ARGS)5406 text_right(PG_FUNCTION_ARGS)
5407 {
5408 	text	   *str = PG_GETARG_TEXT_PP(0);
5409 	const char *p = VARDATA_ANY(str);
5410 	int			len = VARSIZE_ANY_EXHDR(str);
5411 	int			n = PG_GETARG_INT32(1);
5412 	int			off;
5413 
5414 	if (n < 0)
5415 		n = -n;
5416 	else
5417 		n = pg_mbstrlen_with_len(p, len) - n;
5418 	off = pg_mbcharcliplen(p, len, n);
5419 
5420 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5421 }
5422 
5423 /*
5424  * Return reversed string
5425  */
5426 Datum
text_reverse(PG_FUNCTION_ARGS)5427 text_reverse(PG_FUNCTION_ARGS)
5428 {
5429 	text	   *str = PG_GETARG_TEXT_PP(0);
5430 	const char *p = VARDATA_ANY(str);
5431 	int			len = VARSIZE_ANY_EXHDR(str);
5432 	const char *endp = p + len;
5433 	text	   *result;
5434 	char	   *dst;
5435 
5436 	result = palloc(len + VARHDRSZ);
5437 	dst = (char *) VARDATA(result) + len;
5438 	SET_VARSIZE(result, len + VARHDRSZ);
5439 
5440 	if (pg_database_encoding_max_length() > 1)
5441 	{
5442 		/* multibyte version */
5443 		while (p < endp)
5444 		{
5445 			int			sz;
5446 
5447 			sz = pg_mblen(p);
5448 			dst -= sz;
5449 			memcpy(dst, p, sz);
5450 			p += sz;
5451 		}
5452 	}
5453 	else
5454 	{
5455 		/* single byte version */
5456 		while (p < endp)
5457 			*(--dst) = *p++;
5458 	}
5459 
5460 	PG_RETURN_TEXT_P(result);
5461 }
5462 
5463 
/*
 * Support macros for text_format()
 */

/* Bit set in the "flags" bitmask when a '-' flag was seen in a specifier */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Advance "ptr" by one byte; if that brings it to (or past) "end_ptr",
 * raise an "unterminated format() type specifier" error instead of letting
 * the caller read beyond the format string.
 *
 * "ptr" is pre-incremented in place, so the argument must be a modifiable
 * lvalue.  The do { ... } while (0) wrapper lets the macro be used like a
 * single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5477 
/*
 * text_format - returns a formatted string
 *
 * Argument 0 is the format string; if it is NULL the result is NULL.  The
 * format string is copied to the output, with each conversion specifier
 * replaced by a formatted argument:
 *	%%			emits a single '%'
 *	%s, %I, %L	are handled by text_format_string_conversion()
 * The optional argument-position, flags and width parts of a specifier are
 * parsed by text_format_parse_format().
 *
 * The values to format are either the remaining function arguments or, when
 * the call is marked VARIADIC, the elements of the array passed as argument
 * 1 (a NULL array is treated as a zero-length array).
 *
 * Errors are raised for an unterminated or unrecognized specifier, and for
 * a specifier that refers to an argument that was not supplied.
 */
Datum
text_format(PG_FUNCTION_ARGS)
{
	text	   *fmt;
	StringInfoData str;
	const char *cp;
	const char *start_ptr;
	const char *end_ptr;
	text	   *result;
	int			arg;
	bool		funcvariadic;
	int			nargs;
	Datum	   *elements = NULL;
	bool	   *nulls = NULL;
	Oid			element_type = InvalidOid;
	Oid			prev_type = InvalidOid;
	Oid			prev_width_type = InvalidOid;
	FmgrInfo	typoutputfinfo;
	FmgrInfo	typoutputinfo_width;

	/* When format string is null, immediately return null */
	if (PG_ARGISNULL(0))
		PG_RETURN_NULL();

	/* If argument is marked VARIADIC, expand array into elements */
	if (get_fn_expr_variadic(fcinfo->flinfo))
	{
		ArrayType  *arr;
		int16		elmlen;
		bool		elmbyval;
		char		elmalign;
		int			nitems;

		/* Should have just the one argument */
		Assert(PG_NARGS() == 2);

		/* If argument is NULL, we treat it as zero-length array */
		if (PG_ARGISNULL(1))
			nitems = 0;
		else
		{
			/*
			 * Non-null argument had better be an array.  We assume that any
			 * call context that could let get_fn_expr_variadic return true
			 * will have checked that a VARIADIC-labeled parameter actually is
			 * an array.  So it should be okay to just Assert that it's an
			 * array rather than doing a full-fledged error check.
			 */
			Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));

			/* OK, safe to fetch the array value */
			arr = PG_GETARG_ARRAYTYPE_P(1);

			/* Get info about array element type */
			element_type = ARR_ELEMTYPE(arr);
			get_typlenbyvalalign(element_type,
								 &elmlen, &elmbyval, &elmalign);

			/* Extract all array elements */
			deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
							  &elements, &nulls, &nitems);
		}

		/*
		 * Count the format string too, so that "arg" positions are compared
		 * against nargs the same way in the variadic and non-variadic cases.
		 */
		nargs = nitems + 1;
		funcvariadic = true;
	}
	else
	{
		/* Non-variadic case, we'll process the arguments individually */
		nargs = PG_NARGS();
		funcvariadic = false;
	}

	/* Setup for main loop. */
	fmt = PG_GETARG_TEXT_PP(0);
	start_ptr = VARDATA_ANY(fmt);
	end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
	initStringInfo(&str);
	arg = 1;					/* next argument position to print */

	/* Scan format string, looking for conversion specifiers. */
	for (cp = start_ptr; cp < end_ptr; cp++)
	{
		int			argpos;
		int			widthpos;
		int			flags;
		int			width;
		Datum		value;
		bool		isNull;
		Oid			typid;

		/*
		 * If it's not the start of a conversion specifier, just copy it to
		 * the output buffer.
		 */
		if (*cp != '%')
		{
			appendStringInfoCharMacro(&str, *cp);
			continue;
		}

		/* Step past the '%'; errors out if nothing follows it */
		ADVANCE_PARSE_POINTER(cp, end_ptr);

		/* Easy case: %% outputs a single % */
		if (*cp == '%')
		{
			appendStringInfoCharMacro(&str, *cp);
			continue;
		}

		/* Parse the optional portions of the format specifier */
		cp = text_format_parse_format(cp, end_ptr,
									  &argpos, &widthpos,
									  &flags, &width);

		/*
		 * Next we should see the main conversion specifier.  Whether or not
		 * an argument position was present, it's known that at least one
		 * character remains in the string at this point.  Experience suggests
		 * that it's worth checking that that character is one of the expected
		 * ones before we try to fetch arguments, so as to produce the least
		 * confusing response to a mis-formatted specifier.
		 */
		if (strchr("sIL", *cp) == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized format() type specifier \"%c\"",
							*cp),
					 errhint("For a single \"%%\" use \"%%%%\".")));

		/* If indirect width was specified, get its value */
		if (widthpos >= 0)
		{
			/* Collect the specified or next argument position */
			if (widthpos > 0)
				arg = widthpos;
			if (arg >= nargs)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("too few arguments for format()")));

			/* Get the value and type of the selected argument */
			if (!funcvariadic)
			{
				value = PG_GETARG_DATUM(arg);
				isNull = PG_ARGISNULL(arg);
				typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
			}
			else
			{
				/* array elements are 0-based, argument positions 1-based */
				value = elements[arg - 1];
				isNull = nulls[arg - 1];
				typid = element_type;
			}
			if (!OidIsValid(typid))
				elog(ERROR, "could not determine data type of format() input");

			/* the width argument is consumed; the value comes from the next */
			arg++;

			/* We can treat NULL width the same as zero */
			if (isNull)
				width = 0;
			else if (typid == INT4OID)
				width = DatumGetInt32(value);
			else if (typid == INT2OID)
				width = DatumGetInt16(value);
			else
			{
				/* For less-usual datatypes, convert to text then to int */

				/*
				 * Note: within this block "str" is this local C string, which
				 * shadows the function-level StringInfoData of the same name.
				 */
				char	   *str;

				/* Reuse the width output function if the type is unchanged */
				if (typid != prev_width_type)
				{
					Oid			typoutputfunc;
					bool		typIsVarlena;

					getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
					fmgr_info(typoutputfunc, &typoutputinfo_width);
					prev_width_type = typid;
				}

				str = OutputFunctionCall(&typoutputinfo_width, value);

				/* pg_strtoint32 will complain about bad data or overflow */
				width = pg_strtoint32(str);

				pfree(str);
			}
		}

		/* Collect the specified or next argument position */
		if (argpos > 0)
			arg = argpos;
		if (arg >= nargs)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("too few arguments for format()")));

		/* Get the value and type of the selected argument */
		if (!funcvariadic)
		{
			value = PG_GETARG_DATUM(arg);
			isNull = PG_ARGISNULL(arg);
			typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
		}
		else
		{
			/* array elements are 0-based, argument positions 1-based */
			value = elements[arg - 1];
			isNull = nulls[arg - 1];
			typid = element_type;
		}
		if (!OidIsValid(typid))
			elog(ERROR, "could not determine data type of format() input");

		/* an unnumbered specifier that follows will take the next argument */
		arg++;

		/*
		 * Get the appropriate typOutput function, reusing previous one if
		 * same type as previous argument.  That's particularly useful in the
		 * variadic-array case, but often saves work even for ordinary calls.
		 */
		if (typid != prev_type)
		{
			Oid			typoutputfunc;
			bool		typIsVarlena;

			getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
			fmgr_info(typoutputfunc, &typoutputfinfo);
			prev_type = typid;
		}

		/*
		 * And now we can format the value.
		 */
		switch (*cp)
		{
			case 's':
			case 'I':
			case 'L':
				text_format_string_conversion(&str, *cp, &typoutputfinfo,
											  value, isNull,
											  flags, width);
				break;
			default:
				/* should not get here, because of previous check */
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("unrecognized format() type specifier \"%c\"",
								*cp),
						 errhint("For a single \"%%\" use \"%%%%\".")));
				break;
		}
	}

	/* Don't need deconstruct_array results anymore. */
	if (elements != NULL)
		pfree(elements);
	if (nulls != NULL)
		pfree(nulls);

	/* Generate results. */
	result = cstring_to_text_with_len(str.data, str.len);
	pfree(str.data);

	PG_RETURN_TEXT_P(result);
}
5747 
5748 /*
5749  * Parse contiguous digits as a decimal number.
5750  *
5751  * Returns true if some digits could be parsed.
5752  * The value is returned into *value, and *ptr is advanced to the next
5753  * character to be parsed.
5754  *
5755  * Note parsing invariant: at least one character is known available before
5756  * string end (end_ptr) at entry, and this is still true at exit.
5757  */
5758 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5759 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5760 {
5761 	bool		found = false;
5762 	const char *cp = *ptr;
5763 	int			val = 0;
5764 
5765 	while (*cp >= '0' && *cp <= '9')
5766 	{
5767 		int8		digit = (*cp - '0');
5768 
5769 		if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5770 			unlikely(pg_add_s32_overflow(val, digit, &val)))
5771 			ereport(ERROR,
5772 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5773 					 errmsg("number is out of range")));
5774 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5775 		found = true;
5776 	}
5777 
5778 	*ptr = cp;
5779 	*value = val;
5780 
5781 	return found;
5782 }
5783 
5784 /*
5785  * Parse a format specifier (generally following the SUS printf spec).
5786  *
5787  * We have already advanced over the initial '%', and we are looking for
5788  * [argpos][flags][width]type (but the type character is not consumed here).
5789  *
5790  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5791  * Output parameters:
5792  *	argpos: argument position for value to be printed.  -1 means unspecified.
5793  *	widthpos: argument position for width.  Zero means the argument position
5794  *			was unspecified (ie, take the next arg) and -1 means no width
5795  *			argument (width was omitted or specified as a constant).
5796  *	flags: bitmask of flags.
5797  *	width: directly-specified width value.  Zero means the width was omitted
5798  *			(note it's not necessary to distinguish this case from an explicit
5799  *			zero width value).
5800  *
5801  * The function result is the next character position to be parsed, ie, the
5802  * location where the type character is/should be.
5803  *
5804  * Note parsing invariant: at least one character is known available before
5805  * string end (end_ptr) at entry, and this is still true at exit.
5806  */
5807 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5808 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5809 						 int *argpos, int *widthpos,
5810 						 int *flags, int *width)
5811 {
5812 	const char *cp = start_ptr;
5813 	int			n;
5814 
5815 	/* set defaults for output parameters */
5816 	*argpos = -1;
5817 	*widthpos = -1;
5818 	*flags = 0;
5819 	*width = 0;
5820 
5821 	/* try to identify first number */
5822 	if (text_format_parse_digits(&cp, end_ptr, &n))
5823 	{
5824 		if (*cp != '$')
5825 		{
5826 			/* Must be just a width and a type, so we're done */
5827 			*width = n;
5828 			return cp;
5829 		}
5830 		/* The number was argument position */
5831 		*argpos = n;
5832 		/* Explicit 0 for argument index is immediately refused */
5833 		if (n == 0)
5834 			ereport(ERROR,
5835 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5836 					 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5837 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5838 	}
5839 
5840 	/* Handle flags (only minus is supported now) */
5841 	while (*cp == '-')
5842 	{
5843 		*flags |= TEXT_FORMAT_FLAG_MINUS;
5844 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5845 	}
5846 
5847 	if (*cp == '*')
5848 	{
5849 		/* Handle indirect width */
5850 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5851 		if (text_format_parse_digits(&cp, end_ptr, &n))
5852 		{
5853 			/* number in this position must be closed by $ */
5854 			if (*cp != '$')
5855 				ereport(ERROR,
5856 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5857 						 errmsg("width argument position must be ended by \"$\"")));
5858 			/* The number was width argument position */
5859 			*widthpos = n;
5860 			/* Explicit 0 for argument index is immediately refused */
5861 			if (n == 0)
5862 				ereport(ERROR,
5863 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5864 						 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5865 			ADVANCE_PARSE_POINTER(cp, end_ptr);
5866 		}
5867 		else
5868 			*widthpos = 0;		/* width's argument position is unspecified */
5869 	}
5870 	else
5871 	{
5872 		/* Check for direct width specification */
5873 		if (text_format_parse_digits(&cp, end_ptr, &n))
5874 			*width = n;
5875 	}
5876 
5877 	/* cp should now be pointing at type character */
5878 	return cp;
5879 }
5880 
5881 /*
5882  * Format a %s, %I, or %L conversion
5883  */
5884 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5885 text_format_string_conversion(StringInfo buf, char conversion,
5886 							  FmgrInfo *typOutputInfo,
5887 							  Datum value, bool isNull,
5888 							  int flags, int width)
5889 {
5890 	char	   *str;
5891 
5892 	/* Handle NULL arguments before trying to stringify the value. */
5893 	if (isNull)
5894 	{
5895 		if (conversion == 's')
5896 			text_format_append_string(buf, "", flags, width);
5897 		else if (conversion == 'L')
5898 			text_format_append_string(buf, "NULL", flags, width);
5899 		else if (conversion == 'I')
5900 			ereport(ERROR,
5901 					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5902 					 errmsg("null values cannot be formatted as an SQL identifier")));
5903 		return;
5904 	}
5905 
5906 	/* Stringify. */
5907 	str = OutputFunctionCall(typOutputInfo, value);
5908 
5909 	/* Escape. */
5910 	if (conversion == 'I')
5911 	{
5912 		/* quote_identifier may or may not allocate a new string. */
5913 		text_format_append_string(buf, quote_identifier(str), flags, width);
5914 	}
5915 	else if (conversion == 'L')
5916 	{
5917 		char	   *qstr = quote_literal_cstr(str);
5918 
5919 		text_format_append_string(buf, qstr, flags, width);
5920 		/* quote_literal_cstr() always allocates a new string */
5921 		pfree(qstr);
5922 	}
5923 	else
5924 		text_format_append_string(buf, str, flags, width);
5925 
5926 	/* Cleanup. */
5927 	pfree(str);
5928 }
5929 
5930 /*
5931  * Append str to buf, padding as directed by flags/width
5932  */
5933 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5934 text_format_append_string(StringInfo buf, const char *str,
5935 						  int flags, int width)
5936 {
5937 	bool		align_to_left = false;
5938 	int			len;
5939 
5940 	/* fast path for typical easy case */
5941 	if (width == 0)
5942 	{
5943 		appendStringInfoString(buf, str);
5944 		return;
5945 	}
5946 
5947 	if (width < 0)
5948 	{
5949 		/* Negative width: implicit '-' flag, then take absolute value */
5950 		align_to_left = true;
5951 		/* -INT_MIN is undefined */
5952 		if (width <= INT_MIN)
5953 			ereport(ERROR,
5954 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5955 					 errmsg("number is out of range")));
5956 		width = -width;
5957 	}
5958 	else if (flags & TEXT_FORMAT_FLAG_MINUS)
5959 		align_to_left = true;
5960 
5961 	len = pg_mbstrlen(str);
5962 	if (align_to_left)
5963 	{
5964 		/* left justify */
5965 		appendStringInfoString(buf, str);
5966 		if (len < width)
5967 			appendStringInfoSpaces(buf, width - len);
5968 	}
5969 	else
5970 	{
5971 		/* right justify */
5972 		if (len < width)
5973 			appendStringInfoSpaces(buf, width - len);
5974 		appendStringInfoString(buf, str);
5975 	}
5976 }
5977 
/*
 * text_format_nv - nonvariadic wrapper for text_format function.
 *
 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
 * which checks that all built-in functions that share the implementing C
 * function take the same number of arguments.
 *
 * It adds no behavior of its own: the result is whatever text_format()
 * returns for the same call info.
 */
Datum
text_format_nv(PG_FUNCTION_ARGS)
{
	/* pass our own fcinfo straight through */
	return text_format(fcinfo);
}
5990 
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first "len" bytes of s1 and s2 are equal, comparing from the last byte
 * backwards.  A len of zero or less yields true.  Faster than memcmp(),
 * for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6006 
6007 /* Expand each Levenshtein distance variant */
6008 #include "levenshtein.c"
6009 #define LEVENSHTEIN_LESS_EQUAL
6010 #include "levenshtein.c"
6011 
6012 
6013 /*
6014  * Unicode support
6015  */
6016 
6017 static UnicodeNormalizationForm
unicode_norm_form_from_string(const char * formstr)6018 unicode_norm_form_from_string(const char *formstr)
6019 {
6020 	UnicodeNormalizationForm form = -1;
6021 
6022 	/*
6023 	 * Might as well check this while we're here.
6024 	 */
6025 	if (GetDatabaseEncoding() != PG_UTF8)
6026 		ereport(ERROR,
6027 				(errcode(ERRCODE_SYNTAX_ERROR),
6028 				 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6029 
6030 	if (pg_strcasecmp(formstr, "NFC") == 0)
6031 		form = UNICODE_NFC;
6032 	else if (pg_strcasecmp(formstr, "NFD") == 0)
6033 		form = UNICODE_NFD;
6034 	else if (pg_strcasecmp(formstr, "NFKC") == 0)
6035 		form = UNICODE_NFKC;
6036 	else if (pg_strcasecmp(formstr, "NFKD") == 0)
6037 		form = UNICODE_NFKD;
6038 	else
6039 		ereport(ERROR,
6040 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6041 				 errmsg("invalid normalization form: %s", formstr)));
6042 
6043 	return form;
6044 }
6045 
6046 Datum
unicode_normalize_func(PG_FUNCTION_ARGS)6047 unicode_normalize_func(PG_FUNCTION_ARGS)
6048 {
6049 	text	   *input = PG_GETARG_TEXT_PP(0);
6050 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6051 	UnicodeNormalizationForm form;
6052 	int			size;
6053 	pg_wchar   *input_chars;
6054 	pg_wchar   *output_chars;
6055 	unsigned char *p;
6056 	text	   *result;
6057 	int			i;
6058 
6059 	form = unicode_norm_form_from_string(formstr);
6060 
6061 	/* convert to pg_wchar */
6062 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6063 	input_chars = palloc((size + 1) * sizeof(pg_wchar));
6064 	p = (unsigned char *) VARDATA_ANY(input);
6065 	for (i = 0; i < size; i++)
6066 	{
6067 		input_chars[i] = utf8_to_unicode(p);
6068 		p += pg_utf_mblen(p);
6069 	}
6070 	input_chars[i] = (pg_wchar) '\0';
6071 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6072 
6073 	/* action */
6074 	output_chars = unicode_normalize(form, input_chars);
6075 
6076 	/* convert back to UTF-8 string */
6077 	size = 0;
6078 	for (pg_wchar *wp = output_chars; *wp; wp++)
6079 	{
6080 		unsigned char buf[4];
6081 
6082 		unicode_to_utf8(*wp, buf);
6083 		size += pg_utf_mblen(buf);
6084 	}
6085 
6086 	result = palloc(size + VARHDRSZ);
6087 	SET_VARSIZE(result, size + VARHDRSZ);
6088 
6089 	p = (unsigned char *) VARDATA_ANY(result);
6090 	for (pg_wchar *wp = output_chars; *wp; wp++)
6091 	{
6092 		unicode_to_utf8(*wp, p);
6093 		p += pg_utf_mblen(p);
6094 	}
6095 	Assert((char *) p == (char *) result + size + VARHDRSZ);
6096 
6097 	PG_RETURN_TEXT_P(result);
6098 }
6099 
6100 /*
6101  * Check whether the string is in the specified Unicode normalization form.
6102  *
6103  * This is done by convering the string to the specified normal form and then
6104  * comparing that to the original string.  To speed that up, we also apply the
6105  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6106  * answer for many strings by just scanning the string once.
6107  *
6108  * This function should generally be optimized for the case where the string
6109  * is in fact normalized.  In that case, we'll end up looking at the entire
6110  * string, so it's probably not worth doing any incremental conversion etc.
6111  */
6112 Datum
unicode_is_normalized(PG_FUNCTION_ARGS)6113 unicode_is_normalized(PG_FUNCTION_ARGS)
6114 {
6115 	text	   *input = PG_GETARG_TEXT_PP(0);
6116 	char	   *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6117 	UnicodeNormalizationForm form;
6118 	int			size;
6119 	pg_wchar   *input_chars;
6120 	pg_wchar   *output_chars;
6121 	unsigned char *p;
6122 	int			i;
6123 	UnicodeNormalizationQC quickcheck;
6124 	int			output_size;
6125 	bool		result;
6126 
6127 	form = unicode_norm_form_from_string(formstr);
6128 
6129 	/* convert to pg_wchar */
6130 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6131 	input_chars = palloc((size + 1) * sizeof(pg_wchar));
6132 	p = (unsigned char *) VARDATA_ANY(input);
6133 	for (i = 0; i < size; i++)
6134 	{
6135 		input_chars[i] = utf8_to_unicode(p);
6136 		p += pg_utf_mblen(p);
6137 	}
6138 	input_chars[i] = (pg_wchar) '\0';
6139 	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6140 
6141 	/* quick check (see UAX #15) */
6142 	quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6143 	if (quickcheck == UNICODE_NORM_QC_YES)
6144 		PG_RETURN_BOOL(true);
6145 	else if (quickcheck == UNICODE_NORM_QC_NO)
6146 		PG_RETURN_BOOL(false);
6147 
6148 	/* normalize and compare with original */
6149 	output_chars = unicode_normalize(form, input_chars);
6150 
6151 	output_size = 0;
6152 	for (pg_wchar *wp = output_chars; *wp; wp++)
6153 		output_size++;
6154 
6155 	result = (size == output_size) &&
6156 		(memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6157 
6158 	PG_RETURN_BOOL(result);
6159 }
6160