1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  *	  Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "access/hash.h"
21 #include "access/tuptoaster.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/md5.h"
25 #include "lib/hyperloglog.h"
26 #include "libpq/pqformat.h"
27 #include "miscadmin.h"
28 #include "parser/scansup.h"
29 #include "port/pg_bswap.h"
30 #include "regex/regex.h"
31 #include "utils/builtins.h"
32 #include "utils/bytea.h"
33 #include "utils/lsyscache.h"
34 #include "utils/memutils.h"
35 #include "utils/pg_locale.h"
36 #include "utils/sortsupport.h"
37 #include "utils/varlena.h"
38 
39 
/*
 * GUC variable: selects the textual format produced by byteaout().
 * It starts out as BYTEA_OUTPUT_HEX; byteaout() also understands
 * BYTEA_OUTPUT_ESCAPE and rejects any other value.
 */
int			bytea_output = BYTEA_OUTPUT_HEX;

/*
 * File-local aliases for struct varlena.  They are the pointer targets of
 * the DatumGetUnknownP() and DatumGetVarStringP() macro families defined
 * further down in this file.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
45 
/*
 * Working state for the substring search routines text_position_setup(),
 * text_position_next() and text_position_cleanup(), each of which takes a
 * pointer to this struct.
 */
typedef struct
{
	bool		use_wchar;		/* T if multibyte encoding */
	char	   *str1;			/* use these if not use_wchar */
	char	   *str2;			/* note: these point to original texts */
	pg_wchar   *wstr1;			/* use these if use_wchar */
	pg_wchar   *wstr2;			/* note: these are palloc'd */
	int			len1;			/* string lengths in logical characters */
	int			len2;
	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */
} TextPositionState;
59 
/*
 * Scratch buffers, cached comparison results and cardinality estimators
 * used while sorting string values.
 *
 * NOTE(review): presumably this is the private state of the
 * varstrfastcmp_*() / varstr_abbrev_*() routines declared below -- confirm
 * against their definitions, which are not visible from this part of the
 * file.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;
	int			buflen2;
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;
	bool		bpchar;			/* Sorting bpchar, not varchar/text/bytea? */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;
} VarStringSortSupport;
78 
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Accessors for "unknown" values, mirroring the usual fmgr macro families:
 * convert a Datum (or the n'th function argument) to an unknown pointer via
 * PG_DETOAST_DATUM / PG_DETOAST_DATUM_COPY, and return such a pointer.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * Datum -> VarString conversions; the "PP" form goes through
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
93 
/*
 * Forward declarations of the file-local (static) helpers.
 */

/* sort-support comparators and abbreviated-key routines */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* text workhorses shared by several SQL-callable functions */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
			   int32 start,
			   int32 length,
			   bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int	text_position(text *t1, text *t2);
static void text_position_setup(text *t1, text *t2, TextPositionState *state);
static int	text_position_next(int start_pos, TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static int	text_cmp(text *arg1, text *arg2, Oid collid);

/* bytea counterparts */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
				int S,
				int L,
				bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* StringInfo, array and string_agg helpers */
static void appendStringInfoText(StringInfo str, const text *t);
static Datum text_to_array_internal(PG_FUNCTION_ARGS);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
					   const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);

/* text_format_* helpers */
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
						 int *value);
static const char *text_format_parse_format(const char *start_ptr,
						 const char *end_ptr,
						 int *argpos, int *widthpos,
						 int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
							  FmgrInfo *typOutputInfo,
							  Datum value, bool isNull,
							  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
						  int flags, int width);
135 
136 
137 /*****************************************************************************
138  *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
139  *****************************************************************************/
140 
141 /*
142  * cstring_to_text
143  *
144  * Create a text value from a null-terminated C string.
145  *
146  * The new text value is freshly palloc'd with a full-size VARHDR.
147  */
148 text *
cstring_to_text(const char * s)149 cstring_to_text(const char *s)
150 {
151 	return cstring_to_text_with_len(s, strlen(s));
152 }
153 
154 /*
155  * cstring_to_text_with_len
156  *
157  * Same as cstring_to_text except the caller specifies the string length;
158  * the string need not be null_terminated.
159  */
160 text *
cstring_to_text_with_len(const char * s,int len)161 cstring_to_text_with_len(const char *s, int len)
162 {
163 	text	   *result = (text *) palloc(len + VARHDRSZ);
164 
165 	SET_VARSIZE(result, len + VARHDRSZ);
166 	memcpy(VARDATA(result), s, len);
167 
168 	return result;
169 }
170 
171 /*
172  * text_to_cstring
173  *
174  * Create a palloc'd, null-terminated C string from a text value.
175  *
176  * We support being passed a compressed or toasted text value.
177  * This is a bit bogus since such values shouldn't really be referred to as
178  * "text *", but it seems useful for robustness.  If we didn't handle that
179  * case here, we'd need another routine that did, anyway.
180  */
181 char *
text_to_cstring(const text * t)182 text_to_cstring(const text *t)
183 {
184 	/* must cast away the const, unfortunately */
185 	text	   *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
186 	int			len = VARSIZE_ANY_EXHDR(tunpacked);
187 	char	   *result;
188 
189 	result = (char *) palloc(len + 1);
190 	memcpy(result, VARDATA_ANY(tunpacked), len);
191 	result[len] = '\0';
192 
193 	if (tunpacked != t)
194 		pfree(tunpacked);
195 
196 	return result;
197 }
198 
199 /*
200  * text_to_cstring_buffer
201  *
202  * Copy a text value into a caller-supplied buffer of size dst_len.
203  *
204  * The text string is truncated if necessary to fit.  The result is
205  * guaranteed null-terminated (unless dst_len == 0).
206  *
207  * We support being passed a compressed or toasted text value.
208  * This is a bit bogus since such values shouldn't really be referred to as
209  * "text *", but it seems useful for robustness.  If we didn't handle that
210  * case here, we'd need another routine that did, anyway.
211  */
212 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)213 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
214 {
215 	/* must cast away the const, unfortunately */
216 	text	   *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
217 	size_t		src_len = VARSIZE_ANY_EXHDR(srcunpacked);
218 
219 	if (dst_len > 0)
220 	{
221 		dst_len--;
222 		if (dst_len >= src_len)
223 			dst_len = src_len;
224 		else					/* ensure truncation is encoding-safe */
225 			dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
226 		memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
227 		dst[dst_len] = '\0';
228 	}
229 
230 	if (srcunpacked != src)
231 		pfree(srcunpacked);
232 }
233 
234 
235 /*****************************************************************************
236  *	 USER I/O ROUTINES														 *
237  *****************************************************************************/
238 
239 
/* VAL: numeric value of an ASCII digit character */
#define VAL(digit_char)	((digit_char) - '0')
/* DIG: ASCII digit character for a small numeric value */
#define DIG(digit_val)	((digit_val) + '0')
242 
243 /*
244  *		byteain			- converts from printable representation of byte array
245  *
246  *		Non-printable characters must be passed as '\nnn' (octal) and are
247  *		converted to internal form.  '\' must be passed as '\\'.
248  *		ereport(ERROR, ...) if bad form.
249  *
250  *		BUGS:
251  *				The input is scanned twice.
252  *				The error checking of input is minimal.
253  */
254 Datum
byteain(PG_FUNCTION_ARGS)255 byteain(PG_FUNCTION_ARGS)
256 {
257 	char	   *inputText = PG_GETARG_CSTRING(0);
258 	char	   *tp;
259 	char	   *rp;
260 	int			bc;
261 	bytea	   *result;
262 
263 	/* Recognize hex input */
264 	if (inputText[0] == '\\' && inputText[1] == 'x')
265 	{
266 		size_t		len = strlen(inputText);
267 
268 		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
269 		result = palloc(bc);
270 		bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
271 		SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
272 
273 		PG_RETURN_BYTEA_P(result);
274 	}
275 
276 	/* Else, it's the traditional escaped style */
277 	for (bc = 0, tp = inputText; *tp != '\0'; bc++)
278 	{
279 		if (tp[0] != '\\')
280 			tp++;
281 		else if ((tp[0] == '\\') &&
282 				 (tp[1] >= '0' && tp[1] <= '3') &&
283 				 (tp[2] >= '0' && tp[2] <= '7') &&
284 				 (tp[3] >= '0' && tp[3] <= '7'))
285 			tp += 4;
286 		else if ((tp[0] == '\\') &&
287 				 (tp[1] == '\\'))
288 			tp += 2;
289 		else
290 		{
291 			/*
292 			 * one backslash, not followed by another or ### valid octal
293 			 */
294 			ereport(ERROR,
295 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
296 					 errmsg("invalid input syntax for type %s", "bytea")));
297 		}
298 	}
299 
300 	bc += VARHDRSZ;
301 
302 	result = (bytea *) palloc(bc);
303 	SET_VARSIZE(result, bc);
304 
305 	tp = inputText;
306 	rp = VARDATA(result);
307 	while (*tp != '\0')
308 	{
309 		if (tp[0] != '\\')
310 			*rp++ = *tp++;
311 		else if ((tp[0] == '\\') &&
312 				 (tp[1] >= '0' && tp[1] <= '3') &&
313 				 (tp[2] >= '0' && tp[2] <= '7') &&
314 				 (tp[3] >= '0' && tp[3] <= '7'))
315 		{
316 			bc = VAL(tp[1]);
317 			bc <<= 3;
318 			bc += VAL(tp[2]);
319 			bc <<= 3;
320 			*rp++ = bc + VAL(tp[3]);
321 
322 			tp += 4;
323 		}
324 		else if ((tp[0] == '\\') &&
325 				 (tp[1] == '\\'))
326 		{
327 			*rp++ = '\\';
328 			tp += 2;
329 		}
330 		else
331 		{
332 			/*
333 			 * We should never get here. The first pass should not allow it.
334 			 */
335 			ereport(ERROR,
336 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
337 					 errmsg("invalid input syntax for type %s", "bytea")));
338 		}
339 	}
340 
341 	PG_RETURN_BYTEA_P(result);
342 }
343 
344 /*
345  *		byteaout		- converts to printable representation of byte array
346  *
347  *		In the traditional escaped format, non-printable characters are
348  *		printed as '\nnn' (octal) and '\' as '\\'.
349  */
350 Datum
byteaout(PG_FUNCTION_ARGS)351 byteaout(PG_FUNCTION_ARGS)
352 {
353 	bytea	   *vlena = PG_GETARG_BYTEA_PP(0);
354 	char	   *result;
355 	char	   *rp;
356 
357 	if (bytea_output == BYTEA_OUTPUT_HEX)
358 	{
359 		/* Print hex format */
360 		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
361 		*rp++ = '\\';
362 		*rp++ = 'x';
363 		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
364 	}
365 	else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
366 	{
367 		/* Print traditional escaped format */
368 		char	   *vp;
369 		int			len;
370 		int			i;
371 
372 		len = 1;				/* empty string has 1 char */
373 		vp = VARDATA_ANY(vlena);
374 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
375 		{
376 			if (*vp == '\\')
377 				len += 2;
378 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
379 				len += 4;
380 			else
381 				len++;
382 		}
383 		rp = result = (char *) palloc(len);
384 		vp = VARDATA_ANY(vlena);
385 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
386 		{
387 			if (*vp == '\\')
388 			{
389 				*rp++ = '\\';
390 				*rp++ = '\\';
391 			}
392 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
393 			{
394 				int			val;	/* holds unprintable chars */
395 
396 				val = *vp;
397 				rp[0] = '\\';
398 				rp[3] = DIG(val & 07);
399 				val >>= 3;
400 				rp[2] = DIG(val & 07);
401 				val >>= 3;
402 				rp[1] = DIG(val & 03);
403 				rp += 4;
404 			}
405 			else
406 				*rp++ = *vp;
407 		}
408 	}
409 	else
410 	{
411 		elog(ERROR, "unrecognized bytea_output setting: %d",
412 			 bytea_output);
413 		rp = result = NULL;		/* keep compiler quiet */
414 	}
415 	*rp = '\0';
416 	PG_RETURN_CSTRING(result);
417 }
418 
419 /*
420  *		bytearecv			- converts external binary format to bytea
421  */
422 Datum
bytearecv(PG_FUNCTION_ARGS)423 bytearecv(PG_FUNCTION_ARGS)
424 {
425 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
426 	bytea	   *result;
427 	int			nbytes;
428 
429 	nbytes = buf->len - buf->cursor;
430 	result = (bytea *) palloc(nbytes + VARHDRSZ);
431 	SET_VARSIZE(result, nbytes + VARHDRSZ);
432 	pq_copymsgbytes(buf, VARDATA(result), nbytes);
433 	PG_RETURN_BYTEA_P(result);
434 }
435 
436 /*
437  *		byteasend			- converts bytea to binary format
438  *
439  * This is a special case: just copy the input...
440  */
441 Datum
byteasend(PG_FUNCTION_ARGS)442 byteasend(PG_FUNCTION_ARGS)
443 {
444 	bytea	   *vlena = PG_GETARG_BYTEA_P_COPY(0);
445 
446 	PG_RETURN_BYTEA_P(vlena);
447 }
448 
449 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)450 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
451 {
452 	StringInfo	state;
453 
454 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
455 
456 	/* Append the value unless null. */
457 	if (!PG_ARGISNULL(1))
458 	{
459 		bytea	   *value = PG_GETARG_BYTEA_PP(1);
460 
461 		/* On the first time through, we ignore the delimiter. */
462 		if (state == NULL)
463 			state = makeStringAggState(fcinfo);
464 		else if (!PG_ARGISNULL(2))
465 		{
466 			bytea	   *delim = PG_GETARG_BYTEA_PP(2);
467 
468 			appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
469 		}
470 
471 		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
472 	}
473 
474 	/*
475 	 * The transition type for string_agg() is declared to be "internal",
476 	 * which is a pass-by-value type the same size as a pointer.
477 	 */
478 	PG_RETURN_POINTER(state);
479 }
480 
481 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)482 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
483 {
484 	StringInfo	state;
485 
486 	/* cannot be called directly because of internal-type argument */
487 	Assert(AggCheckCallContext(fcinfo, NULL));
488 
489 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
490 
491 	if (state != NULL)
492 	{
493 		bytea	   *result;
494 
495 		result = (bytea *) palloc(state->len + VARHDRSZ);
496 		SET_VARSIZE(result, state->len + VARHDRSZ);
497 		memcpy(VARDATA(result), state->data, state->len);
498 		PG_RETURN_BYTEA_P(result);
499 	}
500 	else
501 		PG_RETURN_NULL();
502 }
503 
504 /*
505  *		textin			- converts "..." to internal representation
506  */
507 Datum
textin(PG_FUNCTION_ARGS)508 textin(PG_FUNCTION_ARGS)
509 {
510 	char	   *inputText = PG_GETARG_CSTRING(0);
511 
512 	PG_RETURN_TEXT_P(cstring_to_text(inputText));
513 }
514 
515 /*
516  *		textout			- converts internal representation to "..."
517  */
518 Datum
textout(PG_FUNCTION_ARGS)519 textout(PG_FUNCTION_ARGS)
520 {
521 	Datum		txt = PG_GETARG_DATUM(0);
522 
523 	PG_RETURN_CSTRING(TextDatumGetCString(txt));
524 }
525 
526 /*
527  *		textrecv			- converts external binary format to text
528  */
529 Datum
textrecv(PG_FUNCTION_ARGS)530 textrecv(PG_FUNCTION_ARGS)
531 {
532 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
533 	text	   *result;
534 	char	   *str;
535 	int			nbytes;
536 
537 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
538 
539 	result = cstring_to_text_with_len(str, nbytes);
540 	pfree(str);
541 	PG_RETURN_TEXT_P(result);
542 }
543 
544 /*
545  *		textsend			- converts text to binary format
546  */
547 Datum
textsend(PG_FUNCTION_ARGS)548 textsend(PG_FUNCTION_ARGS)
549 {
550 	text	   *t = PG_GETARG_TEXT_PP(0);
551 	StringInfoData buf;
552 
553 	pq_begintypsend(&buf);
554 	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
555 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
556 }
557 
558 
559 /*
560  *		unknownin			- converts "..." to internal representation
561  */
562 Datum
unknownin(PG_FUNCTION_ARGS)563 unknownin(PG_FUNCTION_ARGS)
564 {
565 	char	   *str = PG_GETARG_CSTRING(0);
566 
567 	/* representation is same as cstring */
568 	PG_RETURN_CSTRING(pstrdup(str));
569 }
570 
571 /*
572  *		unknownout			- converts internal representation to "..."
573  */
574 Datum
unknownout(PG_FUNCTION_ARGS)575 unknownout(PG_FUNCTION_ARGS)
576 {
577 	/* representation is same as cstring */
578 	char	   *str = PG_GETARG_CSTRING(0);
579 
580 	PG_RETURN_CSTRING(pstrdup(str));
581 }
582 
583 /*
584  *		unknownrecv			- converts external binary format to unknown
585  */
586 Datum
unknownrecv(PG_FUNCTION_ARGS)587 unknownrecv(PG_FUNCTION_ARGS)
588 {
589 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
590 	char	   *str;
591 	int			nbytes;
592 
593 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
594 	/* representation is same as cstring */
595 	PG_RETURN_CSTRING(str);
596 }
597 
598 /*
599  *		unknownsend			- converts unknown to binary format
600  */
601 Datum
unknownsend(PG_FUNCTION_ARGS)602 unknownsend(PG_FUNCTION_ARGS)
603 {
604 	/* representation is same as cstring */
605 	char	   *str = PG_GETARG_CSTRING(0);
606 	StringInfoData buf;
607 
608 	pq_begintypsend(&buf);
609 	pq_sendtext(&buf, str, strlen(str));
610 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
611 }
612 
613 
614 /* ========== PUBLIC ROUTINES ========== */
615 
616 /*
617  * textlen -
618  *	  returns the logical length of a text*
619  *	   (which is less than the VARSIZE of the text*)
620  */
621 Datum
textlen(PG_FUNCTION_ARGS)622 textlen(PG_FUNCTION_ARGS)
623 {
624 	Datum		str = PG_GETARG_DATUM(0);
625 
626 	/* try to avoid decompressing argument */
627 	PG_RETURN_INT32(text_length(str));
628 }
629 
630 /*
631  * text_length -
632  *	Does the real work for textlen()
633  *
634  *	This is broken out so it can be called directly by other string processing
635  *	functions.  Note that the argument is passed as a Datum, to indicate that
636  *	it may still be in compressed form.  We can avoid decompressing it at all
637  *	in some cases.
638  */
639 static int32
text_length(Datum str)640 text_length(Datum str)
641 {
642 	/* fastpath when max encoding length is one */
643 	if (pg_database_encoding_max_length() == 1)
644 		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
645 	else
646 	{
647 		text	   *t = DatumGetTextPP(str);
648 
649 		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
650 											 VARSIZE_ANY_EXHDR(t)));
651 	}
652 }
653 
654 /*
655  * textoctetlen -
656  *	  returns the physical length of a text*
657  *	   (which is less than the VARSIZE of the text*)
658  */
659 Datum
textoctetlen(PG_FUNCTION_ARGS)660 textoctetlen(PG_FUNCTION_ARGS)
661 {
662 	Datum		str = PG_GETARG_DATUM(0);
663 
664 	/* We need not detoast the input at all */
665 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
666 }
667 
668 /*
669  * textcat -
670  *	  takes two text* and returns a text* that is the concatenation of
671  *	  the two.
672  *
673  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
674  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
675  * Allocate space for output in all cases.
676  * XXX - thomas 1997-07-10
677  */
678 Datum
textcat(PG_FUNCTION_ARGS)679 textcat(PG_FUNCTION_ARGS)
680 {
681 	text	   *t1 = PG_GETARG_TEXT_PP(0);
682 	text	   *t2 = PG_GETARG_TEXT_PP(1);
683 
684 	PG_RETURN_TEXT_P(text_catenate(t1, t2));
685 }
686 
687 /*
688  * text_catenate
689  *	Guts of textcat(), broken out so it can be used by other functions
690  *
691  * Arguments can be in short-header form, but not compressed or out-of-line
692  */
693 static text *
text_catenate(text * t1,text * t2)694 text_catenate(text *t1, text *t2)
695 {
696 	text	   *result;
697 	int			len1,
698 				len2,
699 				len;
700 	char	   *ptr;
701 
702 	len1 = VARSIZE_ANY_EXHDR(t1);
703 	len2 = VARSIZE_ANY_EXHDR(t2);
704 
705 	/* paranoia ... probably should throw error instead? */
706 	if (len1 < 0)
707 		len1 = 0;
708 	if (len2 < 0)
709 		len2 = 0;
710 
711 	len = len1 + len2 + VARHDRSZ;
712 	result = (text *) palloc(len);
713 
714 	/* Set size of result string... */
715 	SET_VARSIZE(result, len);
716 
717 	/* Fill data field of result string... */
718 	ptr = VARDATA(result);
719 	if (len1 > 0)
720 		memcpy(ptr, VARDATA_ANY(t1), len1);
721 	if (len2 > 0)
722 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
723 
724 	return result;
725 }
726 
/*
 * charlen_to_bytelen()
 *	Return how many bytes the first n characters at *p occupy.
 *
 * The caller must guarantee that n characters are really there; no
 * terminating null is needed or looked for.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Single-byte encodings: one byte per character, nothing to scan. */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over n characters, one pg_mblen() at a time. */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
752 
753 /*
754  * text_substr()
755  * Return a substring starting at the specified position.
756  * - thomas 1997-12-31
757  *
758  * Input:
759  *	- string
760  *	- starting position (is one-based)
761  *	- string length
762  *
 * If the starting position is zero or less, then return from the start of the string
 *	adjusting the length to be consistent with the "negative start" per SQL.
 * A negative length is rejected with an error; when no length is given
 *	(see text_substr_no_len), the remainder of the string is returned.
766  *
767  * Added multibyte support.
768  * - Tatsuo Ishii 1998-4-21
769  * Changed behavior if starting position is less than one to conform to SQL behavior.
770  * Formerly returned the entire string; now returns a portion.
771  * - Thomas Lockhart 1998-12-10
772  * Now uses faster TOAST-slicing interface
773  * - John Gray 2002-02-22
774  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
775  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
776  * error; if E < 1, return '', not entire string). Fixed MB related bug when
777  * S > LC and < LC + 4 sometimes garbage characters are returned.
778  * - Joe Conway 2002-08-10
779  */
780 Datum
text_substr(PG_FUNCTION_ARGS)781 text_substr(PG_FUNCTION_ARGS)
782 {
783 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
784 									PG_GETARG_INT32(1),
785 									PG_GETARG_INT32(2),
786 									false));
787 }
788 
789 /*
790  * text_substr_no_len -
791  *	  Wrapper to avoid opr_sanity failure due to
792  *	  one function accepting a different number of args.
793  */
794 Datum
text_substr_no_len(PG_FUNCTION_ARGS)795 text_substr_no_len(PG_FUNCTION_ARGS)
796 {
797 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
798 									PG_GETARG_INT32(1),
799 									-1, true));
800 }
801 
802 /*
803  * text_substring -
804  *	Does the real work for text_substr() and text_substr_no_len()
805  *
806  *	This is broken out so it can be called directly by other string processing
807  *	functions.  Note that the argument is passed as a Datum, to indicate that
808  *	it may still be in compressed/toasted form.  We can avoid detoasting all
809  *	of it in some cases.
810  *
811  *	The result is always a freshly palloc'd datum.
812  */
813 static text *
text_substring(Datum str,int32 start,int32 length,bool length_not_specified)814 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
815 {
816 	int32		eml = pg_database_encoding_max_length();
817 	int32		S = start;		/* start position */
818 	int32		S1;				/* adjusted start position */
819 	int32		L1;				/* adjusted substring length */
820 
821 	/* life is easy if the encoding max length is 1 */
822 	if (eml == 1)
823 	{
824 		S1 = Max(S, 1);
825 
826 		if (length_not_specified)	/* special case - get length to end of
827 									 * string */
828 			L1 = -1;
829 		else
830 		{
831 			/* end position */
832 			int			E = S + length;
833 
834 			/*
835 			 * A negative value for L is the only way for the end position to
836 			 * be before the start. SQL99 says to throw an error.
837 			 */
838 			if (E < S)
839 				ereport(ERROR,
840 						(errcode(ERRCODE_SUBSTRING_ERROR),
841 						 errmsg("negative substring length not allowed")));
842 
843 			/*
844 			 * A zero or negative value for the end position can happen if the
845 			 * start was negative or one. SQL99 says to return a zero-length
846 			 * string.
847 			 */
848 			if (E < 1)
849 				return cstring_to_text("");
850 
851 			L1 = E - S1;
852 		}
853 
854 		/*
855 		 * If the start position is past the end of the string, SQL99 says to
856 		 * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
857 		 * that for us. Convert to zero-based starting position
858 		 */
859 		return DatumGetTextPSlice(str, S1 - 1, L1);
860 	}
861 	else if (eml > 1)
862 	{
863 		/*
864 		 * When encoding max length is > 1, we can't get LC without
865 		 * detoasting, so we'll grab a conservatively large slice now and go
866 		 * back later to do the right thing
867 		 */
868 		int32		slice_start;
869 		int32		slice_size;
870 		int32		slice_strlen;
871 		text	   *slice;
872 		int32		E1;
873 		int32		i;
874 		char	   *p;
875 		char	   *s;
876 		text	   *ret;
877 
878 		/*
879 		 * if S is past the end of the string, the tuple toaster will return a
880 		 * zero-length string to us
881 		 */
882 		S1 = Max(S, 1);
883 
884 		/*
885 		 * We need to start at position zero because there is no way to know
886 		 * in advance which byte offset corresponds to the supplied start
887 		 * position.
888 		 */
889 		slice_start = 0;
890 
891 		if (length_not_specified)	/* special case - get length to end of
892 									 * string */
893 			slice_size = L1 = -1;
894 		else
895 		{
896 			int			E = S + length;
897 
898 			/*
899 			 * A negative value for L is the only way for the end position to
900 			 * be before the start. SQL99 says to throw an error.
901 			 */
902 			if (E < S)
903 				ereport(ERROR,
904 						(errcode(ERRCODE_SUBSTRING_ERROR),
905 						 errmsg("negative substring length not allowed")));
906 
907 			/*
908 			 * A zero or negative value for the end position can happen if the
909 			 * start was negative or one. SQL99 says to return a zero-length
910 			 * string.
911 			 */
912 			if (E < 1)
913 				return cstring_to_text("");
914 
915 			/*
916 			 * if E is past the end of the string, the tuple toaster will
917 			 * truncate the length for us
918 			 */
919 			L1 = E - S1;
920 
921 			/*
922 			 * Total slice size in bytes can't be any longer than the start
923 			 * position plus substring length times the encoding max length.
924 			 */
925 			slice_size = (S1 + L1) * eml;
926 		}
927 
928 		/*
929 		 * If we're working with an untoasted source, no need to do an extra
930 		 * copying step.
931 		 */
932 		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
933 			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
934 			slice = DatumGetTextPSlice(str, slice_start, slice_size);
935 		else
936 			slice = (text *) DatumGetPointer(str);
937 
938 		/* see if we got back an empty string */
939 		if (VARSIZE_ANY_EXHDR(slice) == 0)
940 		{
941 			if (slice != (text *) DatumGetPointer(str))
942 				pfree(slice);
943 			return cstring_to_text("");
944 		}
945 
946 		/* Now we can get the actual length of the slice in MB characters */
947 		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
948 											VARSIZE_ANY_EXHDR(slice));
949 
950 		/*
951 		 * Check that the start position wasn't > slice_strlen. If so, SQL99
952 		 * says to return a zero-length string.
953 		 */
954 		if (S1 > slice_strlen)
955 		{
956 			if (slice != (text *) DatumGetPointer(str))
957 				pfree(slice);
958 			return cstring_to_text("");
959 		}
960 
961 		/*
962 		 * Adjust L1 and E1 now that we know the slice string length. Again
963 		 * remember that S1 is one based, and slice_start is zero based.
964 		 */
965 		if (L1 > -1)
966 			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
967 		else
968 			E1 = slice_start + 1 + slice_strlen;
969 
970 		/*
971 		 * Find the start position in the slice; remember S1 is not zero based
972 		 */
973 		p = VARDATA_ANY(slice);
974 		for (i = 0; i < S1 - 1; i++)
975 			p += pg_mblen(p);
976 
977 		/* hang onto a pointer to our start position */
978 		s = p;
979 
980 		/*
981 		 * Count the actual bytes used by the substring of the requested
982 		 * length.
983 		 */
984 		for (i = S1; i < E1; i++)
985 			p += pg_mblen(p);
986 
987 		ret = (text *) palloc(VARHDRSZ + (p - s));
988 		SET_VARSIZE(ret, VARHDRSZ + (p - s));
989 		memcpy(VARDATA(ret), s, (p - s));
990 
991 		if (slice != (text *) DatumGetPointer(str))
992 			pfree(slice);
993 
994 		return ret;
995 	}
996 	else
997 		elog(ERROR, "invalid backend encoding: encoding max length < 1");
998 
999 	/* not reached: suppress compiler warning */
1000 	return NULL;
1001 }
1002 
1003 /*
1004  * textoverlay
1005  *	Replace specified substring of first string with second
1006  *
1007  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1008  * This code is a direct implementation of what the standard says.
1009  */
1010 Datum
textoverlay(PG_FUNCTION_ARGS)1011 textoverlay(PG_FUNCTION_ARGS)
1012 {
1013 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1014 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1015 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1016 	int			sl = PG_GETARG_INT32(3);	/* substring length */
1017 
1018 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1019 }
1020 
1021 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1022 textoverlay_no_len(PG_FUNCTION_ARGS)
1023 {
1024 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1025 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1026 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
1027 	int			sl;
1028 
1029 	sl = text_length(PointerGetDatum(t2));	/* defaults to length(t2) */
1030 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1031 }
1032 
1033 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1034 text_overlay(text *t1, text *t2, int sp, int sl)
1035 {
1036 	text	   *result;
1037 	text	   *s1;
1038 	text	   *s2;
1039 	int			sp_pl_sl;
1040 
1041 	/*
1042 	 * Check for possible integer-overflow cases.  For negative sp, throw a
1043 	 * "substring length" error because that's what should be expected
1044 	 * according to the spec's definition of OVERLAY().
1045 	 */
1046 	if (sp <= 0)
1047 		ereport(ERROR,
1048 				(errcode(ERRCODE_SUBSTRING_ERROR),
1049 				 errmsg("negative substring length not allowed")));
1050 	sp_pl_sl = sp + sl;
1051 	if (sp_pl_sl <= sl)
1052 		ereport(ERROR,
1053 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1054 				 errmsg("integer out of range")));
1055 
1056 	s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1057 	s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1058 	result = text_catenate(s1, t2);
1059 	result = text_catenate(result, s2);
1060 
1061 	return result;
1062 }
1063 
1064 /*
1065  * textpos -
1066  *	  Return the position of the specified substring.
1067  *	  Implements the SQL POSITION() function.
1068  *	  Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1069  * - thomas 1997-07-27
1070  */
1071 Datum
textpos(PG_FUNCTION_ARGS)1072 textpos(PG_FUNCTION_ARGS)
1073 {
1074 	text	   *str = PG_GETARG_TEXT_PP(0);
1075 	text	   *search_str = PG_GETARG_TEXT_PP(1);
1076 
1077 	PG_RETURN_INT32((int32) text_position(str, search_str));
1078 }
1079 
1080 /*
1081  * text_position -
1082  *	Does the real work for textpos()
1083  *
1084  * Inputs:
1085  *		t1 - string to be searched
1086  *		t2 - pattern to match within t1
1087  * Result:
1088  *		Character index of the first matched char, starting from 1,
1089  *		or 0 if no match.
1090  *
1091  *	This is broken out so it can be called directly by other string processing
1092  *	functions.
1093  */
1094 static int
text_position(text * t1,text * t2)1095 text_position(text *t1, text *t2)
1096 {
1097 	TextPositionState state;
1098 	int			result;
1099 
1100 	text_position_setup(t1, t2, &state);
1101 	result = text_position_next(1, &state);
1102 	text_position_cleanup(&state);
1103 	return result;
1104 }
1105 
1106 
1107 /*
1108  * text_position_setup, text_position_next, text_position_cleanup -
1109  *	Component steps of text_position()
1110  *
1111  * These are broken out so that a string can be efficiently searched for
1112  * multiple occurrences of the same pattern.  text_position_next may be
1113  * called multiple times with increasing values of start_pos, which is
1114  * the 1-based character position to start the search from.  The "state"
1115  * variable is normally just a local variable in the caller.
1116  */
1117 
1118 static void
text_position_setup(text * t1,text * t2,TextPositionState * state)1119 text_position_setup(text *t1, text *t2, TextPositionState *state)
1120 {
1121 	int			len1 = VARSIZE_ANY_EXHDR(t1);
1122 	int			len2 = VARSIZE_ANY_EXHDR(t2);
1123 
1124 	if (pg_database_encoding_max_length() == 1)
1125 	{
1126 		/* simple case - single byte encoding */
1127 		state->use_wchar = false;
1128 		state->str1 = VARDATA_ANY(t1);
1129 		state->str2 = VARDATA_ANY(t2);
1130 		state->len1 = len1;
1131 		state->len2 = len2;
1132 	}
1133 	else
1134 	{
1135 		/* not as simple - multibyte encoding */
1136 		pg_wchar   *p1,
1137 				   *p2;
1138 
1139 		p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1140 		len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1141 		p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1142 		len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1143 
1144 		state->use_wchar = true;
1145 		state->wstr1 = p1;
1146 		state->wstr2 = p2;
1147 		state->len1 = len1;
1148 		state->len2 = len2;
1149 	}
1150 
1151 	/*
1152 	 * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1153 	 * notes we use the terminology that the "haystack" is the string to be
1154 	 * searched (t1) and the "needle" is the pattern being sought (t2).
1155 	 *
1156 	 * If the needle is empty or bigger than the haystack then there is no
1157 	 * point in wasting cycles initializing the table.  We also choose not to
1158 	 * use B-M-H for needles of length 1, since the skip table can't possibly
1159 	 * save anything in that case.
1160 	 */
1161 	if (len1 >= len2 && len2 > 1)
1162 	{
1163 		int			searchlength = len1 - len2;
1164 		int			skiptablemask;
1165 		int			last;
1166 		int			i;
1167 
1168 		/*
1169 		 * First we must determine how much of the skip table to use.  The
1170 		 * declaration of TextPositionState allows up to 256 elements, but for
1171 		 * short search problems we don't really want to have to initialize so
1172 		 * many elements --- it would take too long in comparison to the
1173 		 * actual search time.  So we choose a useful skip table size based on
1174 		 * the haystack length minus the needle length.  The closer the needle
1175 		 * length is to the haystack length the less useful skipping becomes.
1176 		 *
1177 		 * Note: since we use bit-masking to select table elements, the skip
1178 		 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1179 		 */
1180 		if (searchlength < 16)
1181 			skiptablemask = 3;
1182 		else if (searchlength < 64)
1183 			skiptablemask = 7;
1184 		else if (searchlength < 128)
1185 			skiptablemask = 15;
1186 		else if (searchlength < 512)
1187 			skiptablemask = 31;
1188 		else if (searchlength < 2048)
1189 			skiptablemask = 63;
1190 		else if (searchlength < 4096)
1191 			skiptablemask = 127;
1192 		else
1193 			skiptablemask = 255;
1194 		state->skiptablemask = skiptablemask;
1195 
1196 		/*
1197 		 * Initialize the skip table.  We set all elements to the needle
1198 		 * length, since this is the correct skip distance for any character
1199 		 * not found in the needle.
1200 		 */
1201 		for (i = 0; i <= skiptablemask; i++)
1202 			state->skiptable[i] = len2;
1203 
1204 		/*
1205 		 * Now examine the needle.  For each character except the last one,
1206 		 * set the corresponding table element to the appropriate skip
1207 		 * distance.  Note that when two characters share the same skip table
1208 		 * entry, the one later in the needle must determine the skip
1209 		 * distance.
1210 		 */
1211 		last = len2 - 1;
1212 
1213 		if (!state->use_wchar)
1214 		{
1215 			const char *str2 = state->str2;
1216 
1217 			for (i = 0; i < last; i++)
1218 				state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1219 		}
1220 		else
1221 		{
1222 			const pg_wchar *wstr2 = state->wstr2;
1223 
1224 			for (i = 0; i < last; i++)
1225 				state->skiptable[wstr2[i] & skiptablemask] = last - i;
1226 		}
1227 	}
1228 }
1229 
1230 static int
text_position_next(int start_pos,TextPositionState * state)1231 text_position_next(int start_pos, TextPositionState *state)
1232 {
1233 	int			haystack_len = state->len1;
1234 	int			needle_len = state->len2;
1235 	int			skiptablemask = state->skiptablemask;
1236 
1237 	Assert(start_pos > 0);		/* else caller error */
1238 
1239 	if (needle_len <= 0)
1240 		return start_pos;		/* result for empty pattern */
1241 
1242 	start_pos--;				/* adjust for zero based arrays */
1243 
1244 	/* Done if the needle can't possibly fit */
1245 	if (haystack_len < start_pos + needle_len)
1246 		return 0;
1247 
1248 	if (!state->use_wchar)
1249 	{
1250 		/* simple case - single byte encoding */
1251 		const char *haystack = state->str1;
1252 		const char *needle = state->str2;
1253 		const char *haystack_end = &haystack[haystack_len];
1254 		const char *hptr;
1255 
1256 		if (needle_len == 1)
1257 		{
1258 			/* No point in using B-M-H for a one-character needle */
1259 			char		nchar = *needle;
1260 
1261 			hptr = &haystack[start_pos];
1262 			while (hptr < haystack_end)
1263 			{
1264 				if (*hptr == nchar)
1265 					return hptr - haystack + 1;
1266 				hptr++;
1267 			}
1268 		}
1269 		else
1270 		{
1271 			const char *needle_last = &needle[needle_len - 1];
1272 
1273 			/* Start at startpos plus the length of the needle */
1274 			hptr = &haystack[start_pos + needle_len - 1];
1275 			while (hptr < haystack_end)
1276 			{
1277 				/* Match the needle scanning *backward* */
1278 				const char *nptr;
1279 				const char *p;
1280 
1281 				nptr = needle_last;
1282 				p = hptr;
1283 				while (*nptr == *p)
1284 				{
1285 					/* Matched it all?	If so, return 1-based position */
1286 					if (nptr == needle)
1287 						return p - haystack + 1;
1288 					nptr--, p--;
1289 				}
1290 
1291 				/*
1292 				 * No match, so use the haystack char at hptr to decide how
1293 				 * far to advance.  If the needle had any occurrence of that
1294 				 * character (or more precisely, one sharing the same
1295 				 * skiptable entry) before its last character, then we advance
1296 				 * far enough to align the last such needle character with
1297 				 * that haystack position.  Otherwise we can advance by the
1298 				 * whole needle length.
1299 				 */
1300 				hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1301 			}
1302 		}
1303 	}
1304 	else
1305 	{
1306 		/* The multibyte char version. This works exactly the same way. */
1307 		const pg_wchar *haystack = state->wstr1;
1308 		const pg_wchar *needle = state->wstr2;
1309 		const pg_wchar *haystack_end = &haystack[haystack_len];
1310 		const pg_wchar *hptr;
1311 
1312 		if (needle_len == 1)
1313 		{
1314 			/* No point in using B-M-H for a one-character needle */
1315 			pg_wchar	nchar = *needle;
1316 
1317 			hptr = &haystack[start_pos];
1318 			while (hptr < haystack_end)
1319 			{
1320 				if (*hptr == nchar)
1321 					return hptr - haystack + 1;
1322 				hptr++;
1323 			}
1324 		}
1325 		else
1326 		{
1327 			const pg_wchar *needle_last = &needle[needle_len - 1];
1328 
1329 			/* Start at startpos plus the length of the needle */
1330 			hptr = &haystack[start_pos + needle_len - 1];
1331 			while (hptr < haystack_end)
1332 			{
1333 				/* Match the needle scanning *backward* */
1334 				const pg_wchar *nptr;
1335 				const pg_wchar *p;
1336 
1337 				nptr = needle_last;
1338 				p = hptr;
1339 				while (*nptr == *p)
1340 				{
1341 					/* Matched it all?	If so, return 1-based position */
1342 					if (nptr == needle)
1343 						return p - haystack + 1;
1344 					nptr--, p--;
1345 				}
1346 
1347 				/*
1348 				 * No match, so use the haystack char at hptr to decide how
1349 				 * far to advance.  If the needle had any occurrence of that
1350 				 * character (or more precisely, one sharing the same
1351 				 * skiptable entry) before its last character, then we advance
1352 				 * far enough to align the last such needle character with
1353 				 * that haystack position.  Otherwise we can advance by the
1354 				 * whole needle length.
1355 				 */
1356 				hptr += state->skiptable[*hptr & skiptablemask];
1357 			}
1358 		}
1359 	}
1360 
1361 	return 0;					/* not found */
1362 }
1363 
1364 static void
text_position_cleanup(TextPositionState * state)1365 text_position_cleanup(TextPositionState *state)
1366 {
1367 	if (state->use_wchar)
1368 	{
1369 		pfree(state->wstr1);
1370 		pfree(state->wstr2);
1371 	}
1372 }
1373 
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 *	to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * arg1/len1 and arg2/len2 are pointer-plus-byte-length pairs; the inputs are
 * not required to be null-terminated (copies are terminated here as needed).
 * collid selects the collation.  An invalid collid that is not recognized as
 * C collation raises ERRCODE_INDETERMINATE_COLLATION.
 *
 * Strings the collation routines report as equal are ordered bytewise, so
 * zero is returned only for bytewise-identical inputs.
 */
int
varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
{
	int			result;

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		/* Bytewise order; on a common prefix the shorter string sorts first */
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		/* Stack buffers, used for the working copies when the input fits */
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;
		/* Stays 0 for the default collation: use the non-"_l" routines */
		pg_locale_t mylocale = 0;

		if (collid != DEFAULT_COLLATION_OID)
		{
			if (!OidIsValid(collid))
			{
				/*
				 * This typically means that the parser could not resolve a
				 * conflict of implicit collations, so report it that way.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INDETERMINATE_COLLATION),
						 errmsg("could not determine which collation to use for string comparison"),
						 errhint("Use the COLLATE clause to set the collation explicitly.")));
			}
			mylocale = pg_newlocale_from_collation(collid);
		}

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 *
		 * Nothing has been allocated yet, so returning directly is safe.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;
			int			a2len;
			int			r;

			/*
			 * Buffer sizes are in bytes and are passed to the conversion as
			 * (size / 2) wide characters.  The palloc'd size allows two bytes
			 * per input byte plus two more for the terminator stored below.
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* MultiByteToWideChar() does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* r is the number of wide characters written; terminate there */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* Clear errno so that %m below reports only this call's error */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/*
			 * In some locales wcscoll() can claim that nonidentical strings
			 * are equal.  Believing that would be bad news for a number of
			 * reasons, so we follow Perl's lead and sort "equal" strings
			 * according to strcmp (on the UTF-8 representation).
			 */
			if (result == 0)
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			/* Free the working copies only if they were palloc'd */
			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/*
		 * Make null-terminated working copies.  A stack buffer is used when
		 * the string plus its terminator fits in TEXTBUFLEN bytes; otherwise
		 * the copy is palloc'd.
		 */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				/* With a UTF-8 database, ICU can compare the inputs as-is */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					/* Otherwise convert both inputs to UChar strings first */
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/*
		 * In some locales strcoll() can claim that nonidentical strings are
		 * equal.  Believing that would be bad news for a number of reasons,
		 * so we follow Perl's lead and sort "equal" strings according to
		 * strcmp().  This tie-break runs on the null-terminated copies, and
		 * it applies to the ICU comparisons above as well.
		 */
		if (result == 0)
			result = strcmp(a1p, a2p);

		/* Free the working copies only if they were palloc'd */
		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1611 
1612 /* text_cmp()
1613  * Internal comparison function for text strings.
1614  * Returns -1, 0 or 1
1615  */
1616 static int
text_cmp(text * arg1,text * arg2,Oid collid)1617 text_cmp(text *arg1, text *arg2, Oid collid)
1618 {
1619 	char	   *a1p,
1620 			   *a2p;
1621 	int			len1,
1622 				len2;
1623 
1624 	a1p = VARDATA_ANY(arg1);
1625 	a2p = VARDATA_ANY(arg2);
1626 
1627 	len1 = VARSIZE_ANY_EXHDR(arg1);
1628 	len2 = VARSIZE_ANY_EXHDR(arg2);
1629 
1630 	return varstr_cmp(a1p, len1, a2p, len2, collid);
1631 }
1632 
1633 /*
1634  * Comparison functions for text strings.
1635  *
1636  * Note: btree indexes need these routines not to leak memory; therefore,
1637  * be careful to free working copies of toasted datums.  Most places don't
1638  * need to be so careful.
1639  */
1640 
1641 Datum
texteq(PG_FUNCTION_ARGS)1642 texteq(PG_FUNCTION_ARGS)
1643 {
1644 	Datum		arg1 = PG_GETARG_DATUM(0);
1645 	Datum		arg2 = PG_GETARG_DATUM(1);
1646 	bool		result;
1647 	Size		len1,
1648 				len2;
1649 
1650 	/*
1651 	 * Since we only care about equality or not-equality, we can avoid all the
1652 	 * expense of strcoll() here, and just do bitwise comparison.  In fact, we
1653 	 * don't even have to do a bitwise comparison if we can show the lengths
1654 	 * of the strings are unequal; which might save us from having to detoast
1655 	 * one or both values.
1656 	 */
1657 	len1 = toast_raw_datum_size(arg1);
1658 	len2 = toast_raw_datum_size(arg2);
1659 	if (len1 != len2)
1660 		result = false;
1661 	else
1662 	{
1663 		text	   *targ1 = DatumGetTextPP(arg1);
1664 		text	   *targ2 = DatumGetTextPP(arg2);
1665 
1666 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1667 						 len1 - VARHDRSZ) == 0);
1668 
1669 		PG_FREE_IF_COPY(targ1, 0);
1670 		PG_FREE_IF_COPY(targ2, 1);
1671 	}
1672 
1673 	PG_RETURN_BOOL(result);
1674 }
1675 
1676 Datum
textne(PG_FUNCTION_ARGS)1677 textne(PG_FUNCTION_ARGS)
1678 {
1679 	Datum		arg1 = PG_GETARG_DATUM(0);
1680 	Datum		arg2 = PG_GETARG_DATUM(1);
1681 	bool		result;
1682 	Size		len1,
1683 				len2;
1684 
1685 	/* See comment in texteq() */
1686 	len1 = toast_raw_datum_size(arg1);
1687 	len2 = toast_raw_datum_size(arg2);
1688 	if (len1 != len2)
1689 		result = true;
1690 	else
1691 	{
1692 		text	   *targ1 = DatumGetTextPP(arg1);
1693 		text	   *targ2 = DatumGetTextPP(arg2);
1694 
1695 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1696 						 len1 - VARHDRSZ) != 0);
1697 
1698 		PG_FREE_IF_COPY(targ1, 0);
1699 		PG_FREE_IF_COPY(targ2, 1);
1700 	}
1701 
1702 	PG_RETURN_BOOL(result);
1703 }
1704 
1705 Datum
text_lt(PG_FUNCTION_ARGS)1706 text_lt(PG_FUNCTION_ARGS)
1707 {
1708 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1709 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1710 	bool		result;
1711 
1712 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1713 
1714 	PG_FREE_IF_COPY(arg1, 0);
1715 	PG_FREE_IF_COPY(arg2, 1);
1716 
1717 	PG_RETURN_BOOL(result);
1718 }
1719 
1720 Datum
text_le(PG_FUNCTION_ARGS)1721 text_le(PG_FUNCTION_ARGS)
1722 {
1723 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1724 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1725 	bool		result;
1726 
1727 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1728 
1729 	PG_FREE_IF_COPY(arg1, 0);
1730 	PG_FREE_IF_COPY(arg2, 1);
1731 
1732 	PG_RETURN_BOOL(result);
1733 }
1734 
1735 Datum
text_gt(PG_FUNCTION_ARGS)1736 text_gt(PG_FUNCTION_ARGS)
1737 {
1738 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1739 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1740 	bool		result;
1741 
1742 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1743 
1744 	PG_FREE_IF_COPY(arg1, 0);
1745 	PG_FREE_IF_COPY(arg2, 1);
1746 
1747 	PG_RETURN_BOOL(result);
1748 }
1749 
1750 Datum
text_ge(PG_FUNCTION_ARGS)1751 text_ge(PG_FUNCTION_ARGS)
1752 {
1753 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1754 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1755 	bool		result;
1756 
1757 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1758 
1759 	PG_FREE_IF_COPY(arg1, 0);
1760 	PG_FREE_IF_COPY(arg2, 1);
1761 
1762 	PG_RETURN_BOOL(result);
1763 }
1764 
1765 Datum
bttextcmp(PG_FUNCTION_ARGS)1766 bttextcmp(PG_FUNCTION_ARGS)
1767 {
1768 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1769 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1770 	int32		result;
1771 
1772 	result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1773 
1774 	PG_FREE_IF_COPY(arg1, 0);
1775 	PG_FREE_IF_COPY(arg2, 1);
1776 
1777 	PG_RETURN_INT32(result);
1778 }
1779 
1780 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1781 bttextsortsupport(PG_FUNCTION_ARGS)
1782 {
1783 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1784 	Oid			collid = ssup->ssup_collation;
1785 	MemoryContext oldcontext;
1786 
1787 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1788 
1789 	/* Use generic string SortSupport */
1790 	varstr_sortsupport(ssup, collid, false);
1791 
1792 	MemoryContextSwitchTo(oldcontext);
1793 
1794 	PG_RETURN_VOID();
1795 }
1796 
1797 /*
1798  * Generic sortsupport interface for character type's operator classes.
1799  * Includes locale support, and support for BpChar semantics (i.e. removing
1800  * trailing spaces before comparison).
1801  *
1802  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1803  * same representation.  Callers that always use the C collation (e.g.
1804  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1805  * this will not work with any other collation, though.
1806  */
1807 void
varstr_sortsupport(SortSupport ssup,Oid collid,bool bpchar)1808 varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
1809 {
1810 	bool		abbreviate = ssup->abbreviate;
1811 	bool		collate_c = false;
1812 	VarStringSortSupport *sss;
1813 	pg_locale_t locale = 0;
1814 
1815 	/*
1816 	 * If possible, set ssup->comparator to a function which can be used to
1817 	 * directly compare two datums.  If we can do this, we'll avoid the
1818 	 * overhead of a trip through the fmgr layer for every comparison, which
1819 	 * can be substantial.
1820 	 *
1821 	 * Most typically, we'll set the comparator to varstrfastcmp_locale, which
1822 	 * uses strcoll() to perform comparisons and knows about the special
1823 	 * requirements of BpChar callers.  However, if LC_COLLATE = C, we can
1824 	 * make things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c,
1825 	 * both of which use memcmp() rather than strcoll().
1826 	 */
1827 	if (lc_collate_is_c(collid))
1828 	{
1829 		if (!bpchar)
1830 			ssup->comparator = varstrfastcmp_c;
1831 		else
1832 			ssup->comparator = bpcharfastcmp_c;
1833 
1834 		collate_c = true;
1835 	}
1836 	else
1837 	{
1838 		/*
1839 		 * We need a collation-sensitive comparison.  To make things faster,
1840 		 * we'll figure out the collation based on the locale id and cache the
1841 		 * result.
1842 		 */
1843 		if (collid != DEFAULT_COLLATION_OID)
1844 		{
1845 			if (!OidIsValid(collid))
1846 			{
1847 				/*
1848 				 * This typically means that the parser could not resolve a
1849 				 * conflict of implicit collations, so report it that way.
1850 				 */
1851 				ereport(ERROR,
1852 						(errcode(ERRCODE_INDETERMINATE_COLLATION),
1853 						 errmsg("could not determine which collation to use for string comparison"),
1854 						 errhint("Use the COLLATE clause to set the collation explicitly.")));
1855 			}
1856 			locale = pg_newlocale_from_collation(collid);
1857 		}
1858 
1859 		/*
1860 		 * There is a further exception on Windows.  When the database
1861 		 * encoding is UTF-8 and we are not using the C collation, complex
1862 		 * hacks are required.  We don't currently have a comparator that
1863 		 * handles that case, so we fall back on the slow method of having the
1864 		 * sort code invoke bttextcmp() (in the case of text) via the fmgr
1865 		 * trampoline.  ICU locales work just the same on Windows, however.
1866 		 */
1867 #ifdef WIN32
1868 		if (GetDatabaseEncoding() == PG_UTF8 &&
1869 			!(locale && locale->provider == COLLPROVIDER_ICU))
1870 			return;
1871 #endif
1872 
1873 		ssup->comparator = varstrfastcmp_locale;
1874 	}
1875 
1876 	/*
1877 	 * Unfortunately, it seems that abbreviation for non-C collations is
1878 	 * broken on many common platforms; testing of multiple versions of glibc
1879 	 * reveals that, for many locales, strcoll() and strxfrm() do not return
1880 	 * consistent results, which is fatal to this optimization.  While no
1881 	 * other libc other than Cygwin has so far been shown to have a problem,
1882 	 * we take the conservative course of action for right now and disable
1883 	 * this categorically.  (Users who are certain this isn't a problem on
1884 	 * their system can define TRUST_STRXFRM.)
1885 	 *
1886 	 * Even apart from the risk of broken locales, it's possible that there
1887 	 * are platforms where the use of abbreviated keys should be disabled at
1888 	 * compile time.  Having only 4 byte datums could make worst-case
1889 	 * performance drastically more likely, for example.  Moreover, macOS's
1890 	 * strxfrm() implementation is known to not effectively concentrate a
1891 	 * significant amount of entropy from the original string in earlier
1892 	 * transformed blobs.  It's possible that other supported platforms are
1893 	 * similarly encumbered.  So, if we ever get past disabling this
1894 	 * categorically, we may still want or need to disable it for particular
1895 	 * platforms.
1896 	 */
1897 #ifndef TRUST_STRXFRM
1898 	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
1899 		abbreviate = false;
1900 #endif
1901 
1902 	/*
1903 	 * If we're using abbreviated keys, or if we're using a locale-aware
1904 	 * comparison, we need to initialize a StringSortSupport object.  Both
1905 	 * cases will make use of the temporary buffers we initialize here for
1906 	 * scratch space (and to detect requirement for BpChar semantics from
1907 	 * caller), and the abbreviation case requires additional state.
1908 	 */
1909 	if (abbreviate || !collate_c)
1910 	{
1911 		sss = palloc(sizeof(VarStringSortSupport));
1912 		sss->buf1 = palloc(TEXTBUFLEN);
1913 		sss->buflen1 = TEXTBUFLEN;
1914 		sss->buf2 = palloc(TEXTBUFLEN);
1915 		sss->buflen2 = TEXTBUFLEN;
1916 		/* Start with invalid values */
1917 		sss->last_len1 = -1;
1918 		sss->last_len2 = -1;
1919 		/* Initialize */
1920 		sss->last_returned = 0;
1921 		sss->locale = locale;
1922 
1923 		/*
1924 		 * To avoid somehow confusing a strxfrm() blob and an original string,
1925 		 * constantly keep track of the variety of data that buf1 and buf2
1926 		 * currently contain.
1927 		 *
1928 		 * Comparisons may be interleaved with conversion calls.  Frequently,
1929 		 * conversions and comparisons are batched into two distinct phases,
1930 		 * but the correctness of caching cannot hinge upon this.  For
1931 		 * comparison caching, buffer state is only trusted if cache_blob is
1932 		 * found set to false, whereas strxfrm() caching only trusts the state
1933 		 * when cache_blob is found set to true.
1934 		 *
1935 		 * Arbitrarily initialize cache_blob to true.
1936 		 */
1937 		sss->cache_blob = true;
1938 		sss->collate_c = collate_c;
1939 		sss->bpchar = bpchar;
1940 		ssup->ssup_extra = sss;
1941 
1942 		/*
1943 		 * If possible, plan to use the abbreviated keys optimization.  The
1944 		 * core code may switch back to authoritative comparator should
1945 		 * abbreviation be aborted.
1946 		 */
1947 		if (abbreviate)
1948 		{
1949 			sss->prop_card = 0.20;
1950 			initHyperLogLog(&sss->abbr_card, 10);
1951 			initHyperLogLog(&sss->full_card, 10);
1952 			ssup->abbrev_full_comparator = ssup->comparator;
1953 			ssup->comparator = varstrcmp_abbrev;
1954 			ssup->abbrev_converter = varstr_abbrev_convert;
1955 			ssup->abbrev_abort = varstr_abbrev_abort;
1956 		}
1957 	}
1958 }
1959 
1960 /*
1961  * sortsupport comparison func (for C locale case)
1962  */
1963 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)1964 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1965 {
1966 	VarString  *arg1 = DatumGetVarStringPP(x);
1967 	VarString  *arg2 = DatumGetVarStringPP(y);
1968 	char	   *a1p,
1969 			   *a2p;
1970 	int			len1,
1971 				len2,
1972 				result;
1973 
1974 	a1p = VARDATA_ANY(arg1);
1975 	a2p = VARDATA_ANY(arg2);
1976 
1977 	len1 = VARSIZE_ANY_EXHDR(arg1);
1978 	len2 = VARSIZE_ANY_EXHDR(arg2);
1979 
1980 	result = memcmp(a1p, a2p, Min(len1, len2));
1981 	if ((result == 0) && (len1 != len2))
1982 		result = (len1 < len2) ? -1 : 1;
1983 
1984 	/* We can't afford to leak memory here. */
1985 	if (PointerGetDatum(arg1) != x)
1986 		pfree(arg1);
1987 	if (PointerGetDatum(arg2) != y)
1988 		pfree(arg2);
1989 
1990 	return result;
1991 }
1992 
1993 /*
1994  * sortsupport comparison func (for BpChar C locale case)
1995  *
1996  * BpChar outsources its sortsupport to this module.  Specialization for the
1997  * varstr_sortsupport BpChar case, modeled on
1998  * internal_bpchar_pattern_compare().
1999  */
2000 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)2001 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2002 {
2003 	BpChar	   *arg1 = DatumGetBpCharPP(x);
2004 	BpChar	   *arg2 = DatumGetBpCharPP(y);
2005 	char	   *a1p,
2006 			   *a2p;
2007 	int			len1,
2008 				len2,
2009 				result;
2010 
2011 	a1p = VARDATA_ANY(arg1);
2012 	a2p = VARDATA_ANY(arg2);
2013 
2014 	len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2015 	len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2016 
2017 	result = memcmp(a1p, a2p, Min(len1, len2));
2018 	if ((result == 0) && (len1 != len2))
2019 		result = (len1 < len2) ? -1 : 1;
2020 
2021 	/* We can't afford to leak memory here. */
2022 	if (PointerGetDatum(arg1) != x)
2023 		pfree(arg1);
2024 	if (PointerGetDatum(arg2) != y)
2025 		pfree(arg2);
2026 
2027 	return result;
2028 }
2029 
/*
 * sortsupport comparison func (for locale case)
 *
 * Compares the two string datums x and y using the collation described by
 * the VarStringSortSupport state in ssup->ssup_extra: ICU routines when
 * sss->locale's provider is COLLPROVIDER_ICU, strcoll_l() for any other
 * provider when sss->locale is set, and plain strcoll() when sss->locale is
 * NULL.  Strings the collation reports as equal are ordered by strcmp(), so
 * zero is returned only for strings whose NUL-terminated copies are
 * identical.
 *
 * Side effects: sss->buf1/buf2 (and their recorded lengths) are updated to
 * hold NUL-terminated copies of the inputs, growing the buffers if needed,
 * and the outcome is remembered in sss->last_returned so that a repeat
 * comparison of the same pair can skip the collation call.
 */
static int
varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
{
	VarString  *arg1 = DatumGetVarStringPP(x);
	VarString  *arg2 = DatumGetVarStringPP(y);
	bool		arg1_match;
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;

	/* working state */
	char	   *a1p,
			   *a2p;
	int			len1,
				len2,
				result;

	a1p = VARDATA_ANY(arg1);
	a2p = VARDATA_ANY(arg2);

	len1 = VARSIZE_ANY_EXHDR(arg1);
	len2 = VARSIZE_ANY_EXHDR(arg2);

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		result = 0;
		goto done;
	}

	if (sss->bpchar)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each buffer can hold its string plus a terminating NUL.  The
	 * old contents are discarded, not copied; the new size is at least
	 * len + 1, and otherwise double the old size capped at MaxAllocSize.
	 */
	if (len1 >= sss->buflen1)
	{
		pfree(sss->buf1);
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		pfree(sss->buf2);
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 *
	 * The cached answer is only trusted when cache_blob is false.  This
	 * function clears that flag after each real comparison, while
	 * varstr_abbrev_convert() sets it after filling buf2 with a transformed
	 * blob instead of a copy of a second string.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/* Use result cached following last actual strcoll() call */
		result = sss->last_returned;
		goto done;
	}

	if (sss->locale)
	{
		if (sss->locale->provider == COLLPROVIDER_ICU)
		{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
			/* UTF8 input can be handed to ICU directly, by pointer/length */
			if (GetDatabaseEncoding() == PG_UTF8)
			{
				UErrorCode	status;

				status = U_ZERO_ERROR;
				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
										  a1p, len1,
										  a2p, len2,
										  &status);
				if (U_FAILURE(status))
					ereport(ERROR,
							(errmsg("collation failed: %s", u_errorName(status))));
			}
			else
#endif
			{
				/* Otherwise convert both strings to UChar for ucol_strcoll() */
				int32_t		ulen1,
							ulen2;
				UChar	   *uchar1,
						   *uchar2;

				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
				ulen2 = icu_to_uchar(&uchar2, a2p, len2);

				result = ucol_strcoll(sss->locale->info.icu.ucol,
									  uchar1, ulen1,
									  uchar2, ulen2);

				pfree(uchar1);
				pfree(uchar2);
			}
#else							/* not USE_ICU */
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif							/* not USE_ICU */
		}
		else
		{
#ifdef HAVE_LOCALE_T
			/* Uses the NUL-terminated copies made above */
			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
		}
	}
	else
		result = strcoll(sss->buf1, sss->buf2);

	/*
	 * In some locales strcoll() can claim that nonidentical strings are
	 * equal. Believing that would be bad news for a number of reasons, so we
	 * follow Perl's lead and sort "equal" strings according to strcmp().
	 */
	if (result == 0)
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
done:
	/* We can't afford to leak memory here. */
	if (PointerGetDatum(arg1) != x)
		pfree(arg1);
	if (PointerGetDatum(arg2) != y)
		pfree(arg2);

	return result;
}
2207 
2208 /*
2209  * Abbreviated key comparison func
2210  */
2211 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2212 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2213 {
2214 	/*
2215 	 * When 0 is returned, the core system will call varstrfastcmp_c()
2216 	 * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale().  Even a
2217 	 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2218 	 * authoritatively, for the same reason that there is a strcoll()
2219 	 * tie-breaker call to strcmp() in varstr_cmp().
2220 	 */
2221 	if (x > y)
2222 		return 1;
2223 	else if (x == y)
2224 		return 0;
2225 	else
2226 		return -1;
2227 }
2228 
2229 /*
2230  * Conversion routine for sortsupport.  Converts original to abbreviated key
2231  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2232  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2233  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2234  * locale is used, or in case of bytea, just memcpy() from original instead.
2235  */
2236 static Datum
varstr_abbrev_convert(Datum original,SortSupport ssup)2237 varstr_abbrev_convert(Datum original, SortSupport ssup)
2238 {
2239 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2240 	VarString  *authoritative = DatumGetVarStringPP(original);
2241 	char	   *authoritative_data = VARDATA_ANY(authoritative);
2242 
2243 	/* working state */
2244 	Datum		res;
2245 	char	   *pres;
2246 	int			len;
2247 	uint32		hash;
2248 
2249 	pres = (char *) &res;
2250 	/* memset(), so any non-overwritten bytes are NUL */
2251 	memset(pres, 0, sizeof(Datum));
2252 	len = VARSIZE_ANY_EXHDR(authoritative);
2253 
2254 	/* Get number of bytes, ignoring trailing spaces */
2255 	if (sss->bpchar)
2256 		len = bpchartruelen(authoritative_data, len);
2257 
2258 	/*
2259 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2260 	 * abbreviate keys.  The full comparator for the C locale is always
2261 	 * memcmp().  It would be incorrect to allow bytea callers (callers that
2262 	 * always force the C collation -- bytea isn't a collatable type, but this
2263 	 * approach is convenient) to use strxfrm().  This is because bytea
2264 	 * strings may contain NUL bytes.  Besides, this should be faster, too.
2265 	 *
2266 	 * More generally, it's okay that bytea callers can have NUL bytes in
2267 	 * strings because varstrcmp_abbrev() need not make a distinction between
2268 	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2269 	 * authoritative representation.  Hopefully a comparison at or past one
2270 	 * abbreviated key's terminating NUL byte will resolve the comparison
2271 	 * without consulting the authoritative representation; specifically, some
2272 	 * later non-NUL byte in the longer string can resolve the comparison
2273 	 * against a subsequent terminating NUL in the shorter string.  There will
2274 	 * usually be what is effectively a "length-wise" resolution there and
2275 	 * then.
2276 	 *
2277 	 * If that doesn't work out -- if all bytes in the longer string
2278 	 * positioned at or past the offset of the smaller string's (first)
2279 	 * terminating NUL are actually representative of NUL bytes in the
2280 	 * authoritative binary string (perhaps with some *terminating* NUL bytes
2281 	 * towards the end of the longer string iff it happens to still be small)
2282 	 * -- then an authoritative tie-breaker will happen, and do the right
2283 	 * thing: explicitly consider string length.
2284 	 */
2285 	if (sss->collate_c)
2286 		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2287 	else
2288 	{
2289 		Size		bsize;
2290 #ifdef USE_ICU
2291 		int32_t		ulen = -1;
2292 		UChar	   *uchar = NULL;
2293 #endif
2294 
2295 		/*
2296 		 * We're not using the C collation, so fall back on strxfrm or ICU
2297 		 * analogs.
2298 		 */
2299 
2300 		/* By convention, we use buffer 1 to store and NUL-terminate */
2301 		if (len >= sss->buflen1)
2302 		{
2303 			pfree(sss->buf1);
2304 			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2305 			sss->buf1 = palloc(sss->buflen1);
2306 		}
2307 
2308 		/* Might be able to reuse strxfrm() blob from last call */
2309 		if (sss->last_len1 == len && sss->cache_blob &&
2310 			memcmp(sss->buf1, authoritative_data, len) == 0)
2311 		{
2312 			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2313 			/* No change affecting cardinality, so no hashing required */
2314 			goto done;
2315 		}
2316 
2317 		memcpy(sss->buf1, authoritative_data, len);
2318 
2319 		/*
2320 		 * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2321 		 * necessary for ICU, but doesn't hurt.
2322 		 */
2323 		sss->buf1[len] = '\0';
2324 		sss->last_len1 = len;
2325 
2326 #ifdef USE_ICU
2327 		/* When using ICU and not UTF8, convert string to UChar. */
2328 		if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2329 			GetDatabaseEncoding() != PG_UTF8)
2330 			ulen = icu_to_uchar(&uchar, sss->buf1, len);
2331 #endif
2332 
2333 		/*
2334 		 * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2335 		 * and try again.  Both of these functions have the result buffer
2336 		 * content undefined if the result did not fit, so we need to retry
2337 		 * until everything fits, even though we only need the first few bytes
2338 		 * in the end.  When using ucol_nextSortKeyPart(), however, we only
2339 		 * ask for as many bytes as we actually need.
2340 		 */
2341 		for (;;)
2342 		{
2343 #ifdef USE_ICU
2344 			if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2345 			{
2346 				/*
2347 				 * When using UTF8, use the iteration interface so we only
2348 				 * need to produce as many bytes as we actually need.
2349 				 */
2350 				if (GetDatabaseEncoding() == PG_UTF8)
2351 				{
2352 					UCharIterator iter;
2353 					uint32_t	state[2];
2354 					UErrorCode	status;
2355 
2356 					uiter_setUTF8(&iter, sss->buf1, len);
2357 					state[0] = state[1] = 0;	/* won't need that again */
2358 					status = U_ZERO_ERROR;
2359 					bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2360 												 &iter,
2361 												 state,
2362 												 (uint8_t *) sss->buf2,
2363 												 Min(sizeof(Datum), sss->buflen2),
2364 												 &status);
2365 					if (U_FAILURE(status))
2366 						ereport(ERROR,
2367 								(errmsg("sort key generation failed: %s",
2368 										u_errorName(status))));
2369 				}
2370 				else
2371 					bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2372 											uchar, ulen,
2373 											(uint8_t *) sss->buf2, sss->buflen2);
2374 			}
2375 			else
2376 #endif
2377 #ifdef HAVE_LOCALE_T
2378 			if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2379 				bsize = strxfrm_l(sss->buf2, sss->buf1,
2380 								  sss->buflen2, sss->locale->info.lt);
2381 			else
2382 #endif
2383 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2384 
2385 			sss->last_len2 = bsize;
2386 			if (bsize < sss->buflen2)
2387 				break;
2388 
2389 			/*
2390 			 * Grow buffer and retry.
2391 			 */
2392 			pfree(sss->buf2);
2393 			sss->buflen2 = Max(bsize + 1,
2394 							   Min(sss->buflen2 * 2, MaxAllocSize));
2395 			sss->buf2 = palloc(sss->buflen2);
2396 		}
2397 
2398 		/*
2399 		 * Every Datum byte is always compared.  This is safe because the
2400 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
2401 		 * misinterpreting any NUL bytes not intended to be interpreted as
2402 		 * logically representing termination.
2403 		 *
2404 		 * (Actually, even if there were NUL bytes in the blob it would be
2405 		 * okay.  See remarks on bytea case above.)
2406 		 */
2407 		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2408 
2409 #ifdef USE_ICU
2410 		if (uchar)
2411 			pfree(uchar);
2412 #endif
2413 	}
2414 
2415 	/*
2416 	 * Maintain approximate cardinality of both abbreviated keys and original,
2417 	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
2418 	 * the worst case, where we do many string transformations for no saving
2419 	 * in full strcoll()-based comparisons.  These statistics are used by
2420 	 * varstr_abbrev_abort().
2421 	 *
2422 	 * First, Hash key proper, or a significant fraction of it.  Mix in length
2423 	 * in order to compensate for cases where differences are past
2424 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2425 	 */
2426 	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2427 								   Min(len, PG_CACHE_LINE_SIZE)));
2428 
2429 	if (len > PG_CACHE_LINE_SIZE)
2430 		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2431 
2432 	addHyperLogLog(&sss->full_card, hash);
2433 
2434 	/* Hash abbreviated key */
2435 #if SIZEOF_DATUM == 8
2436 	{
2437 		uint32		lohalf,
2438 					hihalf;
2439 
2440 		lohalf = (uint32) res;
2441 		hihalf = (uint32) (res >> 32);
2442 		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2443 	}
2444 #else							/* SIZEOF_DATUM != 8 */
2445 	hash = DatumGetUInt32(hash_uint32((uint32) res));
2446 #endif
2447 
2448 	addHyperLogLog(&sss->abbr_card, hash);
2449 
2450 	/* Cache result, perhaps saving an expensive strxfrm() call next time */
2451 	sss->cache_blob = true;
2452 done:
2453 
2454 	/*
2455 	 * Byteswap on little-endian machines.
2456 	 *
2457 	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2458 	 * comparator) works correctly on all platforms.  If we didn't do this,
2459 	 * the comparator would have to call memcmp() with a pair of pointers to
2460 	 * the first byte of each abbreviated key, which is slower.
2461 	 */
2462 	res = DatumBigEndianToNative(res);
2463 
2464 	/* Don't leak memory here */
2465 	if (PointerGetDatum(authoritative) != original)
2466 		pfree(authoritative);
2467 
2468 	return res;
2469 }
2470 
2471 /*
2472  * Callback for estimating effectiveness of abbreviated key optimization, using
2473  * heuristic rules.  Returns value indicating if the abbreviation optimization
2474  * should be aborted, based on its projected effectiveness.
2475  */
2476 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2477 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2478 {
2479 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2480 	double		abbrev_distinct,
2481 				key_distinct;
2482 
2483 	Assert(ssup->abbreviate);
2484 
2485 	/* Have a little patience */
2486 	if (memtupcount < 100)
2487 		return false;
2488 
2489 	abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2490 	key_distinct = estimateHyperLogLog(&sss->full_card);
2491 
2492 	/*
2493 	 * Clamp cardinality estimates to at least one distinct value.  While
2494 	 * NULLs are generally disregarded, if only NULL values were seen so far,
2495 	 * that might misrepresent costs if we failed to clamp.
2496 	 */
2497 	if (abbrev_distinct <= 1.0)
2498 		abbrev_distinct = 1.0;
2499 
2500 	if (key_distinct <= 1.0)
2501 		key_distinct = 1.0;
2502 
2503 	/*
2504 	 * In the worst case all abbreviated keys are identical, while at the same
2505 	 * time there are differences within full key strings not captured in
2506 	 * abbreviations.
2507 	 */
2508 #ifdef TRACE_SORT
2509 	if (trace_sort)
2510 	{
2511 		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2512 
2513 		elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2514 			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2515 			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2516 			 sss->prop_card);
2517 	}
2518 #endif
2519 
2520 	/*
2521 	 * If the number of distinct abbreviated keys approximately matches the
2522 	 * number of distinct authoritative original keys, that's reason enough to
2523 	 * proceed.  We can win even with a very low cardinality set if most
2524 	 * tie-breakers only memcmp().  This is by far the most important
2525 	 * consideration.
2526 	 *
2527 	 * While comparisons that are resolved at the abbreviated key level are
2528 	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2529 	 * those two outcomes are so much cheaper than a full strcoll() once
2530 	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2531 	 * cardinality against the overall size of the set in order to more
2532 	 * accurately model costs.  Assume that an abbreviated comparison, and an
2533 	 * abbreviated comparison with a cheap memcmp()-based authoritative
2534 	 * resolution are equivalent.
2535 	 */
2536 	if (abbrev_distinct > key_distinct * sss->prop_card)
2537 	{
2538 		/*
2539 		 * When we have exceeded 10,000 tuples, decay required cardinality
2540 		 * aggressively for next call.
2541 		 *
2542 		 * This is useful because the number of comparisons required on
2543 		 * average increases at a linearithmic rate, and at roughly 10,000
2544 		 * tuples that factor will start to dominate over the linear costs of
2545 		 * string transformation (this is a conservative estimate).  The decay
2546 		 * rate is chosen to be a little less aggressive than halving -- which
2547 		 * (since we're called at points at which memtupcount has doubled)
2548 		 * would never see the cost model actually abort past the first call
2549 		 * following a decay.  This decay rate is mostly a precaution against
2550 		 * a sudden, violent swing in how well abbreviated cardinality tracks
2551 		 * full key cardinality.  The decay also serves to prevent a marginal
2552 		 * case from being aborted too late, when too much has already been
2553 		 * invested in string transformation.
2554 		 *
2555 		 * It's possible for sets of several million distinct strings with
2556 		 * mere tens of thousands of distinct abbreviated keys to still
2557 		 * benefit very significantly.  This will generally occur provided
2558 		 * each abbreviated key is a proxy for a roughly uniform number of the
2559 		 * set's full keys. If it isn't so, we hope to catch that early and
2560 		 * abort.  If it isn't caught early, by the time the problem is
2561 		 * apparent it's probably not worth aborting.
2562 		 */
2563 		if (memtupcount > 10000)
2564 			sss->prop_card *= 0.65;
2565 
2566 		return false;
2567 	}
2568 
2569 	/*
2570 	 * Abort abbreviation strategy.
2571 	 *
2572 	 * The worst case, where all abbreviated keys are identical while all
2573 	 * original strings differ will typically only see a regression of about
2574 	 * 10% in execution time for small to medium sized lists of strings.
2575 	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2576 	 * often expect very large improvements, particularly with sets of strings
2577 	 * of moderately high to high abbreviated cardinality.  There is little to
2578 	 * lose but much to gain, which our strategy reflects.
2579 	 */
2580 #ifdef TRACE_SORT
2581 	if (trace_sort)
2582 		elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2583 			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2584 			 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2585 #endif
2586 
2587 	return true;
2588 }
2589 
2590 Datum
text_larger(PG_FUNCTION_ARGS)2591 text_larger(PG_FUNCTION_ARGS)
2592 {
2593 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2594 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2595 	text	   *result;
2596 
2597 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2598 
2599 	PG_RETURN_TEXT_P(result);
2600 }
2601 
2602 Datum
text_smaller(PG_FUNCTION_ARGS)2603 text_smaller(PG_FUNCTION_ARGS)
2604 {
2605 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2606 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2607 	text	   *result;
2608 
2609 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2610 
2611 	PG_RETURN_TEXT_P(result);
2612 }
2613 
2614 
2615 /*
2616  * The following operators support character-by-character comparison
2617  * of text datums, to allow building indexes suitable for LIKE clauses.
2618  * Note that the regular texteq/textne comparison operators, and regular
2619  * support functions 1 and 2 with "C" collation are assumed to be
2620  * compatible with these!
2621  */
2622 
2623 static int
internal_text_pattern_compare(text * arg1,text * arg2)2624 internal_text_pattern_compare(text *arg1, text *arg2)
2625 {
2626 	int			result;
2627 	int			len1,
2628 				len2;
2629 
2630 	len1 = VARSIZE_ANY_EXHDR(arg1);
2631 	len2 = VARSIZE_ANY_EXHDR(arg2);
2632 
2633 	result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2634 	if (result != 0)
2635 		return result;
2636 	else if (len1 < len2)
2637 		return -1;
2638 	else if (len1 > len2)
2639 		return 1;
2640 	else
2641 		return 0;
2642 }
2643 
2644 
2645 Datum
text_pattern_lt(PG_FUNCTION_ARGS)2646 text_pattern_lt(PG_FUNCTION_ARGS)
2647 {
2648 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2649 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2650 	int			result;
2651 
2652 	result = internal_text_pattern_compare(arg1, arg2);
2653 
2654 	PG_FREE_IF_COPY(arg1, 0);
2655 	PG_FREE_IF_COPY(arg2, 1);
2656 
2657 	PG_RETURN_BOOL(result < 0);
2658 }
2659 
2660 
2661 Datum
text_pattern_le(PG_FUNCTION_ARGS)2662 text_pattern_le(PG_FUNCTION_ARGS)
2663 {
2664 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2665 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2666 	int			result;
2667 
2668 	result = internal_text_pattern_compare(arg1, arg2);
2669 
2670 	PG_FREE_IF_COPY(arg1, 0);
2671 	PG_FREE_IF_COPY(arg2, 1);
2672 
2673 	PG_RETURN_BOOL(result <= 0);
2674 }
2675 
2676 
2677 Datum
text_pattern_ge(PG_FUNCTION_ARGS)2678 text_pattern_ge(PG_FUNCTION_ARGS)
2679 {
2680 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2681 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2682 	int			result;
2683 
2684 	result = internal_text_pattern_compare(arg1, arg2);
2685 
2686 	PG_FREE_IF_COPY(arg1, 0);
2687 	PG_FREE_IF_COPY(arg2, 1);
2688 
2689 	PG_RETURN_BOOL(result >= 0);
2690 }
2691 
2692 
2693 Datum
text_pattern_gt(PG_FUNCTION_ARGS)2694 text_pattern_gt(PG_FUNCTION_ARGS)
2695 {
2696 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2697 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2698 	int			result;
2699 
2700 	result = internal_text_pattern_compare(arg1, arg2);
2701 
2702 	PG_FREE_IF_COPY(arg1, 0);
2703 	PG_FREE_IF_COPY(arg2, 1);
2704 
2705 	PG_RETURN_BOOL(result > 0);
2706 }
2707 
2708 
2709 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)2710 bttext_pattern_cmp(PG_FUNCTION_ARGS)
2711 {
2712 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2713 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2714 	int			result;
2715 
2716 	result = internal_text_pattern_compare(arg1, arg2);
2717 
2718 	PG_FREE_IF_COPY(arg1, 0);
2719 	PG_FREE_IF_COPY(arg2, 1);
2720 
2721 	PG_RETURN_INT32(result);
2722 }
2723 
2724 
2725 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)2726 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2727 {
2728 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2729 	MemoryContext oldcontext;
2730 
2731 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2732 
2733 	/* Use generic string SortSupport, forcing "C" collation */
2734 	varstr_sortsupport(ssup, C_COLLATION_OID, false);
2735 
2736 	MemoryContextSwitchTo(oldcontext);
2737 
2738 	PG_RETURN_VOID();
2739 }
2740 
2741 
2742 /*-------------------------------------------------------------
2743  * byteaoctetlen
2744  *
2745  * get the number of bytes contained in an instance of type 'bytea'
2746  *-------------------------------------------------------------
2747  */
2748 Datum
byteaoctetlen(PG_FUNCTION_ARGS)2749 byteaoctetlen(PG_FUNCTION_ARGS)
2750 {
2751 	Datum		str = PG_GETARG_DATUM(0);
2752 
2753 	/* We need not detoast the input at all */
2754 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2755 }
2756 
2757 /*
2758  * byteacat -
2759  *	  takes two bytea* and returns a bytea* that is the concatenation of
2760  *	  the two.
2761  *
2762  * Cloned from textcat and modified as required.
2763  */
2764 Datum
byteacat(PG_FUNCTION_ARGS)2765 byteacat(PG_FUNCTION_ARGS)
2766 {
2767 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2768 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2769 
2770 	PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2771 }
2772 
2773 /*
2774  * bytea_catenate
2775  *	Guts of byteacat(), broken out so it can be used by other functions
2776  *
2777  * Arguments can be in short-header form, but not compressed or out-of-line
2778  */
2779 static bytea *
bytea_catenate(bytea * t1,bytea * t2)2780 bytea_catenate(bytea *t1, bytea *t2)
2781 {
2782 	bytea	   *result;
2783 	int			len1,
2784 				len2,
2785 				len;
2786 	char	   *ptr;
2787 
2788 	len1 = VARSIZE_ANY_EXHDR(t1);
2789 	len2 = VARSIZE_ANY_EXHDR(t2);
2790 
2791 	/* paranoia ... probably should throw error instead? */
2792 	if (len1 < 0)
2793 		len1 = 0;
2794 	if (len2 < 0)
2795 		len2 = 0;
2796 
2797 	len = len1 + len2 + VARHDRSZ;
2798 	result = (bytea *) palloc(len);
2799 
2800 	/* Set size of result string... */
2801 	SET_VARSIZE(result, len);
2802 
2803 	/* Fill data field of result string... */
2804 	ptr = VARDATA(result);
2805 	if (len1 > 0)
2806 		memcpy(ptr, VARDATA_ANY(t1), len1);
2807 	if (len2 > 0)
2808 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2809 
2810 	return result;
2811 }
2812 
/*
 * PG_STR_GET_BYTEA
 *	  Convert a C string to a bytea pointer by passing it through byteain().
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2815 
2816 /*
2817  * bytea_substr()
2818  * Return a substring starting at the specified position.
2819  * Cloned from text_substr and modified as required.
2820  *
2821  * Input:
2822  *	- string
2823  *	- starting position (is one-based)
2824  *	- string length (optional)
2825  *
2826  * If the starting position is zero or less, then return from the start of the string
2827  * adjusting the length to be consistent with the "negative start" per SQL.
2828  * If the length is less than zero, an ERROR is thrown. If no third argument
2829  * (length) is provided, the length to the end of the string is assumed.
2830  */
2831 Datum
bytea_substr(PG_FUNCTION_ARGS)2832 bytea_substr(PG_FUNCTION_ARGS)
2833 {
2834 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2835 									  PG_GETARG_INT32(1),
2836 									  PG_GETARG_INT32(2),
2837 									  false));
2838 }
2839 
2840 /*
2841  * bytea_substr_no_len -
2842  *	  Wrapper to avoid opr_sanity failure due to
2843  *	  one function accepting a different number of args.
2844  */
2845 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)2846 bytea_substr_no_len(PG_FUNCTION_ARGS)
2847 {
2848 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2849 									  PG_GETARG_INT32(1),
2850 									  -1,
2851 									  true));
2852 }
2853 
2854 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)2855 bytea_substring(Datum str,
2856 				int S,
2857 				int L,
2858 				bool length_not_specified)
2859 {
2860 	int			S1;				/* adjusted start position */
2861 	int			L1;				/* adjusted substring length */
2862 
2863 	S1 = Max(S, 1);
2864 
2865 	if (length_not_specified)
2866 	{
2867 		/*
2868 		 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2869 		 * end of the string if we pass it a negative value for length.
2870 		 */
2871 		L1 = -1;
2872 	}
2873 	else
2874 	{
2875 		/* end position */
2876 		int			E = S + L;
2877 
2878 		/*
2879 		 * A negative value for L is the only way for the end position to be
2880 		 * before the start. SQL99 says to throw an error.
2881 		 */
2882 		if (E < S)
2883 			ereport(ERROR,
2884 					(errcode(ERRCODE_SUBSTRING_ERROR),
2885 					 errmsg("negative substring length not allowed")));
2886 
2887 		/*
2888 		 * A zero or negative value for the end position can happen if the
2889 		 * start was negative or one. SQL99 says to return a zero-length
2890 		 * string.
2891 		 */
2892 		if (E < 1)
2893 			return PG_STR_GET_BYTEA("");
2894 
2895 		L1 = E - S1;
2896 	}
2897 
2898 	/*
2899 	 * If the start position is past the end of the string, SQL99 says to
2900 	 * return a zero-length string -- DatumGetByteaPSlice() will do that for
2901 	 * us. Convert to zero-based starting position
2902 	 */
2903 	return DatumGetByteaPSlice(str, S1 - 1, L1);
2904 }
2905 
2906 /*
2907  * byteaoverlay
2908  *	Replace specified substring of first string with second
2909  *
2910  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2911  * This code is a direct implementation of what the standard says.
2912  */
2913 Datum
byteaoverlay(PG_FUNCTION_ARGS)2914 byteaoverlay(PG_FUNCTION_ARGS)
2915 {
2916 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2917 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2918 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
2919 	int			sl = PG_GETARG_INT32(3);	/* substring length */
2920 
2921 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2922 }
2923 
2924 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)2925 byteaoverlay_no_len(PG_FUNCTION_ARGS)
2926 {
2927 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2928 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2929 	int			sp = PG_GETARG_INT32(2);	/* substring start position */
2930 	int			sl;
2931 
2932 	sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2933 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2934 }
2935 
2936 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)2937 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2938 {
2939 	bytea	   *result;
2940 	bytea	   *s1;
2941 	bytea	   *s2;
2942 	int			sp_pl_sl;
2943 
2944 	/*
2945 	 * Check for possible integer-overflow cases.  For negative sp, throw a
2946 	 * "substring length" error because that's what should be expected
2947 	 * according to the spec's definition of OVERLAY().
2948 	 */
2949 	if (sp <= 0)
2950 		ereport(ERROR,
2951 				(errcode(ERRCODE_SUBSTRING_ERROR),
2952 				 errmsg("negative substring length not allowed")));
2953 	sp_pl_sl = sp + sl;
2954 	if (sp_pl_sl <= sl)
2955 		ereport(ERROR,
2956 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2957 				 errmsg("integer out of range")));
2958 
2959 	s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2960 	s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2961 	result = bytea_catenate(s1, t2);
2962 	result = bytea_catenate(result, s2);
2963 
2964 	return result;
2965 }
2966 
2967 /*
2968  * byteapos -
2969  *	  Return the position of the specified substring.
2970  *	  Implements the SQL POSITION() function.
2971  * Cloned from textpos and modified as required.
2972  */
2973 Datum
byteapos(PG_FUNCTION_ARGS)2974 byteapos(PG_FUNCTION_ARGS)
2975 {
2976 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2977 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2978 	int			pos;
2979 	int			px,
2980 				p;
2981 	int			len1,
2982 				len2;
2983 	char	   *p1,
2984 			   *p2;
2985 
2986 	len1 = VARSIZE_ANY_EXHDR(t1);
2987 	len2 = VARSIZE_ANY_EXHDR(t2);
2988 
2989 	if (len2 <= 0)
2990 		PG_RETURN_INT32(1);		/* result for empty pattern */
2991 
2992 	p1 = VARDATA_ANY(t1);
2993 	p2 = VARDATA_ANY(t2);
2994 
2995 	pos = 0;
2996 	px = (len1 - len2);
2997 	for (p = 0; p <= px; p++)
2998 	{
2999 		if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3000 		{
3001 			pos = p + 1;
3002 			break;
3003 		};
3004 		p1++;
3005 	};
3006 
3007 	PG_RETURN_INT32(pos);
3008 }
3009 
3010 /*-------------------------------------------------------------
3011  * byteaGetByte
3012  *
3013  * this routine treats "bytea" as an array of bytes.
3014  * It returns the Nth byte (a number between 0 and 255).
3015  *-------------------------------------------------------------
3016  */
3017 Datum
byteaGetByte(PG_FUNCTION_ARGS)3018 byteaGetByte(PG_FUNCTION_ARGS)
3019 {
3020 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3021 	int32		n = PG_GETARG_INT32(1);
3022 	int			len;
3023 	int			byte;
3024 
3025 	len = VARSIZE_ANY_EXHDR(v);
3026 
3027 	if (n < 0 || n >= len)
3028 		ereport(ERROR,
3029 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3030 				 errmsg("index %d out of valid range, 0..%d",
3031 						n, len - 1)));
3032 
3033 	byte = ((unsigned char *) VARDATA_ANY(v))[n];
3034 
3035 	PG_RETURN_INT32(byte);
3036 }
3037 
3038 /*-------------------------------------------------------------
3039  * byteaGetBit
3040  *
3041  * This routine treats a "bytea" type like an array of bits.
3042  * It returns the value of the Nth bit (0 or 1).
3043  *
3044  *-------------------------------------------------------------
3045  */
3046 Datum
byteaGetBit(PG_FUNCTION_ARGS)3047 byteaGetBit(PG_FUNCTION_ARGS)
3048 {
3049 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
3050 	int32		n = PG_GETARG_INT32(1);
3051 	int			byteNo,
3052 				bitNo;
3053 	int			len;
3054 	int			byte;
3055 
3056 	len = VARSIZE_ANY_EXHDR(v);
3057 
3058 	/* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
3059 	if (n < 0 || n >= (int64) len * 8)
3060 		ereport(ERROR,
3061 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3062 				 errmsg("index %d out of valid range, 0..%d",
3063 						n, (int) Min((int64) len * 8 - 1, INT_MAX))));
3064 
3065 	byteNo = n / 8;
3066 	bitNo = n % 8;
3067 
3068 	byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3069 
3070 	if (byte & (1 << bitNo))
3071 		PG_RETURN_INT32(1);
3072 	else
3073 		PG_RETURN_INT32(0);
3074 }
3075 
3076 /*-------------------------------------------------------------
3077  * byteaSetByte
3078  *
3079  * Given an instance of type 'bytea' creates a new one with
3080  * the Nth byte set to the given value.
3081  *
3082  *-------------------------------------------------------------
3083  */
3084 Datum
byteaSetByte(PG_FUNCTION_ARGS)3085 byteaSetByte(PG_FUNCTION_ARGS)
3086 {
3087 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3088 	int32		n = PG_GETARG_INT32(1);
3089 	int32		newByte = PG_GETARG_INT32(2);
3090 	int			len;
3091 
3092 	len = VARSIZE(res) - VARHDRSZ;
3093 
3094 	if (n < 0 || n >= len)
3095 		ereport(ERROR,
3096 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3097 				 errmsg("index %d out of valid range, 0..%d",
3098 						n, len - 1)));
3099 
3100 	/*
3101 	 * Now set the byte.
3102 	 */
3103 	((unsigned char *) VARDATA(res))[n] = newByte;
3104 
3105 	PG_RETURN_BYTEA_P(res);
3106 }
3107 
3108 /*-------------------------------------------------------------
3109  * byteaSetBit
3110  *
3111  * Given an instance of type 'bytea' creates a new one with
3112  * the Nth bit set to the given value.
3113  *
3114  *-------------------------------------------------------------
3115  */
3116 Datum
byteaSetBit(PG_FUNCTION_ARGS)3117 byteaSetBit(PG_FUNCTION_ARGS)
3118 {
3119 	bytea	   *res = PG_GETARG_BYTEA_P_COPY(0);
3120 	int32		n = PG_GETARG_INT32(1);
3121 	int32		newBit = PG_GETARG_INT32(2);
3122 	int			len;
3123 	int			oldByte,
3124 				newByte;
3125 	int			byteNo,
3126 				bitNo;
3127 
3128 	len = VARSIZE(res) - VARHDRSZ;
3129 
3130 	/* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
3131 	if (n < 0 || n >= (int64) len * 8)
3132 		ereport(ERROR,
3133 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3134 				 errmsg("index %d out of valid range, 0..%d",
3135 						n, (int) Min((int64) len * 8 - 1, INT_MAX))));
3136 
3137 	byteNo = n / 8;
3138 	bitNo = n % 8;
3139 
3140 	/*
3141 	 * sanity check!
3142 	 */
3143 	if (newBit != 0 && newBit != 1)
3144 		ereport(ERROR,
3145 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3146 				 errmsg("new bit must be 0 or 1")));
3147 
3148 	/*
3149 	 * Update the byte.
3150 	 */
3151 	oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3152 
3153 	if (newBit == 0)
3154 		newByte = oldByte & (~(1 << bitNo));
3155 	else
3156 		newByte = oldByte | (1 << bitNo);
3157 
3158 	((unsigned char *) VARDATA(res))[byteNo] = newByte;
3159 
3160 	PG_RETURN_BYTEA_P(res);
3161 }
3162 
3163 
3164 /* text_name()
3165  * Converts a text type to a Name type.
3166  */
3167 Datum
text_name(PG_FUNCTION_ARGS)3168 text_name(PG_FUNCTION_ARGS)
3169 {
3170 	text	   *s = PG_GETARG_TEXT_PP(0);
3171 	Name		result;
3172 	int			len;
3173 
3174 	len = VARSIZE_ANY_EXHDR(s);
3175 
3176 	/* Truncate oversize input */
3177 	if (len >= NAMEDATALEN)
3178 		len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3179 
3180 	/* We use palloc0 here to ensure result is zero-padded */
3181 	result = (Name) palloc0(NAMEDATALEN);
3182 	memcpy(NameStr(*result), VARDATA_ANY(s), len);
3183 
3184 	PG_RETURN_NAME(result);
3185 }
3186 
3187 /* name_text()
3188  * Converts a Name type to a text type.
3189  */
3190 Datum
name_text(PG_FUNCTION_ARGS)3191 name_text(PG_FUNCTION_ARGS)
3192 {
3193 	Name		s = PG_GETARG_NAME(0);
3194 
3195 	PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3196 }
3197 
3198 
3199 /*
3200  * textToQualifiedNameList - convert a text object to list of names
3201  *
3202  * This implements the input parsing needed by nextval() and other
3203  * functions that take a text parameter representing a qualified name.
3204  * We split the name at dots, downcase if not double-quoted, and
3205  * truncate names if they're too long.
3206  */
3207 List *
textToQualifiedNameList(text * textval)3208 textToQualifiedNameList(text *textval)
3209 {
3210 	char	   *rawname;
3211 	List	   *result = NIL;
3212 	List	   *namelist;
3213 	ListCell   *l;
3214 
3215 	/* Convert to C string (handles possible detoasting). */
3216 	/* Note we rely on being able to modify rawname below. */
3217 	rawname = text_to_cstring(textval);
3218 
3219 	if (!SplitIdentifierString(rawname, '.', &namelist))
3220 		ereport(ERROR,
3221 				(errcode(ERRCODE_INVALID_NAME),
3222 				 errmsg("invalid name syntax")));
3223 
3224 	if (namelist == NIL)
3225 		ereport(ERROR,
3226 				(errcode(ERRCODE_INVALID_NAME),
3227 				 errmsg("invalid name syntax")));
3228 
3229 	foreach(l, namelist)
3230 	{
3231 		char	   *curname = (char *) lfirst(l);
3232 
3233 		result = lappend(result, makeString(pstrdup(curname)));
3234 	}
3235 
3236 	pfree(rawname);
3237 	list_free(namelist);
3238 
3239 	return result;
3240 }
3241 
3242 /*
3243  * SplitIdentifierString --- parse a string containing identifiers
3244  *
3245  * This is the guts of textToQualifiedNameList, and is exported for use in
3246  * other situations such as parsing GUC variables.  In the GUC case, it's
3247  * important to avoid memory leaks, so the API is designed to minimize the
3248  * amount of stuff that needs to be allocated and freed.
3249  *
3250  * Inputs:
3251  *	rawstring: the input string; must be overwritable!	On return, it's
3252  *			   been modified to contain the separated identifiers.
3253  *	separator: the separator punctuation expected between identifiers
3254  *			   (typically '.' or ',').  Whitespace may also appear around
3255  *			   identifiers.
3256  * Outputs:
3257  *	namelist: filled with a palloc'd list of pointers to identifiers within
3258  *			  rawstring.  Caller should list_free() this even on error return.
3259  *
3260  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3261  *
3262  * Note that an empty string is considered okay here, though not in
3263  * textToQualifiedNameList.
3264  */
3265 bool
SplitIdentifierString(char * rawstring,char separator,List ** namelist)3266 SplitIdentifierString(char *rawstring, char separator,
3267 					  List **namelist)
3268 {
3269 	char	   *nextp = rawstring;
3270 	bool		done = false;
3271 
3272 	*namelist = NIL;
3273 
3274 	while (scanner_isspace(*nextp))
3275 		nextp++;				/* skip leading whitespace */
3276 
3277 	if (*nextp == '\0')
3278 		return true;			/* allow empty string */
3279 
3280 	/* At the top of the loop, we are at start of a new identifier. */
3281 	do
3282 	{
3283 		char	   *curname;
3284 		char	   *endp;
3285 
3286 		if (*nextp == '"')
3287 		{
3288 			/* Quoted name --- collapse quote-quote pairs, no downcasing */
3289 			curname = nextp + 1;
3290 			for (;;)
3291 			{
3292 				endp = strchr(nextp + 1, '"');
3293 				if (endp == NULL)
3294 					return false;	/* mismatched quotes */
3295 				if (endp[1] != '"')
3296 					break;		/* found end of quoted name */
3297 				/* Collapse adjacent quotes into one quote, and look again */
3298 				memmove(endp, endp + 1, strlen(endp));
3299 				nextp = endp;
3300 			}
3301 			/* endp now points at the terminating quote */
3302 			nextp = endp + 1;
3303 		}
3304 		else
3305 		{
3306 			/* Unquoted name --- extends to separator or whitespace */
3307 			char	   *downname;
3308 			int			len;
3309 
3310 			curname = nextp;
3311 			while (*nextp && *nextp != separator &&
3312 				   !scanner_isspace(*nextp))
3313 				nextp++;
3314 			endp = nextp;
3315 			if (curname == nextp)
3316 				return false;	/* empty unquoted name not allowed */
3317 
3318 			/*
3319 			 * Downcase the identifier, using same code as main lexer does.
3320 			 *
3321 			 * XXX because we want to overwrite the input in-place, we cannot
3322 			 * support a downcasing transformation that increases the string
3323 			 * length.  This is not a problem given the current implementation
3324 			 * of downcase_truncate_identifier, but we'll probably have to do
3325 			 * something about this someday.
3326 			 */
3327 			len = endp - curname;
3328 			downname = downcase_truncate_identifier(curname, len, false);
3329 			Assert(strlen(downname) <= len);
3330 			strncpy(curname, downname, len);	/* strncpy is required here */
3331 			pfree(downname);
3332 		}
3333 
3334 		while (scanner_isspace(*nextp))
3335 			nextp++;			/* skip trailing whitespace */
3336 
3337 		if (*nextp == separator)
3338 		{
3339 			nextp++;
3340 			while (scanner_isspace(*nextp))
3341 				nextp++;		/* skip leading whitespace for next */
3342 			/* we expect another name, so done remains false */
3343 		}
3344 		else if (*nextp == '\0')
3345 			done = true;
3346 		else
3347 			return false;		/* invalid syntax */
3348 
3349 		/* Now safe to overwrite separator with a null */
3350 		*endp = '\0';
3351 
3352 		/* Truncate name if it's overlength */
3353 		truncate_identifier(curname, strlen(curname), false);
3354 
3355 		/*
3356 		 * Finished isolating current name --- add it to list
3357 		 */
3358 		*namelist = lappend(*namelist, curname);
3359 
3360 		/* Loop back if we didn't reach end of string */
3361 	} while (!done);
3362 
3363 	return true;
3364 }
3365 
3366 
3367 /*
3368  * SplitDirectoriesString --- parse a string containing file/directory names
3369  *
3370  * This works fine on file names too; the function name is historical.
3371  *
3372  * This is similar to SplitIdentifierString, except that the parsing
3373  * rules are meant to handle pathnames instead of identifiers: there is
3374  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3375  * and we apply canonicalize_path() to each extracted string.  Because of the
3376  * last, the returned strings are separately palloc'd rather than being
3377  * pointers into rawstring --- but we still scribble on rawstring.
3378  *
3379  * Inputs:
3380  *	rawstring: the input string; must be modifiable!
3381  *	separator: the separator punctuation expected between directories
3382  *			   (typically ',' or ';').  Whitespace may also appear around
3383  *			   directories.
3384  * Outputs:
3385  *	namelist: filled with a palloc'd list of directory names.
3386  *			  Caller should list_free_deep() this even on error return.
3387  *
3388  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3389  *
3390  * Note that an empty string is considered okay here.
3391  */
3392 bool
SplitDirectoriesString(char * rawstring,char separator,List ** namelist)3393 SplitDirectoriesString(char *rawstring, char separator,
3394 					   List **namelist)
3395 {
3396 	char	   *nextp = rawstring;
3397 	bool		done = false;
3398 
3399 	*namelist = NIL;
3400 
3401 	while (scanner_isspace(*nextp))
3402 		nextp++;				/* skip leading whitespace */
3403 
3404 	if (*nextp == '\0')
3405 		return true;			/* allow empty string */
3406 
3407 	/* At the top of the loop, we are at start of a new directory. */
3408 	do
3409 	{
3410 		char	   *curname;
3411 		char	   *endp;
3412 
3413 		if (*nextp == '"')
3414 		{
3415 			/* Quoted name --- collapse quote-quote pairs */
3416 			curname = nextp + 1;
3417 			for (;;)
3418 			{
3419 				endp = strchr(nextp + 1, '"');
3420 				if (endp == NULL)
3421 					return false;	/* mismatched quotes */
3422 				if (endp[1] != '"')
3423 					break;		/* found end of quoted name */
3424 				/* Collapse adjacent quotes into one quote, and look again */
3425 				memmove(endp, endp + 1, strlen(endp));
3426 				nextp = endp;
3427 			}
3428 			/* endp now points at the terminating quote */
3429 			nextp = endp + 1;
3430 		}
3431 		else
3432 		{
3433 			/* Unquoted name --- extends to separator or end of string */
3434 			curname = endp = nextp;
3435 			while (*nextp && *nextp != separator)
3436 			{
3437 				/* trailing whitespace should not be included in name */
3438 				if (!scanner_isspace(*nextp))
3439 					endp = nextp + 1;
3440 				nextp++;
3441 			}
3442 			if (curname == endp)
3443 				return false;	/* empty unquoted name not allowed */
3444 		}
3445 
3446 		while (scanner_isspace(*nextp))
3447 			nextp++;			/* skip trailing whitespace */
3448 
3449 		if (*nextp == separator)
3450 		{
3451 			nextp++;
3452 			while (scanner_isspace(*nextp))
3453 				nextp++;		/* skip leading whitespace for next */
3454 			/* we expect another name, so done remains false */
3455 		}
3456 		else if (*nextp == '\0')
3457 			done = true;
3458 		else
3459 			return false;		/* invalid syntax */
3460 
3461 		/* Now safe to overwrite separator with a null */
3462 		*endp = '\0';
3463 
3464 		/* Truncate path if it's overlength */
3465 		if (strlen(curname) >= MAXPGPATH)
3466 			curname[MAXPGPATH - 1] = '\0';
3467 
3468 		/*
3469 		 * Finished isolating current name --- add it to list
3470 		 */
3471 		curname = pstrdup(curname);
3472 		canonicalize_path(curname);
3473 		*namelist = lappend(*namelist, curname);
3474 
3475 		/* Loop back if we didn't reach end of string */
3476 	} while (!done);
3477 
3478 	return true;
3479 }
3480 
3481 
3482 /*
3483  * SplitGUCList --- parse a string containing identifiers or file names
3484  *
3485  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3486  * presuming whether the elements will be taken as identifiers or file names.
3487  * We assume the input has already been through flatten_set_variable_args(),
3488  * so that we need never downcase (if appropriate, that was done already).
3489  * Nor do we ever truncate, since we don't know the correct max length.
3490  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3491  * because any embedded whitespace should have led to double-quoting).
3492  * Otherwise the API is identical to SplitIdentifierString.
3493  *
3494  * XXX it's annoying to have so many copies of this string-splitting logic.
3495  * However, it's not clear that having one function with a bunch of option
3496  * flags would be much better.
3497  *
3498  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3499  * Be sure to update that if you have to change this.
3500  *
3501  * Inputs:
3502  *	rawstring: the input string; must be overwritable!	On return, it's
3503  *			   been modified to contain the separated identifiers.
3504  *	separator: the separator punctuation expected between identifiers
3505  *			   (typically '.' or ',').  Whitespace may also appear around
3506  *			   identifiers.
3507  * Outputs:
3508  *	namelist: filled with a palloc'd list of pointers to identifiers within
3509  *			  rawstring.  Caller should list_free() this even on error return.
3510  *
3511  * Returns true if okay, false if there is a syntax error in the string.
3512  */
3513 bool
SplitGUCList(char * rawstring,char separator,List ** namelist)3514 SplitGUCList(char *rawstring, char separator,
3515 			 List **namelist)
3516 {
3517 	char	   *nextp = rawstring;
3518 	bool		done = false;
3519 
3520 	*namelist = NIL;
3521 
3522 	while (scanner_isspace(*nextp))
3523 		nextp++;				/* skip leading whitespace */
3524 
3525 	if (*nextp == '\0')
3526 		return true;			/* allow empty string */
3527 
3528 	/* At the top of the loop, we are at start of a new identifier. */
3529 	do
3530 	{
3531 		char	   *curname;
3532 		char	   *endp;
3533 
3534 		if (*nextp == '"')
3535 		{
3536 			/* Quoted name --- collapse quote-quote pairs */
3537 			curname = nextp + 1;
3538 			for (;;)
3539 			{
3540 				endp = strchr(nextp + 1, '"');
3541 				if (endp == NULL)
3542 					return false;	/* mismatched quotes */
3543 				if (endp[1] != '"')
3544 					break;		/* found end of quoted name */
3545 				/* Collapse adjacent quotes into one quote, and look again */
3546 				memmove(endp, endp + 1, strlen(endp));
3547 				nextp = endp;
3548 			}
3549 			/* endp now points at the terminating quote */
3550 			nextp = endp + 1;
3551 		}
3552 		else
3553 		{
3554 			/* Unquoted name --- extends to separator or whitespace */
3555 			curname = nextp;
3556 			while (*nextp && *nextp != separator &&
3557 				   !scanner_isspace(*nextp))
3558 				nextp++;
3559 			endp = nextp;
3560 			if (curname == nextp)
3561 				return false;	/* empty unquoted name not allowed */
3562 		}
3563 
3564 		while (scanner_isspace(*nextp))
3565 			nextp++;			/* skip trailing whitespace */
3566 
3567 		if (*nextp == separator)
3568 		{
3569 			nextp++;
3570 			while (scanner_isspace(*nextp))
3571 				nextp++;		/* skip leading whitespace for next */
3572 			/* we expect another name, so done remains false */
3573 		}
3574 		else if (*nextp == '\0')
3575 			done = true;
3576 		else
3577 			return false;		/* invalid syntax */
3578 
3579 		/* Now safe to overwrite separator with a null */
3580 		*endp = '\0';
3581 
3582 		/*
3583 		 * Finished isolating current name --- add it to list
3584 		 */
3585 		*namelist = lappend(*namelist, curname);
3586 
3587 		/* Loop back if we didn't reach end of string */
3588 	} while (!done);
3589 
3590 	return true;
3591 }
3592 
3593 
3594 /*****************************************************************************
3595  *	Comparison Functions used for bytea
3596  *
3597  * Note: btree indexes need these routines not to leak memory; therefore,
3598  * be careful to free working copies of toasted datums.  Most places don't
3599  * need to be so careful.
3600  *****************************************************************************/
3601 
3602 Datum
byteaeq(PG_FUNCTION_ARGS)3603 byteaeq(PG_FUNCTION_ARGS)
3604 {
3605 	Datum		arg1 = PG_GETARG_DATUM(0);
3606 	Datum		arg2 = PG_GETARG_DATUM(1);
3607 	bool		result;
3608 	Size		len1,
3609 				len2;
3610 
3611 	/*
3612 	 * We can use a fast path for unequal lengths, which might save us from
3613 	 * having to detoast one or both values.
3614 	 */
3615 	len1 = toast_raw_datum_size(arg1);
3616 	len2 = toast_raw_datum_size(arg2);
3617 	if (len1 != len2)
3618 		result = false;
3619 	else
3620 	{
3621 		bytea	   *barg1 = DatumGetByteaPP(arg1);
3622 		bytea	   *barg2 = DatumGetByteaPP(arg2);
3623 
3624 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3625 						 len1 - VARHDRSZ) == 0);
3626 
3627 		PG_FREE_IF_COPY(barg1, 0);
3628 		PG_FREE_IF_COPY(barg2, 1);
3629 	}
3630 
3631 	PG_RETURN_BOOL(result);
3632 }
3633 
3634 Datum
byteane(PG_FUNCTION_ARGS)3635 byteane(PG_FUNCTION_ARGS)
3636 {
3637 	Datum		arg1 = PG_GETARG_DATUM(0);
3638 	Datum		arg2 = PG_GETARG_DATUM(1);
3639 	bool		result;
3640 	Size		len1,
3641 				len2;
3642 
3643 	/*
3644 	 * We can use a fast path for unequal lengths, which might save us from
3645 	 * having to detoast one or both values.
3646 	 */
3647 	len1 = toast_raw_datum_size(arg1);
3648 	len2 = toast_raw_datum_size(arg2);
3649 	if (len1 != len2)
3650 		result = true;
3651 	else
3652 	{
3653 		bytea	   *barg1 = DatumGetByteaPP(arg1);
3654 		bytea	   *barg2 = DatumGetByteaPP(arg2);
3655 
3656 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3657 						 len1 - VARHDRSZ) != 0);
3658 
3659 		PG_FREE_IF_COPY(barg1, 0);
3660 		PG_FREE_IF_COPY(barg2, 1);
3661 	}
3662 
3663 	PG_RETURN_BOOL(result);
3664 }
3665 
3666 Datum
bytealt(PG_FUNCTION_ARGS)3667 bytealt(PG_FUNCTION_ARGS)
3668 {
3669 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3670 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3671 	int			len1,
3672 				len2;
3673 	int			cmp;
3674 
3675 	len1 = VARSIZE_ANY_EXHDR(arg1);
3676 	len2 = VARSIZE_ANY_EXHDR(arg2);
3677 
3678 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3679 
3680 	PG_FREE_IF_COPY(arg1, 0);
3681 	PG_FREE_IF_COPY(arg2, 1);
3682 
3683 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3684 }
3685 
3686 Datum
byteale(PG_FUNCTION_ARGS)3687 byteale(PG_FUNCTION_ARGS)
3688 {
3689 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3690 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3691 	int			len1,
3692 				len2;
3693 	int			cmp;
3694 
3695 	len1 = VARSIZE_ANY_EXHDR(arg1);
3696 	len2 = VARSIZE_ANY_EXHDR(arg2);
3697 
3698 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3699 
3700 	PG_FREE_IF_COPY(arg1, 0);
3701 	PG_FREE_IF_COPY(arg2, 1);
3702 
3703 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3704 }
3705 
3706 Datum
byteagt(PG_FUNCTION_ARGS)3707 byteagt(PG_FUNCTION_ARGS)
3708 {
3709 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3710 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3711 	int			len1,
3712 				len2;
3713 	int			cmp;
3714 
3715 	len1 = VARSIZE_ANY_EXHDR(arg1);
3716 	len2 = VARSIZE_ANY_EXHDR(arg2);
3717 
3718 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3719 
3720 	PG_FREE_IF_COPY(arg1, 0);
3721 	PG_FREE_IF_COPY(arg2, 1);
3722 
3723 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3724 }
3725 
3726 Datum
byteage(PG_FUNCTION_ARGS)3727 byteage(PG_FUNCTION_ARGS)
3728 {
3729 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3730 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3731 	int			len1,
3732 				len2;
3733 	int			cmp;
3734 
3735 	len1 = VARSIZE_ANY_EXHDR(arg1);
3736 	len2 = VARSIZE_ANY_EXHDR(arg2);
3737 
3738 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3739 
3740 	PG_FREE_IF_COPY(arg1, 0);
3741 	PG_FREE_IF_COPY(arg2, 1);
3742 
3743 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3744 }
3745 
3746 Datum
byteacmp(PG_FUNCTION_ARGS)3747 byteacmp(PG_FUNCTION_ARGS)
3748 {
3749 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3750 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3751 	int			len1,
3752 				len2;
3753 	int			cmp;
3754 
3755 	len1 = VARSIZE_ANY_EXHDR(arg1);
3756 	len2 = VARSIZE_ANY_EXHDR(arg2);
3757 
3758 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3759 	if ((cmp == 0) && (len1 != len2))
3760 		cmp = (len1 < len2) ? -1 : 1;
3761 
3762 	PG_FREE_IF_COPY(arg1, 0);
3763 	PG_FREE_IF_COPY(arg2, 1);
3764 
3765 	PG_RETURN_INT32(cmp);
3766 }
3767 
3768 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)3769 bytea_sortsupport(PG_FUNCTION_ARGS)
3770 {
3771 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3772 	MemoryContext oldcontext;
3773 
3774 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3775 
3776 	/* Use generic string SortSupport, forcing "C" collation */
3777 	varstr_sortsupport(ssup, C_COLLATION_OID, false);
3778 
3779 	MemoryContextSwitchTo(oldcontext);
3780 
3781 	PG_RETURN_VOID();
3782 }
3783 
3784 /*
3785  * appendStringInfoText
3786  *
3787  * Append a text to str.
3788  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3789  */
3790 static void
appendStringInfoText(StringInfo str,const text * t)3791 appendStringInfoText(StringInfo str, const text *t)
3792 {
3793 	appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3794 }
3795 
3796 /*
3797  * replace_text
3798  * replace all occurrences of 'old_sub_str' in 'orig_str'
3799  * with 'new_sub_str' to form 'new_str'
3800  *
3801  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3802  * otherwise returns 'new_str'
3803  */
3804 Datum
replace_text(PG_FUNCTION_ARGS)3805 replace_text(PG_FUNCTION_ARGS)
3806 {
3807 	text	   *src_text = PG_GETARG_TEXT_PP(0);
3808 	text	   *from_sub_text = PG_GETARG_TEXT_PP(1);
3809 	text	   *to_sub_text = PG_GETARG_TEXT_PP(2);
3810 	int			src_text_len;
3811 	int			from_sub_text_len;
3812 	TextPositionState state;
3813 	text	   *ret_text;
3814 	int			start_posn;
3815 	int			curr_posn;
3816 	int			chunk_len;
3817 	char	   *start_ptr;
3818 	StringInfoData str;
3819 
3820 	text_position_setup(src_text, from_sub_text, &state);
3821 
3822 	/*
3823 	 * Note: we check the converted string length, not the original, because
3824 	 * they could be different if the input contained invalid encoding.
3825 	 */
3826 	src_text_len = state.len1;
3827 	from_sub_text_len = state.len2;
3828 
3829 	/* Return unmodified source string if empty source or pattern */
3830 	if (src_text_len < 1 || from_sub_text_len < 1)
3831 	{
3832 		text_position_cleanup(&state);
3833 		PG_RETURN_TEXT_P(src_text);
3834 	}
3835 
3836 	start_posn = 1;
3837 	curr_posn = text_position_next(1, &state);
3838 
3839 	/* When the from_sub_text is not found, there is nothing to do. */
3840 	if (curr_posn == 0)
3841 	{
3842 		text_position_cleanup(&state);
3843 		PG_RETURN_TEXT_P(src_text);
3844 	}
3845 
3846 	/* start_ptr points to the start_posn'th character of src_text */
3847 	start_ptr = VARDATA_ANY(src_text);
3848 
3849 	initStringInfo(&str);
3850 
3851 	do
3852 	{
3853 		CHECK_FOR_INTERRUPTS();
3854 
3855 		/* copy the data skipped over by last text_position_next() */
3856 		chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3857 		appendBinaryStringInfo(&str, start_ptr, chunk_len);
3858 
3859 		appendStringInfoText(&str, to_sub_text);
3860 
3861 		start_posn = curr_posn;
3862 		start_ptr += chunk_len;
3863 		start_posn += from_sub_text_len;
3864 		start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3865 
3866 		curr_posn = text_position_next(start_posn, &state);
3867 	}
3868 	while (curr_posn > 0);
3869 
3870 	/* copy trailing data */
3871 	chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3872 	appendBinaryStringInfo(&str, start_ptr, chunk_len);
3873 
3874 	text_position_cleanup(&state);
3875 
3876 	ret_text = cstring_to_text_with_len(str.data, str.len);
3877 	pfree(str.data);
3878 
3879 	PG_RETURN_TEXT_P(ret_text);
3880 }
3881 
3882 /*
3883  * check_replace_text_has_escape_char
3884  *
3885  * check whether replace_text contains escape char.
3886  */
3887 static bool
check_replace_text_has_escape_char(const text * replace_text)3888 check_replace_text_has_escape_char(const text *replace_text)
3889 {
3890 	const char *p = VARDATA_ANY(replace_text);
3891 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3892 
3893 	if (pg_database_encoding_max_length() == 1)
3894 	{
3895 		for (; p < p_end; p++)
3896 		{
3897 			if (*p == '\\')
3898 				return true;
3899 		}
3900 	}
3901 	else
3902 	{
3903 		for (; p < p_end; p += pg_mblen(p))
3904 		{
3905 			if (*p == '\\')
3906 				return true;
3907 		}
3908 	}
3909 
3910 	return false;
3911 }
3912 
3913 /*
3914  * appendStringInfoRegexpSubstr
3915  *
3916  * Append replace_text to str, substituting regexp back references for
3917  * \n escapes.  start_ptr is the start of the match in the source string,
3918  * at logical character position data_pos.
3919  */
3920 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)3921 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
3922 							 regmatch_t *pmatch,
3923 							 char *start_ptr, int data_pos)
3924 {
3925 	const char *p = VARDATA_ANY(replace_text);
3926 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3927 	int			eml = pg_database_encoding_max_length();
3928 
3929 	for (;;)
3930 	{
3931 		const char *chunk_start = p;
3932 		int			so;
3933 		int			eo;
3934 
3935 		/* Find next escape char. */
3936 		if (eml == 1)
3937 		{
3938 			for (; p < p_end && *p != '\\'; p++)
3939 				 /* nothing */ ;
3940 		}
3941 		else
3942 		{
3943 			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3944 				 /* nothing */ ;
3945 		}
3946 
3947 		/* Copy the text we just scanned over, if any. */
3948 		if (p > chunk_start)
3949 			appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3950 
3951 		/* Done if at end of string, else advance over escape char. */
3952 		if (p >= p_end)
3953 			break;
3954 		p++;
3955 
3956 		if (p >= p_end)
3957 		{
3958 			/* Escape at very end of input.  Treat same as unexpected char */
3959 			appendStringInfoChar(str, '\\');
3960 			break;
3961 		}
3962 
3963 		if (*p >= '1' && *p <= '9')
3964 		{
3965 			/* Use the back reference of regexp. */
3966 			int			idx = *p - '0';
3967 
3968 			so = pmatch[idx].rm_so;
3969 			eo = pmatch[idx].rm_eo;
3970 			p++;
3971 		}
3972 		else if (*p == '&')
3973 		{
3974 			/* Use the entire matched string. */
3975 			so = pmatch[0].rm_so;
3976 			eo = pmatch[0].rm_eo;
3977 			p++;
3978 		}
3979 		else if (*p == '\\')
3980 		{
3981 			/* \\ means transfer one \ to output. */
3982 			appendStringInfoChar(str, '\\');
3983 			p++;
3984 			continue;
3985 		}
3986 		else
3987 		{
3988 			/*
3989 			 * If escape char is not followed by any expected char, just treat
3990 			 * it as ordinary data to copy.  (XXX would it be better to throw
3991 			 * an error?)
3992 			 */
3993 			appendStringInfoChar(str, '\\');
3994 			continue;
3995 		}
3996 
3997 		if (so != -1 && eo != -1)
3998 		{
3999 			/*
4000 			 * Copy the text that is back reference of regexp.  Note so and eo
4001 			 * are counted in characters not bytes.
4002 			 */
4003 			char	   *chunk_start;
4004 			int			chunk_len;
4005 
4006 			Assert(so >= data_pos);
4007 			chunk_start = start_ptr;
4008 			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4009 			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4010 			appendBinaryStringInfo(str, chunk_start, chunk_len);
4011 		}
4012 	}
4013 }
4014 
4015 #define REGEXP_REPLACE_BACKREF_CNT		10
4016 
4017 /*
4018  * replace_text_regexp
4019  *
4020  * replace text that matches to regexp in src_text to replace_text.
4021  *
4022  * Note: to avoid having to include regex.h in builtins.h, we declare
4023  * the regexp argument as void *, but really it's regex_t *.
4024  */
4025 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)4026 replace_text_regexp(text *src_text, void *regexp,
4027 					text *replace_text, bool glob)
4028 {
4029 	text	   *ret_text;
4030 	regex_t    *re = (regex_t *) regexp;
4031 	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
4032 	StringInfoData buf;
4033 	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
4034 	pg_wchar   *data;
4035 	size_t		data_len;
4036 	int			search_start;
4037 	int			data_pos;
4038 	char	   *start_ptr;
4039 	bool		have_escape;
4040 
4041 	initStringInfo(&buf);
4042 
4043 	/* Convert data string to wide characters. */
4044 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4045 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4046 
4047 	/* Check whether replace_text has escape char. */
4048 	have_escape = check_replace_text_has_escape_char(replace_text);
4049 
4050 	/* start_ptr points to the data_pos'th character of src_text */
4051 	start_ptr = (char *) VARDATA_ANY(src_text);
4052 	data_pos = 0;
4053 
4054 	search_start = 0;
4055 	while (search_start <= data_len)
4056 	{
4057 		int			regexec_result;
4058 
4059 		CHECK_FOR_INTERRUPTS();
4060 
4061 		regexec_result = pg_regexec(re,
4062 									data,
4063 									data_len,
4064 									search_start,
4065 									NULL,	/* no details */
4066 									REGEXP_REPLACE_BACKREF_CNT,
4067 									pmatch,
4068 									0);
4069 
4070 		if (regexec_result == REG_NOMATCH)
4071 			break;
4072 
4073 		if (regexec_result != REG_OKAY)
4074 		{
4075 			char		errMsg[100];
4076 
4077 			CHECK_FOR_INTERRUPTS();
4078 			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4079 			ereport(ERROR,
4080 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4081 					 errmsg("regular expression failed: %s", errMsg)));
4082 		}
4083 
4084 		/*
4085 		 * Copy the text to the left of the match position.  Note we are given
4086 		 * character not byte indexes.
4087 		 */
4088 		if (pmatch[0].rm_so - data_pos > 0)
4089 		{
4090 			int			chunk_len;
4091 
4092 			chunk_len = charlen_to_bytelen(start_ptr,
4093 										   pmatch[0].rm_so - data_pos);
4094 			appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4095 
4096 			/*
4097 			 * Advance start_ptr over that text, to avoid multiple rescans of
4098 			 * it if the replace_text contains multiple back-references.
4099 			 */
4100 			start_ptr += chunk_len;
4101 			data_pos = pmatch[0].rm_so;
4102 		}
4103 
4104 		/*
4105 		 * Copy the replace_text. Process back references when the
4106 		 * replace_text has escape characters.
4107 		 */
4108 		if (have_escape)
4109 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4110 										 start_ptr, data_pos);
4111 		else
4112 			appendStringInfoText(&buf, replace_text);
4113 
4114 		/* Advance start_ptr and data_pos over the matched text. */
4115 		start_ptr += charlen_to_bytelen(start_ptr,
4116 										pmatch[0].rm_eo - data_pos);
4117 		data_pos = pmatch[0].rm_eo;
4118 
4119 		/*
4120 		 * When global option is off, replace the first instance only.
4121 		 */
4122 		if (!glob)
4123 			break;
4124 
4125 		/*
4126 		 * Advance search position.  Normally we start the next search at the
4127 		 * end of the previous match; but if the match was of zero length, we
4128 		 * have to advance by one character, or we'd just find the same match
4129 		 * again.
4130 		 */
4131 		search_start = data_pos;
4132 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
4133 			search_start++;
4134 	}
4135 
4136 	/*
4137 	 * Copy the text to the right of the last match.
4138 	 */
4139 	if (data_pos < data_len)
4140 	{
4141 		int			chunk_len;
4142 
4143 		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4144 		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4145 	}
4146 
4147 	ret_text = cstring_to_text_with_len(buf.data, buf.len);
4148 	pfree(buf.data);
4149 	pfree(data);
4150 
4151 	return ret_text;
4152 }
4153 
4154 /*
4155  * split_text
4156  * parse input string
4157  * return ord item (1 based)
4158  * based on provided field separator
4159  */
4160 Datum
split_text(PG_FUNCTION_ARGS)4161 split_text(PG_FUNCTION_ARGS)
4162 {
4163 	text	   *inputstring = PG_GETARG_TEXT_PP(0);
4164 	text	   *fldsep = PG_GETARG_TEXT_PP(1);
4165 	int			fldnum = PG_GETARG_INT32(2);
4166 	int			inputstring_len;
4167 	int			fldsep_len;
4168 	TextPositionState state;
4169 	int			start_posn;
4170 	int			end_posn;
4171 	text	   *result_text;
4172 
4173 	/* field number is 1 based */
4174 	if (fldnum < 1)
4175 		ereport(ERROR,
4176 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4177 				 errmsg("field position must be greater than zero")));
4178 
4179 	text_position_setup(inputstring, fldsep, &state);
4180 
4181 	/*
4182 	 * Note: we check the converted string length, not the original, because
4183 	 * they could be different if the input contained invalid encoding.
4184 	 */
4185 	inputstring_len = state.len1;
4186 	fldsep_len = state.len2;
4187 
4188 	/* return empty string for empty input string */
4189 	if (inputstring_len < 1)
4190 	{
4191 		text_position_cleanup(&state);
4192 		PG_RETURN_TEXT_P(cstring_to_text(""));
4193 	}
4194 
4195 	/* empty field separator */
4196 	if (fldsep_len < 1)
4197 	{
4198 		text_position_cleanup(&state);
4199 		/* if first field, return input string, else empty string */
4200 		if (fldnum == 1)
4201 			PG_RETURN_TEXT_P(inputstring);
4202 		else
4203 			PG_RETURN_TEXT_P(cstring_to_text(""));
4204 	}
4205 
4206 	/* identify bounds of first field */
4207 	start_posn = 1;
4208 	end_posn = text_position_next(1, &state);
4209 
4210 	/* special case if fldsep not found at all */
4211 	if (end_posn == 0)
4212 	{
4213 		text_position_cleanup(&state);
4214 		/* if field 1 requested, return input string, else empty string */
4215 		if (fldnum == 1)
4216 			PG_RETURN_TEXT_P(inputstring);
4217 		else
4218 			PG_RETURN_TEXT_P(cstring_to_text(""));
4219 	}
4220 
4221 	while (end_posn > 0 && --fldnum > 0)
4222 	{
4223 		/* identify bounds of next field */
4224 		start_posn = end_posn + fldsep_len;
4225 		end_posn = text_position_next(start_posn, &state);
4226 	}
4227 
4228 	text_position_cleanup(&state);
4229 
4230 	if (fldnum > 0)
4231 	{
4232 		/* N'th field separator not found */
4233 		/* if last field requested, return it, else empty string */
4234 		if (fldnum == 1)
4235 			result_text = text_substring(PointerGetDatum(inputstring),
4236 										 start_posn,
4237 										 -1,
4238 										 true);
4239 		else
4240 			result_text = cstring_to_text("");
4241 	}
4242 	else
4243 	{
4244 		/* non-last field requested */
4245 		result_text = text_substring(PointerGetDatum(inputstring),
4246 									 start_posn,
4247 									 end_posn - start_posn,
4248 									 false);
4249 	}
4250 
4251 	PG_RETURN_TEXT_P(result_text);
4252 }
4253 
4254 /*
4255  * Convenience function to return true when two text params are equal.
4256  */
4257 static bool
text_isequal(text * txt1,text * txt2)4258 text_isequal(text *txt1, text *txt2)
4259 {
4260 	return DatumGetBool(DirectFunctionCall2(texteq,
4261 											PointerGetDatum(txt1),
4262 											PointerGetDatum(txt2)));
4263 }
4264 
4265 /*
4266  * text_to_array
4267  * parse input string and return text array of elements,
4268  * based on provided field separator
4269  */
4270 Datum
text_to_array(PG_FUNCTION_ARGS)4271 text_to_array(PG_FUNCTION_ARGS)
4272 {
4273 	return text_to_array_internal(fcinfo);
4274 }
4275 
4276 /*
4277  * text_to_array_null
4278  * parse input string and return text array of elements,
4279  * based on provided field separator and null string
4280  *
4281  * This is a separate entry point only to prevent the regression tests from
4282  * complaining about different argument sets for the same internal function.
4283  */
4284 Datum
text_to_array_null(PG_FUNCTION_ARGS)4285 text_to_array_null(PG_FUNCTION_ARGS)
4286 {
4287 	return text_to_array_internal(fcinfo);
4288 }
4289 
4290 /*
4291  * common code for text_to_array and text_to_array_null functions
4292  *
4293  * These are not strict so we have to test for null inputs explicitly.
4294  */
4295 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4296 text_to_array_internal(PG_FUNCTION_ARGS)
4297 {
4298 	text	   *inputstring;
4299 	text	   *fldsep;
4300 	text	   *null_string;
4301 	int			inputstring_len;
4302 	int			fldsep_len;
4303 	char	   *start_ptr;
4304 	text	   *result_text;
4305 	bool		is_null;
4306 	ArrayBuildState *astate = NULL;
4307 
4308 	/* when input string is NULL, then result is NULL too */
4309 	if (PG_ARGISNULL(0))
4310 		PG_RETURN_NULL();
4311 
4312 	inputstring = PG_GETARG_TEXT_PP(0);
4313 
4314 	/* fldsep can be NULL */
4315 	if (!PG_ARGISNULL(1))
4316 		fldsep = PG_GETARG_TEXT_PP(1);
4317 	else
4318 		fldsep = NULL;
4319 
4320 	/* null_string can be NULL or omitted */
4321 	if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4322 		null_string = PG_GETARG_TEXT_PP(2);
4323 	else
4324 		null_string = NULL;
4325 
4326 	if (fldsep != NULL)
4327 	{
4328 		/*
4329 		 * Normal case with non-null fldsep.  Use the text_position machinery
4330 		 * to search for occurrences of fldsep.
4331 		 */
4332 		TextPositionState state;
4333 		int			fldnum;
4334 		int			start_posn;
4335 		int			end_posn;
4336 		int			chunk_len;
4337 
4338 		text_position_setup(inputstring, fldsep, &state);
4339 
4340 		/*
4341 		 * Note: we check the converted string length, not the original,
4342 		 * because they could be different if the input contained invalid
4343 		 * encoding.
4344 		 */
4345 		inputstring_len = state.len1;
4346 		fldsep_len = state.len2;
4347 
4348 		/* return empty array for empty input string */
4349 		if (inputstring_len < 1)
4350 		{
4351 			text_position_cleanup(&state);
4352 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4353 		}
4354 
4355 		/*
4356 		 * empty field separator: return the input string as a one-element
4357 		 * array
4358 		 */
4359 		if (fldsep_len < 1)
4360 		{
4361 			Datum		elems[1];
4362 			bool		nulls[1];
4363 			int			dims[1];
4364 			int			lbs[1];
4365 
4366 			text_position_cleanup(&state);
4367 			/* single element can be a NULL too */
4368 			is_null = null_string ? text_isequal(inputstring, null_string) : false;
4369 
4370 			elems[0] = PointerGetDatum(inputstring);
4371 			nulls[0] = is_null;
4372 			dims[0] = 1;
4373 			lbs[0] = 1;
4374 			/* XXX: this hardcodes assumptions about the text type */
4375 			PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
4376 													 1, dims, lbs,
4377 													 TEXTOID, -1, false, 'i'));
4378 		}
4379 
4380 		start_posn = 1;
4381 		/* start_ptr points to the start_posn'th character of inputstring */
4382 		start_ptr = VARDATA_ANY(inputstring);
4383 
4384 		for (fldnum = 1;; fldnum++) /* field number is 1 based */
4385 		{
4386 			CHECK_FOR_INTERRUPTS();
4387 
4388 			end_posn = text_position_next(start_posn, &state);
4389 
4390 			if (end_posn == 0)
4391 			{
4392 				/* fetch last field */
4393 				chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4394 			}
4395 			else
4396 			{
4397 				/* fetch non-last field */
4398 				chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
4399 			}
4400 
4401 			/* must build a temp text datum to pass to accumArrayResult */
4402 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4403 			is_null = null_string ? text_isequal(result_text, null_string) : false;
4404 
4405 			/* stash away this field */
4406 			astate = accumArrayResult(astate,
4407 									  PointerGetDatum(result_text),
4408 									  is_null,
4409 									  TEXTOID,
4410 									  CurrentMemoryContext);
4411 
4412 			pfree(result_text);
4413 
4414 			if (end_posn == 0)
4415 				break;
4416 
4417 			start_posn = end_posn;
4418 			start_ptr += chunk_len;
4419 			start_posn += fldsep_len;
4420 			start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
4421 		}
4422 
4423 		text_position_cleanup(&state);
4424 	}
4425 	else
4426 	{
4427 		/*
4428 		 * When fldsep is NULL, each character in the inputstring becomes an
4429 		 * element in the result array.  The separator is effectively the
4430 		 * space between characters.
4431 		 */
4432 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4433 
4434 		/* return empty array for empty input string */
4435 		if (inputstring_len < 1)
4436 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4437 
4438 		start_ptr = VARDATA_ANY(inputstring);
4439 
4440 		while (inputstring_len > 0)
4441 		{
4442 			int			chunk_len = pg_mblen(start_ptr);
4443 
4444 			CHECK_FOR_INTERRUPTS();
4445 
4446 			/* must build a temp text datum to pass to accumArrayResult */
4447 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4448 			is_null = null_string ? text_isequal(result_text, null_string) : false;
4449 
4450 			/* stash away this field */
4451 			astate = accumArrayResult(astate,
4452 									  PointerGetDatum(result_text),
4453 									  is_null,
4454 									  TEXTOID,
4455 									  CurrentMemoryContext);
4456 
4457 			pfree(result_text);
4458 
4459 			start_ptr += chunk_len;
4460 			inputstring_len -= chunk_len;
4461 		}
4462 	}
4463 
4464 	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4465 										  CurrentMemoryContext));
4466 }
4467 
4468 /*
4469  * array_to_text
4470  * concatenate Cstring representation of input array elements
4471  * using provided field separator
4472  */
4473 Datum
array_to_text(PG_FUNCTION_ARGS)4474 array_to_text(PG_FUNCTION_ARGS)
4475 {
4476 	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4477 	char	   *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4478 
4479 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4480 }
4481 
4482 /*
4483  * array_to_text_null
4484  * concatenate Cstring representation of input array elements
4485  * using provided field separator and null string
4486  *
4487  * This version is not strict so we have to test for null inputs explicitly.
4488  */
4489 Datum
array_to_text_null(PG_FUNCTION_ARGS)4490 array_to_text_null(PG_FUNCTION_ARGS)
4491 {
4492 	ArrayType  *v;
4493 	char	   *fldsep;
4494 	char	   *null_string;
4495 
4496 	/* returns NULL when first or second parameter is NULL */
4497 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4498 		PG_RETURN_NULL();
4499 
4500 	v = PG_GETARG_ARRAYTYPE_P(0);
4501 	fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4502 
4503 	/* NULL null string is passed through as a null pointer */
4504 	if (!PG_ARGISNULL(2))
4505 		null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4506 	else
4507 		null_string = NULL;
4508 
4509 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4510 }
4511 
4512 /*
4513  * common code for array_to_text and array_to_text_null functions
4514  */
4515 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4516 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4517 					   const char *fldsep, const char *null_string)
4518 {
4519 	text	   *result;
4520 	int			nitems,
4521 			   *dims,
4522 				ndims;
4523 	Oid			element_type;
4524 	int			typlen;
4525 	bool		typbyval;
4526 	char		typalign;
4527 	StringInfoData buf;
4528 	bool		printed = false;
4529 	char	   *p;
4530 	bits8	   *bitmap;
4531 	int			bitmask;
4532 	int			i;
4533 	ArrayMetaState *my_extra;
4534 
4535 	ndims = ARR_NDIM(v);
4536 	dims = ARR_DIMS(v);
4537 	nitems = ArrayGetNItems(ndims, dims);
4538 
4539 	/* if there are no elements, return an empty string */
4540 	if (nitems == 0)
4541 		return cstring_to_text_with_len("", 0);
4542 
4543 	element_type = ARR_ELEMTYPE(v);
4544 	initStringInfo(&buf);
4545 
4546 	/*
4547 	 * We arrange to look up info about element type, including its output
4548 	 * conversion proc, only once per series of calls, assuming the element
4549 	 * type doesn't change underneath us.
4550 	 */
4551 	my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4552 	if (my_extra == NULL)
4553 	{
4554 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4555 													  sizeof(ArrayMetaState));
4556 		my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4557 		my_extra->element_type = ~element_type;
4558 	}
4559 
4560 	if (my_extra->element_type != element_type)
4561 	{
4562 		/*
4563 		 * Get info about element type, including its output conversion proc
4564 		 */
4565 		get_type_io_data(element_type, IOFunc_output,
4566 						 &my_extra->typlen, &my_extra->typbyval,
4567 						 &my_extra->typalign, &my_extra->typdelim,
4568 						 &my_extra->typioparam, &my_extra->typiofunc);
4569 		fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4570 					  fcinfo->flinfo->fn_mcxt);
4571 		my_extra->element_type = element_type;
4572 	}
4573 	typlen = my_extra->typlen;
4574 	typbyval = my_extra->typbyval;
4575 	typalign = my_extra->typalign;
4576 
4577 	p = ARR_DATA_PTR(v);
4578 	bitmap = ARR_NULLBITMAP(v);
4579 	bitmask = 1;
4580 
4581 	for (i = 0; i < nitems; i++)
4582 	{
4583 		Datum		itemvalue;
4584 		char	   *value;
4585 
4586 		/* Get source element, checking for NULL */
4587 		if (bitmap && (*bitmap & bitmask) == 0)
4588 		{
4589 			/* if null_string is NULL, we just ignore null elements */
4590 			if (null_string != NULL)
4591 			{
4592 				if (printed)
4593 					appendStringInfo(&buf, "%s%s", fldsep, null_string);
4594 				else
4595 					appendStringInfoString(&buf, null_string);
4596 				printed = true;
4597 			}
4598 		}
4599 		else
4600 		{
4601 			itemvalue = fetch_att(p, typbyval, typlen);
4602 
4603 			value = OutputFunctionCall(&my_extra->proc, itemvalue);
4604 
4605 			if (printed)
4606 				appendStringInfo(&buf, "%s%s", fldsep, value);
4607 			else
4608 				appendStringInfoString(&buf, value);
4609 			printed = true;
4610 
4611 			p = att_addlength_pointer(p, typlen, p);
4612 			p = (char *) att_align_nominal(p, typalign);
4613 		}
4614 
4615 		/* advance bitmap pointer if any */
4616 		if (bitmap)
4617 		{
4618 			bitmask <<= 1;
4619 			if (bitmask == 0x100)
4620 			{
4621 				bitmap++;
4622 				bitmask = 1;
4623 			}
4624 		}
4625 	}
4626 
4627 	result = cstring_to_text_with_len(buf.data, buf.len);
4628 	pfree(buf.data);
4629 
4630 	return result;
4631 }
4632 
4633 #define HEXBASE 16
4634 /*
4635  * Convert an int32 to a string containing a base 16 (hex) representation of
4636  * the number.
4637  */
4638 Datum
to_hex32(PG_FUNCTION_ARGS)4639 to_hex32(PG_FUNCTION_ARGS)
4640 {
4641 	uint32		value = (uint32) PG_GETARG_INT32(0);
4642 	char	   *ptr;
4643 	const char *digits = "0123456789abcdef";
4644 	char		buf[32];		/* bigger than needed, but reasonable */
4645 
4646 	ptr = buf + sizeof(buf) - 1;
4647 	*ptr = '\0';
4648 
4649 	do
4650 	{
4651 		*--ptr = digits[value % HEXBASE];
4652 		value /= HEXBASE;
4653 	} while (ptr > buf && value);
4654 
4655 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
4656 }
4657 
4658 /*
4659  * Convert an int64 to a string containing a base 16 (hex) representation of
4660  * the number.
4661  */
4662 Datum
to_hex64(PG_FUNCTION_ARGS)4663 to_hex64(PG_FUNCTION_ARGS)
4664 {
4665 	uint64		value = (uint64) PG_GETARG_INT64(0);
4666 	char	   *ptr;
4667 	const char *digits = "0123456789abcdef";
4668 	char		buf[32];		/* bigger than needed, but reasonable */
4669 
4670 	ptr = buf + sizeof(buf) - 1;
4671 	*ptr = '\0';
4672 
4673 	do
4674 	{
4675 		*--ptr = digits[value % HEXBASE];
4676 		value /= HEXBASE;
4677 	} while (ptr > buf && value);
4678 
4679 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
4680 }
4681 
4682 /*
4683  * Create an md5 hash of a text string and return it as hex
4684  *
4685  * md5 produces a 16 byte (128 bit) hash; double it for hex
4686  */
4687 #define MD5_HASH_LEN  32
4688 
4689 Datum
md5_text(PG_FUNCTION_ARGS)4690 md5_text(PG_FUNCTION_ARGS)
4691 {
4692 	text	   *in_text = PG_GETARG_TEXT_PP(0);
4693 	size_t		len;
4694 	char		hexsum[MD5_HASH_LEN + 1];
4695 
4696 	/* Calculate the length of the buffer using varlena metadata */
4697 	len = VARSIZE_ANY_EXHDR(in_text);
4698 
4699 	/* get the hash result */
4700 	if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4701 		ereport(ERROR,
4702 				(errcode(ERRCODE_OUT_OF_MEMORY),
4703 				 errmsg("out of memory")));
4704 
4705 	/* convert to text and return it */
4706 	PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4707 }
4708 
4709 /*
4710  * Create an md5 hash of a bytea field and return it as a hex string:
4711  * 16-byte md5 digest is represented in 32 hex characters.
4712  */
4713 Datum
md5_bytea(PG_FUNCTION_ARGS)4714 md5_bytea(PG_FUNCTION_ARGS)
4715 {
4716 	bytea	   *in = PG_GETARG_BYTEA_PP(0);
4717 	size_t		len;
4718 	char		hexsum[MD5_HASH_LEN + 1];
4719 
4720 	len = VARSIZE_ANY_EXHDR(in);
4721 	if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4722 		ereport(ERROR,
4723 				(errcode(ERRCODE_OUT_OF_MEMORY),
4724 				 errmsg("out of memory")));
4725 
4726 	PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4727 }
4728 
4729 /*
4730  * Return the size of a datum, possibly compressed
4731  *
4732  * Works on any data type
4733  */
4734 Datum
pg_column_size(PG_FUNCTION_ARGS)4735 pg_column_size(PG_FUNCTION_ARGS)
4736 {
4737 	Datum		value = PG_GETARG_DATUM(0);
4738 	int32		result;
4739 	int			typlen;
4740 
4741 	/* On first call, get the input type's typlen, and save at *fn_extra */
4742 	if (fcinfo->flinfo->fn_extra == NULL)
4743 	{
4744 		/* Lookup the datatype of the supplied argument */
4745 		Oid			argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4746 
4747 		typlen = get_typlen(argtypeid);
4748 		if (typlen == 0)		/* should not happen */
4749 			elog(ERROR, "cache lookup failed for type %u", argtypeid);
4750 
4751 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4752 													  sizeof(int));
4753 		*((int *) fcinfo->flinfo->fn_extra) = typlen;
4754 	}
4755 	else
4756 		typlen = *((int *) fcinfo->flinfo->fn_extra);
4757 
4758 	if (typlen == -1)
4759 	{
4760 		/* varlena type, possibly toasted */
4761 		result = toast_datum_size(value);
4762 	}
4763 	else if (typlen == -2)
4764 	{
4765 		/* cstring */
4766 		result = strlen(DatumGetCString(value)) + 1;
4767 	}
4768 	else
4769 	{
4770 		/* ordinary fixed-width type */
4771 		result = typlen;
4772 	}
4773 
4774 	PG_RETURN_INT32(result);
4775 }
4776 
4777 /*
4778  * string_agg - Concatenates values and returns string.
4779  *
4780  * Syntax: string_agg(value text, delimiter text) RETURNS text
4781  *
4782  * Note: Any NULL values are ignored. The first-call delimiter isn't
4783  * actually used at all, and on subsequent calls the delimiter precedes
4784  * the associated value.
4785  */
4786 
4787 /* subroutine to initialize state */
4788 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)4789 makeStringAggState(FunctionCallInfo fcinfo)
4790 {
4791 	StringInfo	state;
4792 	MemoryContext aggcontext;
4793 	MemoryContext oldcontext;
4794 
4795 	if (!AggCheckCallContext(fcinfo, &aggcontext))
4796 	{
4797 		/* cannot be called directly because of internal-type argument */
4798 		elog(ERROR, "string_agg_transfn called in non-aggregate context");
4799 	}
4800 
4801 	/*
4802 	 * Create state in aggregate context.  It'll stay there across subsequent
4803 	 * calls.
4804 	 */
4805 	oldcontext = MemoryContextSwitchTo(aggcontext);
4806 	state = makeStringInfo();
4807 	MemoryContextSwitchTo(oldcontext);
4808 
4809 	return state;
4810 }
4811 
4812 Datum
string_agg_transfn(PG_FUNCTION_ARGS)4813 string_agg_transfn(PG_FUNCTION_ARGS)
4814 {
4815 	StringInfo	state;
4816 
4817 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4818 
4819 	/* Append the value unless null. */
4820 	if (!PG_ARGISNULL(1))
4821 	{
4822 		/* On the first time through, we ignore the delimiter. */
4823 		if (state == NULL)
4824 			state = makeStringAggState(fcinfo);
4825 		else if (!PG_ARGISNULL(2))
4826 			appendStringInfoText(state, PG_GETARG_TEXT_PP(2));	/* delimiter */
4827 
4828 		appendStringInfoText(state, PG_GETARG_TEXT_PP(1));	/* value */
4829 	}
4830 
4831 	/*
4832 	 * The transition type for string_agg() is declared to be "internal",
4833 	 * which is a pass-by-value type the same size as a pointer.
4834 	 */
4835 	PG_RETURN_POINTER(state);
4836 }
4837 
4838 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)4839 string_agg_finalfn(PG_FUNCTION_ARGS)
4840 {
4841 	StringInfo	state;
4842 
4843 	/* cannot be called directly because of internal-type argument */
4844 	Assert(AggCheckCallContext(fcinfo, NULL));
4845 
4846 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4847 
4848 	if (state != NULL)
4849 		PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
4850 	else
4851 		PG_RETURN_NULL();
4852 }
4853 
4854 /*
4855  * Implementation of both concat() and concat_ws().
4856  *
4857  * sepstr is the separator string to place between values.
4858  * argidx identifies the first argument to concatenate (counting from zero).
4859  * Returns NULL if result should be NULL, else text value.
4860  */
4861 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)4862 concat_internal(const char *sepstr, int argidx,
4863 				FunctionCallInfo fcinfo)
4864 {
4865 	text	   *result;
4866 	StringInfoData str;
4867 	bool		first_arg = true;
4868 	int			i;
4869 
4870 	/*
4871 	 * concat(VARIADIC some-array) is essentially equivalent to
4872 	 * array_to_text(), ie concat the array elements with the given separator.
4873 	 * So we just pass the case off to that code.
4874 	 */
4875 	if (get_fn_expr_variadic(fcinfo->flinfo))
4876 	{
4877 		ArrayType  *arr;
4878 
4879 		/* Should have just the one argument */
4880 		Assert(argidx == PG_NARGS() - 1);
4881 
4882 		/* concat(VARIADIC NULL) is defined as NULL */
4883 		if (PG_ARGISNULL(argidx))
4884 			return NULL;
4885 
4886 		/*
4887 		 * Non-null argument had better be an array.  We assume that any call
4888 		 * context that could let get_fn_expr_variadic return true will have
4889 		 * checked that a VARIADIC-labeled parameter actually is an array.  So
4890 		 * it should be okay to just Assert that it's an array rather than
4891 		 * doing a full-fledged error check.
4892 		 */
4893 		Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
4894 
4895 		/* OK, safe to fetch the array value */
4896 		arr = PG_GETARG_ARRAYTYPE_P(argidx);
4897 
4898 		/*
4899 		 * And serialize the array.  We tell array_to_text to ignore null
4900 		 * elements, which matches the behavior of the loop below.
4901 		 */
4902 		return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4903 	}
4904 
4905 	/* Normal case without explicit VARIADIC marker */
4906 	initStringInfo(&str);
4907 
4908 	for (i = argidx; i < PG_NARGS(); i++)
4909 	{
4910 		if (!PG_ARGISNULL(i))
4911 		{
4912 			Datum		value = PG_GETARG_DATUM(i);
4913 			Oid			valtype;
4914 			Oid			typOutput;
4915 			bool		typIsVarlena;
4916 
4917 			/* add separator if appropriate */
4918 			if (first_arg)
4919 				first_arg = false;
4920 			else
4921 				appendStringInfoString(&str, sepstr);
4922 
4923 			/* call the appropriate type output function, append the result */
4924 			valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4925 			if (!OidIsValid(valtype))
4926 				elog(ERROR, "could not determine data type of concat() input");
4927 			getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4928 			appendStringInfoString(&str,
4929 								   OidOutputFunctionCall(typOutput, value));
4930 		}
4931 	}
4932 
4933 	result = cstring_to_text_with_len(str.data, str.len);
4934 	pfree(str.data);
4935 
4936 	return result;
4937 }
4938 
4939 /*
4940  * Concatenate all arguments. NULL arguments are ignored.
4941  */
4942 Datum
text_concat(PG_FUNCTION_ARGS)4943 text_concat(PG_FUNCTION_ARGS)
4944 {
4945 	text	   *result;
4946 
4947 	result = concat_internal("", 0, fcinfo);
4948 	if (result == NULL)
4949 		PG_RETURN_NULL();
4950 	PG_RETURN_TEXT_P(result);
4951 }
4952 
4953 /*
4954  * Concatenate all but first argument value with separators. The first
4955  * parameter is used as the separator. NULL arguments are ignored.
4956  */
4957 Datum
text_concat_ws(PG_FUNCTION_ARGS)4958 text_concat_ws(PG_FUNCTION_ARGS)
4959 {
4960 	char	   *sep;
4961 	text	   *result;
4962 
4963 	/* return NULL when separator is NULL */
4964 	if (PG_ARGISNULL(0))
4965 		PG_RETURN_NULL();
4966 	sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
4967 
4968 	result = concat_internal(sep, 1, fcinfo);
4969 	if (result == NULL)
4970 		PG_RETURN_NULL();
4971 	PG_RETURN_TEXT_P(result);
4972 }
4973 
4974 /*
4975  * Return first n characters in the string. When n is negative,
4976  * return all but last |n| characters.
4977  */
4978 Datum
text_left(PG_FUNCTION_ARGS)4979 text_left(PG_FUNCTION_ARGS)
4980 {
4981 	text	   *str = PG_GETARG_TEXT_PP(0);
4982 	const char *p = VARDATA_ANY(str);
4983 	int			len = VARSIZE_ANY_EXHDR(str);
4984 	int			n = PG_GETARG_INT32(1);
4985 	int			rlen;
4986 
4987 	if (n < 0)
4988 		n = pg_mbstrlen_with_len(p, len) + n;
4989 	rlen = pg_mbcharcliplen(p, len, n);
4990 
4991 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
4992 }
4993 
4994 /*
4995  * Return last n characters in the string. When n is negative,
4996  * return all but first |n| characters.
4997  */
4998 Datum
text_right(PG_FUNCTION_ARGS)4999 text_right(PG_FUNCTION_ARGS)
5000 {
5001 	text	   *str = PG_GETARG_TEXT_PP(0);
5002 	const char *p = VARDATA_ANY(str);
5003 	int			len = VARSIZE_ANY_EXHDR(str);
5004 	int			n = PG_GETARG_INT32(1);
5005 	int			off;
5006 
5007 	if (n < 0)
5008 		n = -n;
5009 	else
5010 		n = pg_mbstrlen_with_len(p, len) - n;
5011 	off = pg_mbcharcliplen(p, len, n);
5012 
5013 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5014 }
5015 
5016 /*
5017  * Return reversed string
5018  */
5019 Datum
text_reverse(PG_FUNCTION_ARGS)5020 text_reverse(PG_FUNCTION_ARGS)
5021 {
5022 	text	   *str = PG_GETARG_TEXT_PP(0);
5023 	const char *p = VARDATA_ANY(str);
5024 	int			len = VARSIZE_ANY_EXHDR(str);
5025 	const char *endp = p + len;
5026 	text	   *result;
5027 	char	   *dst;
5028 
5029 	result = palloc(len + VARHDRSZ);
5030 	dst = (char *) VARDATA(result) + len;
5031 	SET_VARSIZE(result, len + VARHDRSZ);
5032 
5033 	if (pg_database_encoding_max_length() > 1)
5034 	{
5035 		/* multibyte version */
5036 		while (p < endp)
5037 		{
5038 			int			sz;
5039 
5040 			sz = pg_mblen(p);
5041 			dst -= sz;
5042 			memcpy(dst, p, sz);
5043 			p += sz;
5044 		}
5045 	}
5046 	else
5047 	{
5048 		/* single byte version */
5049 		while (p < endp)
5050 			*(--dst) = *p++;
5051 	}
5052 
5053 	PG_RETURN_TEXT_P(result);
5054 }
5055 
5056 
/*
 * Support macros for text_format()
 */

/* Flag bit: is minus flag present? */
#define TEXT_FORMAT_FLAG_MINUS	0x0001

/*
 * Move ptr on by one character, and raise an error if that takes it to or
 * beyond end_ptr.  ptr is modified in place.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (!(++(ptr) < (end_ptr))) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5070 
5071 /*
5072  * Returns a formatted string
5073  */
5074 Datum
text_format(PG_FUNCTION_ARGS)5075 text_format(PG_FUNCTION_ARGS)
5076 {
5077 	text	   *fmt;
5078 	StringInfoData str;
5079 	const char *cp;
5080 	const char *start_ptr;
5081 	const char *end_ptr;
5082 	text	   *result;
5083 	int			arg;
5084 	bool		funcvariadic;
5085 	int			nargs;
5086 	Datum	   *elements = NULL;
5087 	bool	   *nulls = NULL;
5088 	Oid			element_type = InvalidOid;
5089 	Oid			prev_type = InvalidOid;
5090 	Oid			prev_width_type = InvalidOid;
5091 	FmgrInfo	typoutputfinfo;
5092 	FmgrInfo	typoutputinfo_width;
5093 
5094 	/* When format string is null, immediately return null */
5095 	if (PG_ARGISNULL(0))
5096 		PG_RETURN_NULL();
5097 
5098 	/* If argument is marked VARIADIC, expand array into elements */
5099 	if (get_fn_expr_variadic(fcinfo->flinfo))
5100 	{
5101 		ArrayType  *arr;
5102 		int16		elmlen;
5103 		bool		elmbyval;
5104 		char		elmalign;
5105 		int			nitems;
5106 
5107 		/* Should have just the one argument */
5108 		Assert(PG_NARGS() == 2);
5109 
5110 		/* If argument is NULL, we treat it as zero-length array */
5111 		if (PG_ARGISNULL(1))
5112 			nitems = 0;
5113 		else
5114 		{
5115 			/*
5116 			 * Non-null argument had better be an array.  We assume that any
5117 			 * call context that could let get_fn_expr_variadic return true
5118 			 * will have checked that a VARIADIC-labeled parameter actually is
5119 			 * an array.  So it should be okay to just Assert that it's an
5120 			 * array rather than doing a full-fledged error check.
5121 			 */
5122 			Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5123 
5124 			/* OK, safe to fetch the array value */
5125 			arr = PG_GETARG_ARRAYTYPE_P(1);
5126 
5127 			/* Get info about array element type */
5128 			element_type = ARR_ELEMTYPE(arr);
5129 			get_typlenbyvalalign(element_type,
5130 								 &elmlen, &elmbyval, &elmalign);
5131 
5132 			/* Extract all array elements */
5133 			deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5134 							  &elements, &nulls, &nitems);
5135 		}
5136 
5137 		nargs = nitems + 1;
5138 		funcvariadic = true;
5139 	}
5140 	else
5141 	{
5142 		/* Non-variadic case, we'll process the arguments individually */
5143 		nargs = PG_NARGS();
5144 		funcvariadic = false;
5145 	}
5146 
5147 	/* Setup for main loop. */
5148 	fmt = PG_GETARG_TEXT_PP(0);
5149 	start_ptr = VARDATA_ANY(fmt);
5150 	end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5151 	initStringInfo(&str);
5152 	arg = 1;					/* next argument position to print */
5153 
5154 	/* Scan format string, looking for conversion specifiers. */
5155 	for (cp = start_ptr; cp < end_ptr; cp++)
5156 	{
5157 		int			argpos;
5158 		int			widthpos;
5159 		int			flags;
5160 		int			width;
5161 		Datum		value;
5162 		bool		isNull;
5163 		Oid			typid;
5164 
5165 		/*
5166 		 * If it's not the start of a conversion specifier, just copy it to
5167 		 * the output buffer.
5168 		 */
5169 		if (*cp != '%')
5170 		{
5171 			appendStringInfoCharMacro(&str, *cp);
5172 			continue;
5173 		}
5174 
5175 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5176 
5177 		/* Easy case: %% outputs a single % */
5178 		if (*cp == '%')
5179 		{
5180 			appendStringInfoCharMacro(&str, *cp);
5181 			continue;
5182 		}
5183 
5184 		/* Parse the optional portions of the format specifier */
5185 		cp = text_format_parse_format(cp, end_ptr,
5186 									  &argpos, &widthpos,
5187 									  &flags, &width);
5188 
5189 		/*
5190 		 * Next we should see the main conversion specifier.  Whether or not
5191 		 * an argument position was present, it's known that at least one
5192 		 * character remains in the string at this point.  Experience suggests
5193 		 * that it's worth checking that that character is one of the expected
5194 		 * ones before we try to fetch arguments, so as to produce the least
5195 		 * confusing response to a mis-formatted specifier.
5196 		 */
5197 		if (strchr("sIL", *cp) == NULL)
5198 			ereport(ERROR,
5199 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5200 					 errmsg("unrecognized format() type specifier \"%c\"",
5201 							*cp),
5202 					 errhint("For a single \"%%\" use \"%%%%\".")));
5203 
5204 		/* If indirect width was specified, get its value */
5205 		if (widthpos >= 0)
5206 		{
5207 			/* Collect the specified or next argument position */
5208 			if (widthpos > 0)
5209 				arg = widthpos;
5210 			if (arg >= nargs)
5211 				ereport(ERROR,
5212 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5213 						 errmsg("too few arguments for format()")));
5214 
5215 			/* Get the value and type of the selected argument */
5216 			if (!funcvariadic)
5217 			{
5218 				value = PG_GETARG_DATUM(arg);
5219 				isNull = PG_ARGISNULL(arg);
5220 				typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5221 			}
5222 			else
5223 			{
5224 				value = elements[arg - 1];
5225 				isNull = nulls[arg - 1];
5226 				typid = element_type;
5227 			}
5228 			if (!OidIsValid(typid))
5229 				elog(ERROR, "could not determine data type of format() input");
5230 
5231 			arg++;
5232 
5233 			/* We can treat NULL width the same as zero */
5234 			if (isNull)
5235 				width = 0;
5236 			else if (typid == INT4OID)
5237 				width = DatumGetInt32(value);
5238 			else if (typid == INT2OID)
5239 				width = DatumGetInt16(value);
5240 			else
5241 			{
5242 				/* For less-usual datatypes, convert to text then to int */
5243 				char	   *str;
5244 
5245 				if (typid != prev_width_type)
5246 				{
5247 					Oid			typoutputfunc;
5248 					bool		typIsVarlena;
5249 
5250 					getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5251 					fmgr_info(typoutputfunc, &typoutputinfo_width);
5252 					prev_width_type = typid;
5253 				}
5254 
5255 				str = OutputFunctionCall(&typoutputinfo_width, value);
5256 
5257 				/* pg_atoi will complain about bad data or overflow */
5258 				width = pg_atoi(str, sizeof(int), '\0');
5259 
5260 				pfree(str);
5261 			}
5262 		}
5263 
5264 		/* Collect the specified or next argument position */
5265 		if (argpos > 0)
5266 			arg = argpos;
5267 		if (arg >= nargs)
5268 			ereport(ERROR,
5269 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5270 					 errmsg("too few arguments for format()")));
5271 
5272 		/* Get the value and type of the selected argument */
5273 		if (!funcvariadic)
5274 		{
5275 			value = PG_GETARG_DATUM(arg);
5276 			isNull = PG_ARGISNULL(arg);
5277 			typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5278 		}
5279 		else
5280 		{
5281 			value = elements[arg - 1];
5282 			isNull = nulls[arg - 1];
5283 			typid = element_type;
5284 		}
5285 		if (!OidIsValid(typid))
5286 			elog(ERROR, "could not determine data type of format() input");
5287 
5288 		arg++;
5289 
5290 		/*
5291 		 * Get the appropriate typOutput function, reusing previous one if
5292 		 * same type as previous argument.  That's particularly useful in the
5293 		 * variadic-array case, but often saves work even for ordinary calls.
5294 		 */
5295 		if (typid != prev_type)
5296 		{
5297 			Oid			typoutputfunc;
5298 			bool		typIsVarlena;
5299 
5300 			getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5301 			fmgr_info(typoutputfunc, &typoutputfinfo);
5302 			prev_type = typid;
5303 		}
5304 
5305 		/*
5306 		 * And now we can format the value.
5307 		 */
5308 		switch (*cp)
5309 		{
5310 			case 's':
5311 			case 'I':
5312 			case 'L':
5313 				text_format_string_conversion(&str, *cp, &typoutputfinfo,
5314 											  value, isNull,
5315 											  flags, width);
5316 				break;
5317 			default:
5318 				/* should not get here, because of previous check */
5319 				ereport(ERROR,
5320 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5321 						 errmsg("unrecognized format() type specifier \"%c\"",
5322 								*cp),
5323 						 errhint("For a single \"%%\" use \"%%%%\".")));
5324 				break;
5325 		}
5326 	}
5327 
5328 	/* Don't need deconstruct_array results anymore. */
5329 	if (elements != NULL)
5330 		pfree(elements);
5331 	if (nulls != NULL)
5332 		pfree(nulls);
5333 
5334 	/* Generate results. */
5335 	result = cstring_to_text_with_len(str.data, str.len);
5336 	pfree(str.data);
5337 
5338 	PG_RETURN_TEXT_P(result);
5339 }
5340 
5341 /*
5342  * Parse contiguous digits as a decimal number.
5343  *
5344  * Returns true if some digits could be parsed.
5345  * The value is returned into *value, and *ptr is advanced to the next
5346  * character to be parsed.
5347  *
5348  * Note parsing invariant: at least one character is known available before
5349  * string end (end_ptr) at entry, and this is still true at exit.
5350  */
5351 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5352 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5353 {
5354 	bool		found = false;
5355 	const char *cp = *ptr;
5356 	int			val = 0;
5357 
5358 	while (*cp >= '0' && *cp <= '9')
5359 	{
5360 		int			newval = val * 10 + (*cp - '0');
5361 
5362 		if (newval / 10 != val) /* overflow? */
5363 			ereport(ERROR,
5364 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5365 					 errmsg("number is out of range")));
5366 		val = newval;
5367 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5368 		found = true;
5369 	}
5370 
5371 	*ptr = cp;
5372 	*value = val;
5373 
5374 	return found;
5375 }
5376 
5377 /*
5378  * Parse a format specifier (generally following the SUS printf spec).
5379  *
5380  * We have already advanced over the initial '%', and we are looking for
5381  * [argpos][flags][width]type (but the type character is not consumed here).
5382  *
5383  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5384  * Output parameters:
5385  *	argpos: argument position for value to be printed.  -1 means unspecified.
5386  *	widthpos: argument position for width.  Zero means the argument position
5387  *			was unspecified (ie, take the next arg) and -1 means no width
5388  *			argument (width was omitted or specified as a constant).
5389  *	flags: bitmask of flags.
5390  *	width: directly-specified width value.  Zero means the width was omitted
5391  *			(note it's not necessary to distinguish this case from an explicit
5392  *			zero width value).
5393  *
5394  * The function result is the next character position to be parsed, ie, the
5395  * location where the type character is/should be.
5396  *
5397  * Note parsing invariant: at least one character is known available before
5398  * string end (end_ptr) at entry, and this is still true at exit.
5399  */
5400 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5401 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5402 						 int *argpos, int *widthpos,
5403 						 int *flags, int *width)
5404 {
5405 	const char *cp = start_ptr;
5406 	int			n;
5407 
5408 	/* set defaults for output parameters */
5409 	*argpos = -1;
5410 	*widthpos = -1;
5411 	*flags = 0;
5412 	*width = 0;
5413 
5414 	/* try to identify first number */
5415 	if (text_format_parse_digits(&cp, end_ptr, &n))
5416 	{
5417 		if (*cp != '$')
5418 		{
5419 			/* Must be just a width and a type, so we're done */
5420 			*width = n;
5421 			return cp;
5422 		}
5423 		/* The number was argument position */
5424 		*argpos = n;
5425 		/* Explicit 0 for argument index is immediately refused */
5426 		if (n == 0)
5427 			ereport(ERROR,
5428 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5429 					 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5430 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5431 	}
5432 
5433 	/* Handle flags (only minus is supported now) */
5434 	while (*cp == '-')
5435 	{
5436 		*flags |= TEXT_FORMAT_FLAG_MINUS;
5437 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5438 	}
5439 
5440 	if (*cp == '*')
5441 	{
5442 		/* Handle indirect width */
5443 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5444 		if (text_format_parse_digits(&cp, end_ptr, &n))
5445 		{
5446 			/* number in this position must be closed by $ */
5447 			if (*cp != '$')
5448 				ereport(ERROR,
5449 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5450 						 errmsg("width argument position must be ended by \"$\"")));
5451 			/* The number was width argument position */
5452 			*widthpos = n;
5453 			/* Explicit 0 for argument index is immediately refused */
5454 			if (n == 0)
5455 				ereport(ERROR,
5456 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5457 						 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5458 			ADVANCE_PARSE_POINTER(cp, end_ptr);
5459 		}
5460 		else
5461 			*widthpos = 0;		/* width's argument position is unspecified */
5462 	}
5463 	else
5464 	{
5465 		/* Check for direct width specification */
5466 		if (text_format_parse_digits(&cp, end_ptr, &n))
5467 			*width = n;
5468 	}
5469 
5470 	/* cp should now be pointing at type character */
5471 	return cp;
5472 }
5473 
5474 /*
5475  * Format a %s, %I, or %L conversion
5476  */
5477 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5478 text_format_string_conversion(StringInfo buf, char conversion,
5479 							  FmgrInfo *typOutputInfo,
5480 							  Datum value, bool isNull,
5481 							  int flags, int width)
5482 {
5483 	char	   *str;
5484 
5485 	/* Handle NULL arguments before trying to stringify the value. */
5486 	if (isNull)
5487 	{
5488 		if (conversion == 's')
5489 			text_format_append_string(buf, "", flags, width);
5490 		else if (conversion == 'L')
5491 			text_format_append_string(buf, "NULL", flags, width);
5492 		else if (conversion == 'I')
5493 			ereport(ERROR,
5494 					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5495 					 errmsg("null values cannot be formatted as an SQL identifier")));
5496 		return;
5497 	}
5498 
5499 	/* Stringify. */
5500 	str = OutputFunctionCall(typOutputInfo, value);
5501 
5502 	/* Escape. */
5503 	if (conversion == 'I')
5504 	{
5505 		/* quote_identifier may or may not allocate a new string. */
5506 		text_format_append_string(buf, quote_identifier(str), flags, width);
5507 	}
5508 	else if (conversion == 'L')
5509 	{
5510 		char	   *qstr = quote_literal_cstr(str);
5511 
5512 		text_format_append_string(buf, qstr, flags, width);
5513 		/* quote_literal_cstr() always allocates a new string */
5514 		pfree(qstr);
5515 	}
5516 	else
5517 		text_format_append_string(buf, str, flags, width);
5518 
5519 	/* Cleanup. */
5520 	pfree(str);
5521 }
5522 
5523 /*
5524  * Append str to buf, padding as directed by flags/width
5525  */
5526 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5527 text_format_append_string(StringInfo buf, const char *str,
5528 						  int flags, int width)
5529 {
5530 	bool		align_to_left = false;
5531 	int			len;
5532 
5533 	/* fast path for typical easy case */
5534 	if (width == 0)
5535 	{
5536 		appendStringInfoString(buf, str);
5537 		return;
5538 	}
5539 
5540 	if (width < 0)
5541 	{
5542 		/* Negative width: implicit '-' flag, then take absolute value */
5543 		align_to_left = true;
5544 		/* -INT_MIN is undefined */
5545 		if (width <= INT_MIN)
5546 			ereport(ERROR,
5547 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5548 					 errmsg("number is out of range")));
5549 		width = -width;
5550 	}
5551 	else if (flags & TEXT_FORMAT_FLAG_MINUS)
5552 		align_to_left = true;
5553 
5554 	len = pg_mbstrlen(str);
5555 	if (align_to_left)
5556 	{
5557 		/* left justify */
5558 		appendStringInfoString(buf, str);
5559 		if (len < width)
5560 			appendStringInfoSpaces(buf, width - len);
5561 	}
5562 	else
5563 	{
5564 		/* right justify */
5565 		if (len < width)
5566 			appendStringInfoSpaces(buf, width - len);
5567 		appendStringInfoString(buf, str);
5568 	}
5569 }
5570 
5571 /*
5572  * text_format_nv - nonvariadic wrapper for text_format function.
5573  *
5574  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5575  * which checks that all built-in functions that share the implementing C
5576  * function take the same number of arguments.
5577  */
5578 Datum
text_format_nv(PG_FUNCTION_ARGS)5579 text_format_nv(PG_FUNCTION_ARGS)
5580 {
5581 	return text_format(fcinfo);
5582 }
5583 
/*
 * Helper function for Levenshtein distance functions: report whether the
 * first len bytes of s1 and s2 are identical, comparing from the last byte
 * backwards.  A len of zero or less always yields true.
 *
 * Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5599 
5600 /* Expand each Levenshtein distance variant */
5601 #include "levenshtein.c"
5602 #define LEVENSHTEIN_LESS_EQUAL
5603 #include "levenshtein.c"
5604