1 /*-------------------------------------------------------------------------
2  *
3  * varlena.c
4  *	  Functions for the variable-length built-in types.
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/varlena.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <stdbool.h>
18 #include <ctype.h>
19 #include <limits.h>
20 
21 #include "access/hash.h"
22 #include "access/tuptoaster.h"
23 #include "catalog/pg_collation.h"
24 #include "catalog/pg_type.h"
25 #include "lib/hyperloglog.h"
26 #include "libpq/md5.h"
27 #include "libpq/pqformat.h"
28 #include "miscadmin.h"
29 #include "parser/scansup.h"
30 #include "port/pg_bswap.h"
31 #include "regex/regex.h"
32 #include "utils/builtins.h"
33 #include "utils/bytea.h"
34 #include "utils/lsyscache.h"
35 #include "utils/memutils.h"
36 #include "utils/pg_locale.h"
37 #include "utils/sortsupport.h"
38 
39 
40 /* GUC variable: selects the output format that byteaout() produces */
41 int			bytea_output = BYTEA_OUTPUT_HEX;
42 
/* File-local aliases; both name the generic struct varlena. */
43 typedef struct varlena unknown;
44 typedef struct varlena VarString;
45 
/*
 * Working state shared by text_position_setup(), text_position_next() and
 * text_position_cleanup() while searching one string for another.
 */
46 typedef struct
47 {
48 	bool		use_wchar;		/* true if multibyte encoding */
49 	char	   *str1;			/* use these if not use_wchar */
50 	char	   *str2;			/* note: these point to original texts */
51 	pg_wchar   *wstr1;			/* use these if use_wchar */
52 	pg_wchar   *wstr2;			/* note: these are palloc'd */
53 	int			len1;			/* string lengths in logical characters */
54 	int			len2;
55 	/* Skip table for Boyer-Moore-Horspool search algorithm: */
56 	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
57 	int			skiptable[256]; /* skip distance for given mismatched char */
58 } TextPositionState;
59 
/*
 * Private working state for the sort-support routines declared below
 * (presumably the varstr and bpchar fast comparators and the abbreviated-key
 * functions; their definitions are not visible in this part of the file).
 */
60 typedef struct
61 {
62 	char	   *buf1;			/* 1st string, or abbreviation original string
63 								 * buf */
64 	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
65 	int			buflen1;
66 	int			buflen2;
67 	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
68 	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
69 	int			last_returned;	/* Last comparison result (cache) */
70 	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
71 	bool		collate_c;
72 	bool		bpchar;			/* Sorting bpchar, not varchar/text/bytea? */
73 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
74 	hyperLogLogState full_card; /* Full key cardinality state */
75 	double		prop_card;		/* Required cardinality proportion */
76 #ifdef HAVE_LOCALE_T
77 	pg_locale_t locale;
78 #endif
79 } VarStringSortSupport;
80 
81 /*
82  * This should be large enough that most strings will fit, but small enough
83  * that we feel comfortable putting it on the stack
84  */
85 #define TEXTBUFLEN		1024
86 
/* fmgr-style accessors for the file-local "unknown" alias of struct varlena */
87 #define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
88 #define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
89 #define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
90 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
91 #define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)
92 
/* Datum-to-VarString accessors; the PP form uses PG_DETOAST_DATUM_PACKED */
93 #define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
94 #define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
95 
/* Forward declarations of this file's static (file-local) routines */
96 static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
97 static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
98 static int	varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup);
99 static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
100 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
101 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
102 static int32 text_length(Datum str);
103 static text *text_catenate(text *t1, text *t2);
104 static text *text_substring(Datum str,
105 			   int32 start,
106 			   int32 length,
107 			   bool length_not_specified);
108 static text *text_overlay(text *t1, text *t2, int sp, int sl);
109 static int	text_position(text *t1, text *t2);
110 static void text_position_setup(text *t1, text *t2, TextPositionState *state);
111 static int	text_position_next(int start_pos, TextPositionState *state);
112 static void text_position_cleanup(TextPositionState *state);
113 static int	text_cmp(text *arg1, text *arg2, Oid collid);
114 static bytea *bytea_catenate(bytea *t1, bytea *t2);
115 static bytea *bytea_substring(Datum str,
116 				int S,
117 				int L,
118 				bool length_not_specified);
119 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
120 static void appendStringInfoText(StringInfo str, const text *t);
121 static Datum text_to_array_internal(PG_FUNCTION_ARGS);
122 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
123 					   const char *fldsep, const char *null_string);
124 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
125 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
126 						 int *value);
127 static const char *text_format_parse_format(const char *start_ptr,
128 						 const char *end_ptr,
129 						 int *argpos, int *widthpos,
130 						 int *flags, int *width);
131 static void text_format_string_conversion(StringInfo buf, char conversion,
132 							  FmgrInfo *typOutputInfo,
133 							  Datum value, bool isNull,
134 							  int flags, int width);
135 static void text_format_append_string(StringInfo buf, const char *str,
136 						  int flags, int width);
137 
138 
139 /*****************************************************************************
140  *	 CONVERSION ROUTINES EXPORTED FOR USE BY C CODE							 *
141  *****************************************************************************/
142 
143 /*
144  * cstring_to_text
145  *
146  * Create a text value from a null-terminated C string.
147  *
148  * The new text value is freshly palloc'd with a full-size VARHDR.
149  */
150 text *
cstring_to_text(const char * s)151 cstring_to_text(const char *s)
152 {
153 	return cstring_to_text_with_len(s, strlen(s));
154 }
155 
156 /*
157  * cstring_to_text_with_len
158  *
159  * Same as cstring_to_text except the caller specifies the string length;
160  * the string need not be null_terminated.
161  */
162 text *
cstring_to_text_with_len(const char * s,int len)163 cstring_to_text_with_len(const char *s, int len)
164 {
165 	text	   *result = (text *) palloc(len + VARHDRSZ);
166 
167 	SET_VARSIZE(result, len + VARHDRSZ);
168 	memcpy(VARDATA(result), s, len);
169 
170 	return result;
171 }
172 
173 /*
174  * text_to_cstring
175  *
176  * Create a palloc'd, null-terminated C string from a text value.
177  *
178  * We support being passed a compressed or toasted text value.
179  * This is a bit bogus since such values shouldn't really be referred to as
180  * "text *", but it seems useful for robustness.  If we didn't handle that
181  * case here, we'd need another routine that did, anyway.
182  */
183 char *
text_to_cstring(const text * t)184 text_to_cstring(const text *t)
185 {
186 	/* must cast away the const, unfortunately */
187 	text	   *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
188 	int			len = VARSIZE_ANY_EXHDR(tunpacked);
189 	char	   *result;
190 
191 	result = (char *) palloc(len + 1);
192 	memcpy(result, VARDATA_ANY(tunpacked), len);
193 	result[len] = '\0';
194 
195 	if (tunpacked != t)
196 		pfree(tunpacked);
197 
198 	return result;
199 }
200 
201 /*
202  * text_to_cstring_buffer
203  *
204  * Copy a text value into a caller-supplied buffer of size dst_len.
205  *
206  * The text string is truncated if necessary to fit.  The result is
207  * guaranteed null-terminated (unless dst_len == 0).
208  *
209  * We support being passed a compressed or toasted text value.
210  * This is a bit bogus since such values shouldn't really be referred to as
211  * "text *", but it seems useful for robustness.  If we didn't handle that
212  * case here, we'd need another routine that did, anyway.
213  */
214 void
text_to_cstring_buffer(const text * src,char * dst,size_t dst_len)215 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
216 {
217 	/* must cast away the const, unfortunately */
218 	text	   *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
219 	size_t		src_len = VARSIZE_ANY_EXHDR(srcunpacked);
220 
221 	if (dst_len > 0)
222 	{
223 		dst_len--;
224 		if (dst_len >= src_len)
225 			dst_len = src_len;
226 		else	/* ensure truncation is encoding-safe */
227 			dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
228 		memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
229 		dst[dst_len] = '\0';
230 	}
231 
232 	if (srcunpacked != src)
233 		pfree(srcunpacked);
234 }
235 
236 
237 /*****************************************************************************
238  *	 USER I/O ROUTINES														 *
239  *****************************************************************************/
240 
241 
/* Convert a digit character to its numeric value (VAL) and back (DIG) */
242 #define VAL(CH)			((CH) - '0')
243 #define DIG(VAL)		((VAL) + '0')
244 
245 /*
246  *		byteain			- converts from printable representation of byte array
247  *
248  *		Non-printable characters must be passed as '\nnn' (octal) and are
249  *		converted to internal form.  '\' must be passed as '\\'.
250  *		ereport(ERROR, ...) if bad form.
251  *
252  *		BUGS:
253  *				The input is scanned twice.
254  *				The error checking of input is minimal.
255  */
256 Datum
byteain(PG_FUNCTION_ARGS)257 byteain(PG_FUNCTION_ARGS)
258 {
259 	char	   *inputText = PG_GETARG_CSTRING(0);
260 	char	   *tp;
261 	char	   *rp;
262 	int			bc;
263 	bytea	   *result;
264 
265 	/* Recognize hex input */
266 	if (inputText[0] == '\\' && inputText[1] == 'x')
267 	{
268 		size_t		len = strlen(inputText);
269 
270 		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
271 		result = palloc(bc);
272 		bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
273 		SET_VARSIZE(result, bc + VARHDRSZ);		/* actual length */
274 
275 		PG_RETURN_BYTEA_P(result);
276 	}
277 
278 	/* Else, it's the traditional escaped style */
279 	for (bc = 0, tp = inputText; *tp != '\0'; bc++)
280 	{
281 		if (tp[0] != '\\')
282 			tp++;
283 		else if ((tp[0] == '\\') &&
284 				 (tp[1] >= '0' && tp[1] <= '3') &&
285 				 (tp[2] >= '0' && tp[2] <= '7') &&
286 				 (tp[3] >= '0' && tp[3] <= '7'))
287 			tp += 4;
288 		else if ((tp[0] == '\\') &&
289 				 (tp[1] == '\\'))
290 			tp += 2;
291 		else
292 		{
293 			/*
294 			 * one backslash, not followed by another or ### valid octal
295 			 */
296 			ereport(ERROR,
297 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
298 					 errmsg("invalid input syntax for type bytea")));
299 		}
300 	}
301 
302 	bc += VARHDRSZ;
303 
304 	result = (bytea *) palloc(bc);
305 	SET_VARSIZE(result, bc);
306 
307 	tp = inputText;
308 	rp = VARDATA(result);
309 	while (*tp != '\0')
310 	{
311 		if (tp[0] != '\\')
312 			*rp++ = *tp++;
313 		else if ((tp[0] == '\\') &&
314 				 (tp[1] >= '0' && tp[1] <= '3') &&
315 				 (tp[2] >= '0' && tp[2] <= '7') &&
316 				 (tp[3] >= '0' && tp[3] <= '7'))
317 		{
318 			bc = VAL(tp[1]);
319 			bc <<= 3;
320 			bc += VAL(tp[2]);
321 			bc <<= 3;
322 			*rp++ = bc + VAL(tp[3]);
323 
324 			tp += 4;
325 		}
326 		else if ((tp[0] == '\\') &&
327 				 (tp[1] == '\\'))
328 		{
329 			*rp++ = '\\';
330 			tp += 2;
331 		}
332 		else
333 		{
334 			/*
335 			 * We should never get here. The first pass should not allow it.
336 			 */
337 			ereport(ERROR,
338 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
339 					 errmsg("invalid input syntax for type bytea")));
340 		}
341 	}
342 
343 	PG_RETURN_BYTEA_P(result);
344 }
345 
346 /*
347  *		byteaout		- converts to printable representation of byte array
348  *
349  *		In the traditional escaped format, non-printable characters are
350  *		printed as '\nnn' (octal) and '\' as '\\'.
351  */
352 Datum
byteaout(PG_FUNCTION_ARGS)353 byteaout(PG_FUNCTION_ARGS)
354 {
355 	bytea	   *vlena = PG_GETARG_BYTEA_PP(0);
356 	char	   *result;
357 	char	   *rp;
358 
359 	if (bytea_output == BYTEA_OUTPUT_HEX)
360 	{
361 		/* Print hex format */
362 		rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
363 		*rp++ = '\\';
364 		*rp++ = 'x';
365 		rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
366 	}
367 	else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
368 	{
369 		/* Print traditional escaped format */
370 		char	   *vp;
371 		int			len;
372 		int			i;
373 
374 		len = 1;				/* empty string has 1 char */
375 		vp = VARDATA_ANY(vlena);
376 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
377 		{
378 			if (*vp == '\\')
379 				len += 2;
380 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
381 				len += 4;
382 			else
383 				len++;
384 		}
385 		rp = result = (char *) palloc(len);
386 		vp = VARDATA_ANY(vlena);
387 		for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
388 		{
389 			if (*vp == '\\')
390 			{
391 				*rp++ = '\\';
392 				*rp++ = '\\';
393 			}
394 			else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
395 			{
396 				int			val;	/* holds unprintable chars */
397 
398 				val = *vp;
399 				rp[0] = '\\';
400 				rp[3] = DIG(val & 07);
401 				val >>= 3;
402 				rp[2] = DIG(val & 07);
403 				val >>= 3;
404 				rp[1] = DIG(val & 03);
405 				rp += 4;
406 			}
407 			else
408 				*rp++ = *vp;
409 		}
410 	}
411 	else
412 	{
413 		elog(ERROR, "unrecognized bytea_output setting: %d",
414 			 bytea_output);
415 		rp = result = NULL;		/* keep compiler quiet */
416 	}
417 	*rp = '\0';
418 	PG_RETURN_CSTRING(result);
419 }
420 
421 /*
422  *		bytearecv			- converts external binary format to bytea
423  */
424 Datum
bytearecv(PG_FUNCTION_ARGS)425 bytearecv(PG_FUNCTION_ARGS)
426 {
427 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
428 	bytea	   *result;
429 	int			nbytes;
430 
431 	nbytes = buf->len - buf->cursor;
432 	result = (bytea *) palloc(nbytes + VARHDRSZ);
433 	SET_VARSIZE(result, nbytes + VARHDRSZ);
434 	pq_copymsgbytes(buf, VARDATA(result), nbytes);
435 	PG_RETURN_BYTEA_P(result);
436 }
437 
438 /*
439  *		byteasend			- converts bytea to binary format
440  *
441  * This is a special case: just copy the input...
442  */
443 Datum
byteasend(PG_FUNCTION_ARGS)444 byteasend(PG_FUNCTION_ARGS)
445 {
446 	bytea	   *vlena = PG_GETARG_BYTEA_P_COPY(0);
447 
448 	PG_RETURN_BYTEA_P(vlena);
449 }
450 
451 Datum
bytea_string_agg_transfn(PG_FUNCTION_ARGS)452 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
453 {
454 	StringInfo	state;
455 
456 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
457 
458 	/* Append the value unless null. */
459 	if (!PG_ARGISNULL(1))
460 	{
461 		bytea	   *value = PG_GETARG_BYTEA_PP(1);
462 
463 		/* On the first time through, we ignore the delimiter. */
464 		if (state == NULL)
465 			state = makeStringAggState(fcinfo);
466 		else if (!PG_ARGISNULL(2))
467 		{
468 			bytea	   *delim = PG_GETARG_BYTEA_PP(2);
469 
470 			appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
471 		}
472 
473 		appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
474 	}
475 
476 	/*
477 	 * The transition type for string_agg() is declared to be "internal",
478 	 * which is a pass-by-value type the same size as a pointer.
479 	 */
480 	PG_RETURN_POINTER(state);
481 }
482 
483 Datum
bytea_string_agg_finalfn(PG_FUNCTION_ARGS)484 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
485 {
486 	StringInfo	state;
487 
488 	/* cannot be called directly because of internal-type argument */
489 	Assert(AggCheckCallContext(fcinfo, NULL));
490 
491 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
492 
493 	if (state != NULL)
494 	{
495 		bytea	   *result;
496 
497 		result = (bytea *) palloc(state->len + VARHDRSZ);
498 		SET_VARSIZE(result, state->len + VARHDRSZ);
499 		memcpy(VARDATA(result), state->data, state->len);
500 		PG_RETURN_BYTEA_P(result);
501 	}
502 	else
503 		PG_RETURN_NULL();
504 }
505 
506 /*
507  *		textin			- converts "..." to internal representation
508  */
509 Datum
textin(PG_FUNCTION_ARGS)510 textin(PG_FUNCTION_ARGS)
511 {
512 	char	   *inputText = PG_GETARG_CSTRING(0);
513 
514 	PG_RETURN_TEXT_P(cstring_to_text(inputText));
515 }
516 
517 /*
518  *		textout			- converts internal representation to "..."
519  */
520 Datum
textout(PG_FUNCTION_ARGS)521 textout(PG_FUNCTION_ARGS)
522 {
523 	Datum		txt = PG_GETARG_DATUM(0);
524 
525 	PG_RETURN_CSTRING(TextDatumGetCString(txt));
526 }
527 
528 /*
529  *		textrecv			- converts external binary format to text
530  */
531 Datum
textrecv(PG_FUNCTION_ARGS)532 textrecv(PG_FUNCTION_ARGS)
533 {
534 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
535 	text	   *result;
536 	char	   *str;
537 	int			nbytes;
538 
539 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
540 
541 	result = cstring_to_text_with_len(str, nbytes);
542 	pfree(str);
543 	PG_RETURN_TEXT_P(result);
544 }
545 
546 /*
547  *		textsend			- converts text to binary format
548  */
549 Datum
textsend(PG_FUNCTION_ARGS)550 textsend(PG_FUNCTION_ARGS)
551 {
552 	text	   *t = PG_GETARG_TEXT_PP(0);
553 	StringInfoData buf;
554 
555 	pq_begintypsend(&buf);
556 	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
557 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
558 }
559 
560 
561 /*
562  *		unknownin			- converts "..." to internal representation
563  */
564 Datum
unknownin(PG_FUNCTION_ARGS)565 unknownin(PG_FUNCTION_ARGS)
566 {
567 	char	   *str = PG_GETARG_CSTRING(0);
568 
569 	/* representation is same as cstring */
570 	PG_RETURN_CSTRING(pstrdup(str));
571 }
572 
573 /*
574  *		unknownout			- converts internal representation to "..."
575  */
576 Datum
unknownout(PG_FUNCTION_ARGS)577 unknownout(PG_FUNCTION_ARGS)
578 {
579 	/* representation is same as cstring */
580 	char	   *str = PG_GETARG_CSTRING(0);
581 
582 	PG_RETURN_CSTRING(pstrdup(str));
583 }
584 
585 /*
586  *		unknownrecv			- converts external binary format to unknown
587  */
588 Datum
unknownrecv(PG_FUNCTION_ARGS)589 unknownrecv(PG_FUNCTION_ARGS)
590 {
591 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
592 	char	   *str;
593 	int			nbytes;
594 
595 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
596 	/* representation is same as cstring */
597 	PG_RETURN_CSTRING(str);
598 }
599 
600 /*
601  *		unknownsend			- converts unknown to binary format
602  */
603 Datum
unknownsend(PG_FUNCTION_ARGS)604 unknownsend(PG_FUNCTION_ARGS)
605 {
606 	/* representation is same as cstring */
607 	char	   *str = PG_GETARG_CSTRING(0);
608 	StringInfoData buf;
609 
610 	pq_begintypsend(&buf);
611 	pq_sendtext(&buf, str, strlen(str));
612 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
613 }
614 
615 
616 /* ========== PUBLIC ROUTINES ========== */
617 
618 /*
619  * textlen -
620  *	  returns the logical length of a text*
621  *	   (which is less than the VARSIZE of the text*)
622  */
623 Datum
textlen(PG_FUNCTION_ARGS)624 textlen(PG_FUNCTION_ARGS)
625 {
626 	Datum		str = PG_GETARG_DATUM(0);
627 
628 	/* try to avoid decompressing argument */
629 	PG_RETURN_INT32(text_length(str));
630 }
631 
632 /*
633  * text_length -
634  *	Does the real work for textlen()
635  *
636  *	This is broken out so it can be called directly by other string processing
637  *	functions.  Note that the argument is passed as a Datum, to indicate that
638  *	it may still be in compressed form.  We can avoid decompressing it at all
639  *	in some cases.
640  */
641 static int32
text_length(Datum str)642 text_length(Datum str)
643 {
644 	/* fastpath when max encoding length is one */
645 	if (pg_database_encoding_max_length() == 1)
646 		PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
647 	else
648 	{
649 		text	   *t = DatumGetTextPP(str);
650 
651 		PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
652 											 VARSIZE_ANY_EXHDR(t)));
653 	}
654 }
655 
656 /*
657  * textoctetlen -
658  *	  returns the physical length of a text*
659  *	   (which is less than the VARSIZE of the text*)
660  */
661 Datum
textoctetlen(PG_FUNCTION_ARGS)662 textoctetlen(PG_FUNCTION_ARGS)
663 {
664 	Datum		str = PG_GETARG_DATUM(0);
665 
666 	/* We need not detoast the input at all */
667 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
668 }
669 
670 /*
671  * textcat -
672  *	  takes two text* and returns a text* that is the concatenation of
673  *	  the two.
674  *
675  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
676  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
677  * Allocate space for output in all cases.
678  * XXX - thomas 1997-07-10
679  */
680 Datum
textcat(PG_FUNCTION_ARGS)681 textcat(PG_FUNCTION_ARGS)
682 {
683 	text	   *t1 = PG_GETARG_TEXT_PP(0);
684 	text	   *t2 = PG_GETARG_TEXT_PP(1);
685 
686 	PG_RETURN_TEXT_P(text_catenate(t1, t2));
687 }
688 
689 /*
690  * text_catenate
691  *	Guts of textcat(), broken out so it can be used by other functions
692  *
693  * Arguments can be in short-header form, but not compressed or out-of-line
694  */
695 static text *
text_catenate(text * t1,text * t2)696 text_catenate(text *t1, text *t2)
697 {
698 	text	   *result;
699 	int			len1,
700 				len2,
701 				len;
702 	char	   *ptr;
703 
704 	len1 = VARSIZE_ANY_EXHDR(t1);
705 	len2 = VARSIZE_ANY_EXHDR(t2);
706 
707 	/* paranoia ... probably should throw error instead? */
708 	if (len1 < 0)
709 		len1 = 0;
710 	if (len2 < 0)
711 		len2 = 0;
712 
713 	len = len1 + len2 + VARHDRSZ;
714 	result = (text *) palloc(len);
715 
716 	/* Set size of result string... */
717 	SET_VARSIZE(result, len);
718 
719 	/* Fill data field of result string... */
720 	ptr = VARDATA(result);
721 	if (len1 > 0)
722 		memcpy(ptr, VARDATA_ANY(t1), len1);
723 	if (len2 > 0)
724 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
725 
726 	return result;
727 }
728 
/*
 * charlen_to_bytelen()
 *	Return how many bytes the first n characters at *p occupy
 *
 * The caller must guarantee that n characters are really there; no
 * terminating null is required.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;

	/* in a single-byte encoding every character is exactly one byte */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cur += pg_mblen(cur);
		n--;
	}

	return cur - p;
}
754 
755 /*
756  * text_substr()
757  * Return a substring starting at the specified position.
758  * - thomas 1997-12-31
759  *
760  * Input:
761  *	- string
762  *	- starting position (is one-based)
763  *	- string length
764  *
765  * If the starting position is zero or less, then return from the start of the string
766  *	adjusting the length to be consistent with the "negative start" per SQL.
767  * If the length is less than zero, return the remaining string.
768  *
769  * Added multibyte support.
770  * - Tatsuo Ishii 1998-4-21
771  * Changed behavior if starting position is less than one to conform to SQL behavior.
772  * Formerly returned the entire string; now returns a portion.
773  * - Thomas Lockhart 1998-12-10
774  * Now uses faster TOAST-slicing interface
775  * - John Gray 2002-02-22
776  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
777  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
778  * error; if E < 1, return '', not entire string). Fixed MB related bug when
779  * S > LC and < LC + 4 sometimes garbage characters are returned.
780  * - Joe Conway 2002-08-10
781  */
782 Datum
text_substr(PG_FUNCTION_ARGS)783 text_substr(PG_FUNCTION_ARGS)
784 {
785 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
786 									PG_GETARG_INT32(1),
787 									PG_GETARG_INT32(2),
788 									false));
789 }
790 
791 /*
792  * text_substr_no_len -
793  *	  Wrapper to avoid opr_sanity failure due to
794  *	  one function accepting a different number of args.
795  */
796 Datum
text_substr_no_len(PG_FUNCTION_ARGS)797 text_substr_no_len(PG_FUNCTION_ARGS)
798 {
799 	PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
800 									PG_GETARG_INT32(1),
801 									-1, true));
802 }
803 
804 /*
805  * text_substring -
806  *	Does the real work for text_substr() and text_substr_no_len()
807  *
808  *	This is broken out so it can be called directly by other string processing
809  *	functions.  Note that the argument is passed as a Datum, to indicate that
810  *	it may still be in compressed/toasted form.  We can avoid detoasting all
811  *	of it in some cases.
812  *
813  *	The result is always a freshly palloc'd datum.
814  */
815 static text *
text_substring(Datum str,int32 start,int32 length,bool length_not_specified)816 text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
817 {
818 	int32		eml = pg_database_encoding_max_length();
819 	int32		S = start;		/* start position */
820 	int32		S1;				/* adjusted start position */
821 	int32		L1;				/* adjusted substring length */
822 
823 	/* life is easy if the encoding max length is 1 */
824 	if (eml == 1)
825 	{
826 		S1 = Max(S, 1);
827 
828 		if (length_not_specified)		/* special case - get length to end of
829 										 * string */
830 			L1 = -1;
831 		else
832 		{
833 			/* end position */
834 			int			E = S + length;
835 
836 			/*
837 			 * A negative value for L is the only way for the end position to
838 			 * be before the start. SQL99 says to throw an error.
839 			 */
840 			if (E < S)
841 				ereport(ERROR,
842 						(errcode(ERRCODE_SUBSTRING_ERROR),
843 						 errmsg("negative substring length not allowed")));
844 
845 			/*
846 			 * A zero or negative value for the end position can happen if the
847 			 * start was negative or one. SQL99 says to return a zero-length
848 			 * string.
849 			 */
850 			if (E < 1)
851 				return cstring_to_text("");
852 
853 			L1 = E - S1;
854 		}
855 
856 		/*
857 		 * If the start position is past the end of the string, SQL99 says to
858 		 * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
859 		 * that for us. Convert to zero-based starting position
860 		 */
861 		return DatumGetTextPSlice(str, S1 - 1, L1);
862 	}
863 	else if (eml > 1)
864 	{
865 		/*
866 		 * When encoding max length is > 1, we can't get LC without
867 		 * detoasting, so we'll grab a conservatively large slice now and go
868 		 * back later to do the right thing
869 		 */
870 		int32		slice_start;
871 		int32		slice_size;
872 		int32		slice_strlen;
873 		text	   *slice;
874 		int32		E1;
875 		int32		i;
876 		char	   *p;
877 		char	   *s;
878 		text	   *ret;
879 
880 		/*
881 		 * if S is past the end of the string, the tuple toaster will return a
882 		 * zero-length string to us
883 		 */
884 		S1 = Max(S, 1);
885 
886 		/*
887 		 * We need to start at position zero because there is no way to know
888 		 * in advance which byte offset corresponds to the supplied start
889 		 * position.
890 		 */
891 		slice_start = 0;
892 
893 		if (length_not_specified)		/* special case - get length to end of
894 										 * string */
895 			slice_size = L1 = -1;
896 		else
897 		{
898 			int			E = S + length;
899 
900 			/*
901 			 * A negative value for L is the only way for the end position to
902 			 * be before the start. SQL99 says to throw an error.
903 			 */
904 			if (E < S)
905 				ereport(ERROR,
906 						(errcode(ERRCODE_SUBSTRING_ERROR),
907 						 errmsg("negative substring length not allowed")));
908 
909 			/*
910 			 * A zero or negative value for the end position can happen if the
911 			 * start was negative or one. SQL99 says to return a zero-length
912 			 * string.
913 			 */
914 			if (E < 1)
915 				return cstring_to_text("");
916 
917 			/*
918 			 * if E is past the end of the string, the tuple toaster will
919 			 * truncate the length for us
920 			 */
921 			L1 = E - S1;
922 
923 			/*
924 			 * Total slice size in bytes can't be any longer than the start
925 			 * position plus substring length times the encoding max length.
926 			 */
927 			slice_size = (S1 + L1) * eml;
928 		}
929 
930 		/*
931 		 * If we're working with an untoasted source, no need to do an extra
932 		 * copying step.
933 		 */
934 		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
935 			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
936 			slice = DatumGetTextPSlice(str, slice_start, slice_size);
937 		else
938 			slice = (text *) DatumGetPointer(str);
939 
940 		/* see if we got back an empty string */
941 		if (VARSIZE_ANY_EXHDR(slice) == 0)
942 		{
943 			if (slice != (text *) DatumGetPointer(str))
944 				pfree(slice);
945 			return cstring_to_text("");
946 		}
947 
948 		/* Now we can get the actual length of the slice in MB characters */
949 		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
950 											VARSIZE_ANY_EXHDR(slice));
951 
952 		/*
953 		 * Check that the start position wasn't > slice_strlen. If so, SQL99
954 		 * says to return a zero-length string.
955 		 */
956 		if (S1 > slice_strlen)
957 		{
958 			if (slice != (text *) DatumGetPointer(str))
959 				pfree(slice);
960 			return cstring_to_text("");
961 		}
962 
963 		/*
964 		 * Adjust L1 and E1 now that we know the slice string length. Again
965 		 * remember that S1 is one based, and slice_start is zero based.
966 		 */
967 		if (L1 > -1)
968 			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
969 		else
970 			E1 = slice_start + 1 + slice_strlen;
971 
972 		/*
973 		 * Find the start position in the slice; remember S1 is not zero based
974 		 */
975 		p = VARDATA_ANY(slice);
976 		for (i = 0; i < S1 - 1; i++)
977 			p += pg_mblen(p);
978 
979 		/* hang onto a pointer to our start position */
980 		s = p;
981 
982 		/*
983 		 * Count the actual bytes used by the substring of the requested
984 		 * length.
985 		 */
986 		for (i = S1; i < E1; i++)
987 			p += pg_mblen(p);
988 
989 		ret = (text *) palloc(VARHDRSZ + (p - s));
990 		SET_VARSIZE(ret, VARHDRSZ + (p - s));
991 		memcpy(VARDATA(ret), s, (p - s));
992 
993 		if (slice != (text *) DatumGetPointer(str))
994 			pfree(slice);
995 
996 		return ret;
997 	}
998 	else
999 		elog(ERROR, "invalid backend encoding: encoding max length < 1");
1000 
1001 	/* not reached: suppress compiler warning */
1002 	return NULL;
1003 }
1004 
1005 /*
1006  * textoverlay
1007  *	Replace specified substring of first string with second
1008  *
1009  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1010  * This code is a direct implementation of what the standard says.
1011  */
1012 Datum
textoverlay(PG_FUNCTION_ARGS)1013 textoverlay(PG_FUNCTION_ARGS)
1014 {
1015 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1016 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1017 	int			sp = PG_GETARG_INT32(2);		/* substring start position */
1018 	int			sl = PG_GETARG_INT32(3);		/* substring length */
1019 
1020 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1021 }
1022 
1023 Datum
textoverlay_no_len(PG_FUNCTION_ARGS)1024 textoverlay_no_len(PG_FUNCTION_ARGS)
1025 {
1026 	text	   *t1 = PG_GETARG_TEXT_PP(0);
1027 	text	   *t2 = PG_GETARG_TEXT_PP(1);
1028 	int			sp = PG_GETARG_INT32(2);		/* substring start position */
1029 	int			sl;
1030 
1031 	sl = text_length(PointerGetDatum(t2));		/* defaults to length(t2) */
1032 	PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1033 }
1034 
1035 static text *
text_overlay(text * t1,text * t2,int sp,int sl)1036 text_overlay(text *t1, text *t2, int sp, int sl)
1037 {
1038 	text	   *result;
1039 	text	   *s1;
1040 	text	   *s2;
1041 	int			sp_pl_sl;
1042 
1043 	/*
1044 	 * Check for possible integer-overflow cases.  For negative sp, throw a
1045 	 * "substring length" error because that's what should be expected
1046 	 * according to the spec's definition of OVERLAY().
1047 	 */
1048 	if (sp <= 0)
1049 		ereport(ERROR,
1050 				(errcode(ERRCODE_SUBSTRING_ERROR),
1051 				 errmsg("negative substring length not allowed")));
1052 	sp_pl_sl = sp + sl;
1053 	if (sp_pl_sl <= sl)
1054 		ereport(ERROR,
1055 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1056 				 errmsg("integer out of range")));
1057 
1058 	s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1059 	s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1060 	result = text_catenate(s1, t2);
1061 	result = text_catenate(result, s2);
1062 
1063 	return result;
1064 }
1065 
1066 /*
1067  * textpos -
1068  *	  Return the position of the specified substring.
1069  *	  Implements the SQL POSITION() function.
1070  *	  Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1071  * - thomas 1997-07-27
1072  */
1073 Datum
textpos(PG_FUNCTION_ARGS)1074 textpos(PG_FUNCTION_ARGS)
1075 {
1076 	text	   *str = PG_GETARG_TEXT_PP(0);
1077 	text	   *search_str = PG_GETARG_TEXT_PP(1);
1078 
1079 	PG_RETURN_INT32((int32) text_position(str, search_str));
1080 }
1081 
1082 /*
1083  * text_position -
1084  *	Does the real work for textpos()
1085  *
1086  * Inputs:
1087  *		t1 - string to be searched
1088  *		t2 - pattern to match within t1
1089  * Result:
1090  *		Character index of the first matched char, starting from 1,
1091  *		or 0 if no match.
1092  *
1093  *	This is broken out so it can be called directly by other string processing
1094  *	functions.
1095  */
1096 static int
text_position(text * t1,text * t2)1097 text_position(text *t1, text *t2)
1098 {
1099 	TextPositionState state;
1100 	int			result;
1101 
1102 	text_position_setup(t1, t2, &state);
1103 	result = text_position_next(1, &state);
1104 	text_position_cleanup(&state);
1105 	return result;
1106 }
1107 
1108 
1109 /*
1110  * text_position_setup, text_position_next, text_position_cleanup -
1111  *	Component steps of text_position()
1112  *
1113  * These are broken out so that a string can be efficiently searched for
1114  * multiple occurrences of the same pattern.  text_position_next may be
1115  * called multiple times with increasing values of start_pos, which is
1116  * the 1-based character position to start the search from.  The "state"
1117  * variable is normally just a local variable in the caller.
1118  */
1119 
1120 static void
text_position_setup(text * t1,text * t2,TextPositionState * state)1121 text_position_setup(text *t1, text *t2, TextPositionState *state)
1122 {
1123 	int			len1 = VARSIZE_ANY_EXHDR(t1);
1124 	int			len2 = VARSIZE_ANY_EXHDR(t2);
1125 
1126 	if (pg_database_encoding_max_length() == 1)
1127 	{
1128 		/* simple case - single byte encoding */
1129 		state->use_wchar = false;
1130 		state->str1 = VARDATA_ANY(t1);
1131 		state->str2 = VARDATA_ANY(t2);
1132 		state->len1 = len1;
1133 		state->len2 = len2;
1134 	}
1135 	else
1136 	{
1137 		/* not as simple - multibyte encoding */
1138 		pg_wchar   *p1,
1139 				   *p2;
1140 
1141 		p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1142 		len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1143 		p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1144 		len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1145 
1146 		state->use_wchar = true;
1147 		state->wstr1 = p1;
1148 		state->wstr2 = p2;
1149 		state->len1 = len1;
1150 		state->len2 = len2;
1151 	}
1152 
1153 	/*
1154 	 * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1155 	 * notes we use the terminology that the "haystack" is the string to be
1156 	 * searched (t1) and the "needle" is the pattern being sought (t2).
1157 	 *
1158 	 * If the needle is empty or bigger than the haystack then there is no
1159 	 * point in wasting cycles initializing the table.  We also choose not to
1160 	 * use B-M-H for needles of length 1, since the skip table can't possibly
1161 	 * save anything in that case.
1162 	 */
1163 	if (len1 >= len2 && len2 > 1)
1164 	{
1165 		int			searchlength = len1 - len2;
1166 		int			skiptablemask;
1167 		int			last;
1168 		int			i;
1169 
1170 		/*
1171 		 * First we must determine how much of the skip table to use.  The
1172 		 * declaration of TextPositionState allows up to 256 elements, but for
1173 		 * short search problems we don't really want to have to initialize so
1174 		 * many elements --- it would take too long in comparison to the
1175 		 * actual search time.  So we choose a useful skip table size based on
1176 		 * the haystack length minus the needle length.  The closer the needle
1177 		 * length is to the haystack length the less useful skipping becomes.
1178 		 *
1179 		 * Note: since we use bit-masking to select table elements, the skip
1180 		 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1181 		 */
1182 		if (searchlength < 16)
1183 			skiptablemask = 3;
1184 		else if (searchlength < 64)
1185 			skiptablemask = 7;
1186 		else if (searchlength < 128)
1187 			skiptablemask = 15;
1188 		else if (searchlength < 512)
1189 			skiptablemask = 31;
1190 		else if (searchlength < 2048)
1191 			skiptablemask = 63;
1192 		else if (searchlength < 4096)
1193 			skiptablemask = 127;
1194 		else
1195 			skiptablemask = 255;
1196 		state->skiptablemask = skiptablemask;
1197 
1198 		/*
1199 		 * Initialize the skip table.  We set all elements to the needle
1200 		 * length, since this is the correct skip distance for any character
1201 		 * not found in the needle.
1202 		 */
1203 		for (i = 0; i <= skiptablemask; i++)
1204 			state->skiptable[i] = len2;
1205 
1206 		/*
1207 		 * Now examine the needle.  For each character except the last one,
1208 		 * set the corresponding table element to the appropriate skip
1209 		 * distance.  Note that when two characters share the same skip table
1210 		 * entry, the one later in the needle must determine the skip
1211 		 * distance.
1212 		 */
1213 		last = len2 - 1;
1214 
1215 		if (!state->use_wchar)
1216 		{
1217 			const char *str2 = state->str2;
1218 
1219 			for (i = 0; i < last; i++)
1220 				state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1221 		}
1222 		else
1223 		{
1224 			const pg_wchar *wstr2 = state->wstr2;
1225 
1226 			for (i = 0; i < last; i++)
1227 				state->skiptable[wstr2[i] & skiptablemask] = last - i;
1228 		}
1229 	}
1230 }
1231 
1232 static int
text_position_next(int start_pos,TextPositionState * state)1233 text_position_next(int start_pos, TextPositionState *state)
1234 {
1235 	int			haystack_len = state->len1;
1236 	int			needle_len = state->len2;
1237 	int			skiptablemask = state->skiptablemask;
1238 
1239 	Assert(start_pos > 0);		/* else caller error */
1240 
1241 	if (needle_len <= 0)
1242 		return start_pos;		/* result for empty pattern */
1243 
1244 	start_pos--;				/* adjust for zero based arrays */
1245 
1246 	/* Done if the needle can't possibly fit */
1247 	if (haystack_len < start_pos + needle_len)
1248 		return 0;
1249 
1250 	if (!state->use_wchar)
1251 	{
1252 		/* simple case - single byte encoding */
1253 		const char *haystack = state->str1;
1254 		const char *needle = state->str2;
1255 		const char *haystack_end = &haystack[haystack_len];
1256 		const char *hptr;
1257 
1258 		if (needle_len == 1)
1259 		{
1260 			/* No point in using B-M-H for a one-character needle */
1261 			char		nchar = *needle;
1262 
1263 			hptr = &haystack[start_pos];
1264 			while (hptr < haystack_end)
1265 			{
1266 				if (*hptr == nchar)
1267 					return hptr - haystack + 1;
1268 				hptr++;
1269 			}
1270 		}
1271 		else
1272 		{
1273 			const char *needle_last = &needle[needle_len - 1];
1274 
1275 			/* Start at startpos plus the length of the needle */
1276 			hptr = &haystack[start_pos + needle_len - 1];
1277 			while (hptr < haystack_end)
1278 			{
1279 				/* Match the needle scanning *backward* */
1280 				const char *nptr;
1281 				const char *p;
1282 
1283 				nptr = needle_last;
1284 				p = hptr;
1285 				while (*nptr == *p)
1286 				{
1287 					/* Matched it all?	If so, return 1-based position */
1288 					if (nptr == needle)
1289 						return p - haystack + 1;
1290 					nptr--, p--;
1291 				}
1292 
1293 				/*
1294 				 * No match, so use the haystack char at hptr to decide how
1295 				 * far to advance.  If the needle had any occurrence of that
1296 				 * character (or more precisely, one sharing the same
1297 				 * skiptable entry) before its last character, then we advance
1298 				 * far enough to align the last such needle character with
1299 				 * that haystack position.  Otherwise we can advance by the
1300 				 * whole needle length.
1301 				 */
1302 				hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1303 			}
1304 		}
1305 	}
1306 	else
1307 	{
1308 		/* The multibyte char version. This works exactly the same way. */
1309 		const pg_wchar *haystack = state->wstr1;
1310 		const pg_wchar *needle = state->wstr2;
1311 		const pg_wchar *haystack_end = &haystack[haystack_len];
1312 		const pg_wchar *hptr;
1313 
1314 		if (needle_len == 1)
1315 		{
1316 			/* No point in using B-M-H for a one-character needle */
1317 			pg_wchar	nchar = *needle;
1318 
1319 			hptr = &haystack[start_pos];
1320 			while (hptr < haystack_end)
1321 			{
1322 				if (*hptr == nchar)
1323 					return hptr - haystack + 1;
1324 				hptr++;
1325 			}
1326 		}
1327 		else
1328 		{
1329 			const pg_wchar *needle_last = &needle[needle_len - 1];
1330 
1331 			/* Start at startpos plus the length of the needle */
1332 			hptr = &haystack[start_pos + needle_len - 1];
1333 			while (hptr < haystack_end)
1334 			{
1335 				/* Match the needle scanning *backward* */
1336 				const pg_wchar *nptr;
1337 				const pg_wchar *p;
1338 
1339 				nptr = needle_last;
1340 				p = hptr;
1341 				while (*nptr == *p)
1342 				{
1343 					/* Matched it all?	If so, return 1-based position */
1344 					if (nptr == needle)
1345 						return p - haystack + 1;
1346 					nptr--, p--;
1347 				}
1348 
1349 				/*
1350 				 * No match, so use the haystack char at hptr to decide how
1351 				 * far to advance.  If the needle had any occurrence of that
1352 				 * character (or more precisely, one sharing the same
1353 				 * skiptable entry) before its last character, then we advance
1354 				 * far enough to align the last such needle character with
1355 				 * that haystack position.  Otherwise we can advance by the
1356 				 * whole needle length.
1357 				 */
1358 				hptr += state->skiptable[*hptr & skiptablemask];
1359 			}
1360 		}
1361 	}
1362 
1363 	return 0;					/* not found */
1364 }
1365 
1366 static void
text_position_cleanup(TextPositionState * state)1367 text_position_cleanup(TextPositionState *state)
1368 {
1369 	if (state->use_wchar)
1370 	{
1371 		pfree(state->wstr1);
1372 		pfree(state->wstr2);
1373 	}
1374 }
1375 
1376 /* varstr_cmp()
1377  * Comparison function for text strings with given lengths.
1378  * Includes locale support, but must copy strings to temporary memory
1379  *	to allow null-termination for inputs to strcoll().
1380  * Returns an integer less than, equal to, or greater than zero, indicating
1381  * whether arg1 is less than, equal to, or greater than arg2.
1382  */
1383 int
varstr_cmp(char * arg1,int len1,char * arg2,int len2,Oid collid)1384 varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
1385 {
1386 	int			result;
1387 
1388 	/*
1389 	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
1390 	 * have to do some memory copying.  This turns out to be significantly
1391 	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
1392 	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
1393 	 */
1394 	if (lc_collate_is_c(collid))
1395 	{
1396 		result = memcmp(arg1, arg2, Min(len1, len2));
1397 		if ((result == 0) && (len1 != len2))
1398 			result = (len1 < len2) ? -1 : 1;
1399 	}
1400 	else
1401 	{
1402 		char		a1buf[TEXTBUFLEN];
1403 		char		a2buf[TEXTBUFLEN];
1404 		char	   *a1p,
1405 				   *a2p;
1406 
1407 #ifdef HAVE_LOCALE_T
1408 		pg_locale_t mylocale = 0;
1409 #endif
1410 
1411 		if (collid != DEFAULT_COLLATION_OID)
1412 		{
1413 			if (!OidIsValid(collid))
1414 			{
1415 				/*
1416 				 * This typically means that the parser could not resolve a
1417 				 * conflict of implicit collations, so report it that way.
1418 				 */
1419 				ereport(ERROR,
1420 						(errcode(ERRCODE_INDETERMINATE_COLLATION),
1421 						 errmsg("could not determine which collation to use for string comparison"),
1422 						 errhint("Use the COLLATE clause to set the collation explicitly.")));
1423 			}
1424 #ifdef HAVE_LOCALE_T
1425 			mylocale = pg_newlocale_from_collation(collid);
1426 #endif
1427 		}
1428 
1429 		/*
1430 		 * memcmp() can't tell us which of two unequal strings sorts first,
1431 		 * but it's a cheap way to tell if they're equal.  Testing shows that
1432 		 * memcmp() followed by strcoll() is only trivially slower than
1433 		 * strcoll() by itself, so we don't lose much if this doesn't work out
1434 		 * very often, and if it does - for example, because there are many
1435 		 * equal strings in the input - then we win big by avoiding expensive
1436 		 * collation-aware comparisons.
1437 		 */
1438 		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1439 			return 0;
1440 
1441 #ifdef WIN32
1442 		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
1443 		if (GetDatabaseEncoding() == PG_UTF8)
1444 		{
1445 			int			a1len;
1446 			int			a2len;
1447 			int			r;
1448 
1449 			if (len1 >= TEXTBUFLEN / 2)
1450 			{
1451 				a1len = len1 * 2 + 2;
1452 				a1p = palloc(a1len);
1453 			}
1454 			else
1455 			{
1456 				a1len = TEXTBUFLEN;
1457 				a1p = a1buf;
1458 			}
1459 			if (len2 >= TEXTBUFLEN / 2)
1460 			{
1461 				a2len = len2 * 2 + 2;
1462 				a2p = palloc(a2len);
1463 			}
1464 			else
1465 			{
1466 				a2len = TEXTBUFLEN;
1467 				a2p = a2buf;
1468 			}
1469 
1470 			/* stupid Microsloth API does not work for zero-length input */
1471 			if (len1 == 0)
1472 				r = 0;
1473 			else
1474 			{
1475 				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1476 										(LPWSTR) a1p, a1len / 2);
1477 				if (!r)
1478 					ereport(ERROR,
1479 							(errmsg("could not convert string to UTF-16: error code %lu",
1480 									GetLastError())));
1481 			}
1482 			((LPWSTR) a1p)[r] = 0;
1483 
1484 			if (len2 == 0)
1485 				r = 0;
1486 			else
1487 			{
1488 				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1489 										(LPWSTR) a2p, a2len / 2);
1490 				if (!r)
1491 					ereport(ERROR,
1492 							(errmsg("could not convert string to UTF-16: error code %lu",
1493 									GetLastError())));
1494 			}
1495 			((LPWSTR) a2p)[r] = 0;
1496 
1497 			errno = 0;
1498 #ifdef HAVE_LOCALE_T
1499 			if (mylocale)
1500 				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale);
1501 			else
1502 #endif
1503 				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1504 			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
1505 										 * headers */
1506 				ereport(ERROR,
1507 						(errmsg("could not compare Unicode strings: %m")));
1508 
1509 			/*
1510 			 * In some locales wcscoll() can claim that nonidentical strings
1511 			 * are equal.  Believing that would be bad news for a number of
1512 			 * reasons, so we follow Perl's lead and sort "equal" strings
1513 			 * according to strcmp (on the UTF-8 representation).
1514 			 */
1515 			if (result == 0)
1516 			{
1517 				result = memcmp(arg1, arg2, Min(len1, len2));
1518 				if ((result == 0) && (len1 != len2))
1519 					result = (len1 < len2) ? -1 : 1;
1520 			}
1521 
1522 			if (a1p != a1buf)
1523 				pfree(a1p);
1524 			if (a2p != a2buf)
1525 				pfree(a2p);
1526 
1527 			return result;
1528 		}
1529 #endif   /* WIN32 */
1530 
1531 		if (len1 >= TEXTBUFLEN)
1532 			a1p = (char *) palloc(len1 + 1);
1533 		else
1534 			a1p = a1buf;
1535 		if (len2 >= TEXTBUFLEN)
1536 			a2p = (char *) palloc(len2 + 1);
1537 		else
1538 			a2p = a2buf;
1539 
1540 		memcpy(a1p, arg1, len1);
1541 		a1p[len1] = '\0';
1542 		memcpy(a2p, arg2, len2);
1543 		a2p[len2] = '\0';
1544 
1545 #ifdef HAVE_LOCALE_T
1546 		if (mylocale)
1547 			result = strcoll_l(a1p, a2p, mylocale);
1548 		else
1549 #endif
1550 			result = strcoll(a1p, a2p);
1551 
1552 		/*
1553 		 * In some locales strcoll() can claim that nonidentical strings are
1554 		 * equal.  Believing that would be bad news for a number of reasons,
1555 		 * so we follow Perl's lead and sort "equal" strings according to
1556 		 * strcmp().
1557 		 */
1558 		if (result == 0)
1559 			result = strcmp(a1p, a2p);
1560 
1561 		if (a1p != a1buf)
1562 			pfree(a1p);
1563 		if (a2p != a2buf)
1564 			pfree(a2p);
1565 	}
1566 
1567 	return result;
1568 }
1569 
1570 /* text_cmp()
1571  * Internal comparison function for text strings.
1572  * Returns -1, 0 or 1
1573  */
1574 static int
text_cmp(text * arg1,text * arg2,Oid collid)1575 text_cmp(text *arg1, text *arg2, Oid collid)
1576 {
1577 	char	   *a1p,
1578 			   *a2p;
1579 	int			len1,
1580 				len2;
1581 
1582 	a1p = VARDATA_ANY(arg1);
1583 	a2p = VARDATA_ANY(arg2);
1584 
1585 	len1 = VARSIZE_ANY_EXHDR(arg1);
1586 	len2 = VARSIZE_ANY_EXHDR(arg2);
1587 
1588 	return varstr_cmp(a1p, len1, a2p, len2, collid);
1589 }
1590 
1591 /*
1592  * Comparison functions for text strings.
1593  *
1594  * Note: btree indexes need these routines not to leak memory; therefore,
1595  * be careful to free working copies of toasted datums.  Most places don't
1596  * need to be so careful.
1597  */
1598 
1599 Datum
texteq(PG_FUNCTION_ARGS)1600 texteq(PG_FUNCTION_ARGS)
1601 {
1602 	Datum		arg1 = PG_GETARG_DATUM(0);
1603 	Datum		arg2 = PG_GETARG_DATUM(1);
1604 	bool		result;
1605 	Size		len1,
1606 				len2;
1607 
1608 	/*
1609 	 * Since we only care about equality or not-equality, we can avoid all the
1610 	 * expense of strcoll() here, and just do bitwise comparison.  In fact, we
1611 	 * don't even have to do a bitwise comparison if we can show the lengths
1612 	 * of the strings are unequal; which might save us from having to detoast
1613 	 * one or both values.
1614 	 */
1615 	len1 = toast_raw_datum_size(arg1);
1616 	len2 = toast_raw_datum_size(arg2);
1617 	if (len1 != len2)
1618 		result = false;
1619 	else
1620 	{
1621 		text	   *targ1 = DatumGetTextPP(arg1);
1622 		text	   *targ2 = DatumGetTextPP(arg2);
1623 
1624 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1625 						 len1 - VARHDRSZ) == 0);
1626 
1627 		PG_FREE_IF_COPY(targ1, 0);
1628 		PG_FREE_IF_COPY(targ2, 1);
1629 	}
1630 
1631 	PG_RETURN_BOOL(result);
1632 }
1633 
1634 Datum
textne(PG_FUNCTION_ARGS)1635 textne(PG_FUNCTION_ARGS)
1636 {
1637 	Datum		arg1 = PG_GETARG_DATUM(0);
1638 	Datum		arg2 = PG_GETARG_DATUM(1);
1639 	bool		result;
1640 	Size		len1,
1641 				len2;
1642 
1643 	/* See comment in texteq() */
1644 	len1 = toast_raw_datum_size(arg1);
1645 	len2 = toast_raw_datum_size(arg2);
1646 	if (len1 != len2)
1647 		result = true;
1648 	else
1649 	{
1650 		text	   *targ1 = DatumGetTextPP(arg1);
1651 		text	   *targ2 = DatumGetTextPP(arg2);
1652 
1653 		result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1654 						 len1 - VARHDRSZ) != 0);
1655 
1656 		PG_FREE_IF_COPY(targ1, 0);
1657 		PG_FREE_IF_COPY(targ2, 1);
1658 	}
1659 
1660 	PG_RETURN_BOOL(result);
1661 }
1662 
1663 Datum
text_lt(PG_FUNCTION_ARGS)1664 text_lt(PG_FUNCTION_ARGS)
1665 {
1666 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1667 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1668 	bool		result;
1669 
1670 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1671 
1672 	PG_FREE_IF_COPY(arg1, 0);
1673 	PG_FREE_IF_COPY(arg2, 1);
1674 
1675 	PG_RETURN_BOOL(result);
1676 }
1677 
1678 Datum
text_le(PG_FUNCTION_ARGS)1679 text_le(PG_FUNCTION_ARGS)
1680 {
1681 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1682 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1683 	bool		result;
1684 
1685 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1686 
1687 	PG_FREE_IF_COPY(arg1, 0);
1688 	PG_FREE_IF_COPY(arg2, 1);
1689 
1690 	PG_RETURN_BOOL(result);
1691 }
1692 
1693 Datum
text_gt(PG_FUNCTION_ARGS)1694 text_gt(PG_FUNCTION_ARGS)
1695 {
1696 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1697 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1698 	bool		result;
1699 
1700 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1701 
1702 	PG_FREE_IF_COPY(arg1, 0);
1703 	PG_FREE_IF_COPY(arg2, 1);
1704 
1705 	PG_RETURN_BOOL(result);
1706 }
1707 
1708 Datum
text_ge(PG_FUNCTION_ARGS)1709 text_ge(PG_FUNCTION_ARGS)
1710 {
1711 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1712 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1713 	bool		result;
1714 
1715 	result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1716 
1717 	PG_FREE_IF_COPY(arg1, 0);
1718 	PG_FREE_IF_COPY(arg2, 1);
1719 
1720 	PG_RETURN_BOOL(result);
1721 }
1722 
1723 Datum
bttextcmp(PG_FUNCTION_ARGS)1724 bttextcmp(PG_FUNCTION_ARGS)
1725 {
1726 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
1727 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
1728 	int32		result;
1729 
1730 	result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1731 
1732 	PG_FREE_IF_COPY(arg1, 0);
1733 	PG_FREE_IF_COPY(arg2, 1);
1734 
1735 	PG_RETURN_INT32(result);
1736 }
1737 
1738 Datum
bttextsortsupport(PG_FUNCTION_ARGS)1739 bttextsortsupport(PG_FUNCTION_ARGS)
1740 {
1741 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1742 	Oid			collid = ssup->ssup_collation;
1743 	MemoryContext oldcontext;
1744 
1745 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1746 
1747 	/* Use generic string SortSupport */
1748 	varstr_sortsupport(ssup, collid, false);
1749 
1750 	MemoryContextSwitchTo(oldcontext);
1751 
1752 	PG_RETURN_VOID();
1753 }
1754 
1755 /*
1756  * Generic sortsupport interface for character type's operator classes.
1757  * Includes locale support, and support for BpChar semantics (i.e. removing
1758  * trailing spaces before comparison).
1759  *
1760  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1761  * same representation.  Callers that always use the C collation (e.g.
1762  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1763  * this will not work with any other collation, though.
1764  */
1765 void
varstr_sortsupport(SortSupport ssup,Oid collid,bool bpchar)1766 varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
1767 {
1768 	bool		abbreviate = ssup->abbreviate;
1769 	bool		collate_c = false;
1770 	VarStringSortSupport *sss;
1771 
1772 #ifdef HAVE_LOCALE_T
1773 	pg_locale_t locale = 0;
1774 #endif
1775 
1776 	/*
1777 	 * If possible, set ssup->comparator to a function which can be used to
1778 	 * directly compare two datums.  If we can do this, we'll avoid the
1779 	 * overhead of a trip through the fmgr layer for every comparison, which
1780 	 * can be substantial.
1781 	 *
1782 	 * Most typically, we'll set the comparator to varstrfastcmp_locale, which
1783 	 * uses strcoll() to perform comparisons and knows about the special
1784 	 * requirements of BpChar callers.  However, if LC_COLLATE = C, we can
1785 	 * make things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c,
1786 	 * both of which use memcmp() rather than strcoll().
1787 	 *
1788 	 * There is a further exception on Windows.  When the database encoding is
1789 	 * UTF-8 and we are not using the C collation, complex hacks are required.
1790 	 * We don't currently have a comparator that handles that case, so we fall
1791 	 * back on the slow method of having the sort code invoke bttextcmp() (in
1792 	 * the case of text) via the fmgr trampoline.
1793 	 */
1794 	if (lc_collate_is_c(collid))
1795 	{
1796 		if (!bpchar)
1797 			ssup->comparator = varstrfastcmp_c;
1798 		else
1799 			ssup->comparator = bpcharfastcmp_c;
1800 
1801 		collate_c = true;
1802 	}
1803 #ifdef WIN32
1804 	else if (GetDatabaseEncoding() == PG_UTF8)
1805 		return;
1806 #endif
1807 	else
1808 	{
1809 		ssup->comparator = varstrfastcmp_locale;
1810 
1811 		/*
1812 		 * We need a collation-sensitive comparison.  To make things faster,
1813 		 * we'll figure out the collation based on the locale id and cache the
1814 		 * result.
1815 		 */
1816 		if (collid != DEFAULT_COLLATION_OID)
1817 		{
1818 			if (!OidIsValid(collid))
1819 			{
1820 				/*
1821 				 * This typically means that the parser could not resolve a
1822 				 * conflict of implicit collations, so report it that way.
1823 				 */
1824 				ereport(ERROR,
1825 						(errcode(ERRCODE_INDETERMINATE_COLLATION),
1826 						 errmsg("could not determine which collation to use for string comparison"),
1827 						 errhint("Use the COLLATE clause to set the collation explicitly.")));
1828 			}
1829 #ifdef HAVE_LOCALE_T
1830 			locale = pg_newlocale_from_collation(collid);
1831 #endif
1832 		}
1833 	}
1834 
1835 	/*
1836 	 * Unfortunately, it seems that abbreviation for non-C collations is
1837 	 * broken on many common platforms; testing of multiple versions of glibc
1838 	 * reveals that, for many locales, strcoll() and strxfrm() do not return
1839 	 * consistent results, which is fatal to this optimization.  While no
1840 	 * other libc other than Cygwin has so far been shown to have a problem,
1841 	 * we take the conservative course of action for right now and disable
1842 	 * this categorically.  (Users who are certain this isn't a problem on
1843 	 * their system can define TRUST_STRXFRM.)
1844 	 *
1845 	 * Even apart from the risk of broken locales, it's possible that there
1846 	 * are platforms where the use of abbreviated keys should be disabled at
1847 	 * compile time.  Having only 4 byte datums could make worst-case
1848 	 * performance drastically more likely, for example.  Moreover, Darwin's
1849 	 * strxfrm() implementations is known to not effectively concentrate a
1850 	 * significant amount of entropy from the original string in earlier
1851 	 * transformed blobs.  It's possible that other supported platforms are
1852 	 * similarly encumbered.  So, if we ever get past disabling this
1853 	 * categorically, we may still want or need to disable it for particular
1854 	 * platforms.
1855 	 */
1856 #ifndef TRUST_STRXFRM
1857 	if (!collate_c)
1858 		abbreviate = false;
1859 #endif
1860 
1861 	/*
1862 	 * If we're using abbreviated keys, or if we're using a locale-aware
1863 	 * comparison, we need to initialize a StringSortSupport object.  Both
1864 	 * cases will make use of the temporary buffers we initialize here for
1865 	 * scratch space (and to detect requirement for BpChar semantics from
1866 	 * caller), and the abbreviation case requires additional state.
1867 	 */
1868 	if (abbreviate || !collate_c)
1869 	{
1870 		sss = palloc(sizeof(VarStringSortSupport));
1871 		sss->buf1 = palloc(TEXTBUFLEN);
1872 		sss->buflen1 = TEXTBUFLEN;
1873 		sss->buf2 = palloc(TEXTBUFLEN);
1874 		sss->buflen2 = TEXTBUFLEN;
1875 		/* Start with invalid values */
1876 		sss->last_len1 = -1;
1877 		sss->last_len2 = -1;
1878 		/* Initialize */
1879 		sss->last_returned = 0;
1880 #ifdef HAVE_LOCALE_T
1881 		sss->locale = locale;
1882 #endif
1883 
1884 		/*
1885 		 * To avoid somehow confusing a strxfrm() blob and an original string,
1886 		 * constantly keep track of the variety of data that buf1 and buf2
1887 		 * currently contain.
1888 		 *
1889 		 * Comparisons may be interleaved with conversion calls.  Frequently,
1890 		 * conversions and comparisons are batched into two distinct phases,
1891 		 * but the correctness of caching cannot hinge upon this.  For
1892 		 * comparison caching, buffer state is only trusted if cache_blob is
1893 		 * found set to false, whereas strxfrm() caching only trusts the state
1894 		 * when cache_blob is found set to true.
1895 		 *
1896 		 * Arbitrarily initialize cache_blob to true.
1897 		 */
1898 		sss->cache_blob = true;
1899 		sss->collate_c = collate_c;
1900 		sss->bpchar = bpchar;
1901 		ssup->ssup_extra = sss;
1902 
1903 		/*
1904 		 * If possible, plan to use the abbreviated keys optimization.  The
1905 		 * core code may switch back to authoritative comparator should
1906 		 * abbreviation be aborted.
1907 		 */
1908 		if (abbreviate)
1909 		{
1910 			sss->prop_card = 0.20;
1911 			initHyperLogLog(&sss->abbr_card, 10);
1912 			initHyperLogLog(&sss->full_card, 10);
1913 			ssup->abbrev_full_comparator = ssup->comparator;
1914 			ssup->comparator = varstrcmp_abbrev;
1915 			ssup->abbrev_converter = varstr_abbrev_convert;
1916 			ssup->abbrev_abort = varstr_abbrev_abort;
1917 		}
1918 	}
1919 }
1920 
1921 /*
1922  * sortsupport comparison func (for C locale case)
1923  */
1924 static int
varstrfastcmp_c(Datum x,Datum y,SortSupport ssup)1925 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1926 {
1927 	VarString  *arg1 = DatumGetVarStringPP(x);
1928 	VarString  *arg2 = DatumGetVarStringPP(y);
1929 	char	   *a1p,
1930 			   *a2p;
1931 	int			len1,
1932 				len2,
1933 				result;
1934 
1935 	a1p = VARDATA_ANY(arg1);
1936 	a2p = VARDATA_ANY(arg2);
1937 
1938 	len1 = VARSIZE_ANY_EXHDR(arg1);
1939 	len2 = VARSIZE_ANY_EXHDR(arg2);
1940 
1941 	result = memcmp(a1p, a2p, Min(len1, len2));
1942 	if ((result == 0) && (len1 != len2))
1943 		result = (len1 < len2) ? -1 : 1;
1944 
1945 	/* We can't afford to leak memory here. */
1946 	if (PointerGetDatum(arg1) != x)
1947 		pfree(arg1);
1948 	if (PointerGetDatum(arg2) != y)
1949 		pfree(arg2);
1950 
1951 	return result;
1952 }
1953 
1954 /*
1955  * sortsupport comparison func (for BpChar C locale case)
1956  *
1957  * BpChar outsources its sortsupport to this module.  Specialization for the
1958  * varstr_sortsupport BpChar case, modeled on
1959  * internal_bpchar_pattern_compare().
1960  */
1961 static int
bpcharfastcmp_c(Datum x,Datum y,SortSupport ssup)1962 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
1963 {
1964 	BpChar	   *arg1 = DatumGetBpCharPP(x);
1965 	BpChar	   *arg2 = DatumGetBpCharPP(y);
1966 	char	   *a1p,
1967 			   *a2p;
1968 	int			len1,
1969 				len2,
1970 				result;
1971 
1972 	a1p = VARDATA_ANY(arg1);
1973 	a2p = VARDATA_ANY(arg2);
1974 
1975 	len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
1976 	len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
1977 
1978 	result = memcmp(a1p, a2p, Min(len1, len2));
1979 	if ((result == 0) && (len1 != len2))
1980 		result = (len1 < len2) ? -1 : 1;
1981 
1982 	/* We can't afford to leak memory here. */
1983 	if (PointerGetDatum(arg1) != x)
1984 		pfree(arg1);
1985 	if (PointerGetDatum(arg2) != y)
1986 		pfree(arg2);
1987 
1988 	return result;
1989 }
1990 
1991 /*
1992  * sortsupport comparison func (for locale case)
1993  */
1994 static int
varstrfastcmp_locale(Datum x,Datum y,SortSupport ssup)1995 varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
1996 {
1997 	VarString  *arg1 = DatumGetVarStringPP(x);
1998 	VarString  *arg2 = DatumGetVarStringPP(y);
1999 	bool		arg1_match;
2000 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2001 
2002 	/* working state */
2003 	char	   *a1p,
2004 			   *a2p;
2005 	int			len1,
2006 				len2,
2007 				result;
2008 
2009 	a1p = VARDATA_ANY(arg1);
2010 	a2p = VARDATA_ANY(arg2);
2011 
2012 	len1 = VARSIZE_ANY_EXHDR(arg1);
2013 	len2 = VARSIZE_ANY_EXHDR(arg2);
2014 
2015 	/* Fast pre-check for equality, as discussed in varstr_cmp() */
2016 	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2017 	{
2018 		/*
2019 		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2020 		 * last_len2.  Existing contents of buffers might still be used by
2021 		 * next call.
2022 		 *
2023 		 * It's fine to allow the comparison of BpChar padding bytes here,
2024 		 * even though that implies that the memcmp() will usually be
2025 		 * performed for BpChar callers (though multibyte characters could
2026 		 * still prevent that from occurring).  The memcmp() is still very
2027 		 * cheap, and BpChar's funny semantics have us remove trailing spaces
2028 		 * (not limited to padding), so we need make no distinction between
2029 		 * padding space characters and "real" space characters.
2030 		 */
2031 		result = 0;
2032 		goto done;
2033 	}
2034 
2035 	if (sss->bpchar)
2036 	{
2037 		/* Get true number of bytes, ignoring trailing spaces */
2038 		len1 = bpchartruelen(a1p, len1);
2039 		len2 = bpchartruelen(a2p, len2);
2040 	}
2041 
2042 	if (len1 >= sss->buflen1)
2043 	{
2044 		pfree(sss->buf1);
2045 		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2046 		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
2047 	}
2048 	if (len2 >= sss->buflen2)
2049 	{
2050 		pfree(sss->buf2);
2051 		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2052 		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
2053 	}
2054 
2055 	/*
2056 	 * We're likely to be asked to compare the same strings repeatedly, and
2057 	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2058 	 * comparisons, even though in general there is no reason to think that
2059 	 * that will work out (every string datum may be unique).  Caching does
2060 	 * not slow things down measurably when it doesn't work out, and can speed
2061 	 * things up by rather a lot when it does.  In part, this is because the
2062 	 * memcmp() compares data from cachelines that are needed in L1 cache even
2063 	 * when the last comparison's result cannot be reused.
2064 	 */
2065 	arg1_match = true;
2066 	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2067 	{
2068 		arg1_match = false;
2069 		memcpy(sss->buf1, a1p, len1);
2070 		sss->buf1[len1] = '\0';
2071 		sss->last_len1 = len1;
2072 	}
2073 
2074 	/*
2075 	 * If we're comparing the same two strings as last time, we can return the
2076 	 * same answer without calling strcoll() again.  This is more likely than
2077 	 * it seems (at least with moderate to low cardinality sets), because
2078 	 * quicksort compares the same pivot against many values.
2079 	 */
2080 	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2081 	{
2082 		memcpy(sss->buf2, a2p, len2);
2083 		sss->buf2[len2] = '\0';
2084 		sss->last_len2 = len2;
2085 	}
2086 	else if (arg1_match && !sss->cache_blob)
2087 	{
2088 		/* Use result cached following last actual strcoll() call */
2089 		result = sss->last_returned;
2090 		goto done;
2091 	}
2092 
2093 #ifdef HAVE_LOCALE_T
2094 	if (sss->locale)
2095 		result = strcoll_l(sss->buf1, sss->buf2, sss->locale);
2096 	else
2097 #endif
2098 		result = strcoll(sss->buf1, sss->buf2);
2099 
2100 	/*
2101 	 * In some locales strcoll() can claim that nonidentical strings are
2102 	 * equal. Believing that would be bad news for a number of reasons, so we
2103 	 * follow Perl's lead and sort "equal" strings according to strcmp().
2104 	 */
2105 	if (result == 0)
2106 		result = strcmp(sss->buf1, sss->buf2);
2107 
2108 	/* Cache result, perhaps saving an expensive strcoll() call next time */
2109 	sss->cache_blob = false;
2110 	sss->last_returned = result;
2111 done:
2112 	/* We can't afford to leak memory here. */
2113 	if (PointerGetDatum(arg1) != x)
2114 		pfree(arg1);
2115 	if (PointerGetDatum(arg2) != y)
2116 		pfree(arg2);
2117 
2118 	return result;
2119 }
2120 
2121 /*
2122  * Abbreviated key comparison func
2123  */
2124 static int
varstrcmp_abbrev(Datum x,Datum y,SortSupport ssup)2125 varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2126 {
2127 	/*
2128 	 * When 0 is returned, the core system will call varstrfastcmp_c()
2129 	 * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale().  Even a
2130 	 * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2131 	 * authoritatively, for the same reason that there is a strcoll()
2132 	 * tie-breaker call to strcmp() in varstr_cmp().
2133 	 */
2134 	if (x > y)
2135 		return 1;
2136 	else if (x == y)
2137 		return 0;
2138 	else
2139 		return -1;
2140 }
2141 
2142 /*
2143  * Conversion routine for sortsupport.  Converts original to abbreviated key
2144  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2145  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2146  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2147  * locale is used, or in case of bytea, just memcpy() from original instead.
2148  */
2149 static Datum
varstr_abbrev_convert(Datum original,SortSupport ssup)2150 varstr_abbrev_convert(Datum original, SortSupport ssup)
2151 {
2152 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2153 	VarString  *authoritative = DatumGetVarStringPP(original);
2154 	char	   *authoritative_data = VARDATA_ANY(authoritative);
2155 
2156 	/* working state */
2157 	Datum		res;
2158 	char	   *pres;
2159 	int			len;
2160 	uint32		hash;
2161 
2162 	pres = (char *) &res;
2163 	/* memset(), so any non-overwritten bytes are NUL */
2164 	memset(pres, 0, sizeof(Datum));
2165 	len = VARSIZE_ANY_EXHDR(authoritative);
2166 
2167 	/* Get number of bytes, ignoring trailing spaces */
2168 	if (sss->bpchar)
2169 		len = bpchartruelen(authoritative_data, len);
2170 
2171 	/*
2172 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2173 	 * abbreviate keys.  The full comparator for the C locale is always
2174 	 * memcmp().  It would be incorrect to allow bytea callers (callers that
2175 	 * always force the C collation -- bytea isn't a collatable type, but this
2176 	 * approach is convenient) to use strxfrm().  This is because bytea
2177 	 * strings may contain NUL bytes.  Besides, this should be faster, too.
2178 	 *
2179 	 * More generally, it's okay that bytea callers can have NUL bytes in
2180 	 * strings because varstrcmp_abbrev() need not make a distinction between
2181 	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2182 	 * authoritative representation.  Hopefully a comparison at or past one
2183 	 * abbreviated key's terminating NUL byte will resolve the comparison
2184 	 * without consulting the authoritative representation; specifically, some
2185 	 * later non-NUL byte in the longer string can resolve the comparison
2186 	 * against a subsequent terminating NUL in the shorter string.  There will
2187 	 * usually be what is effectively a "length-wise" resolution there and
2188 	 * then.
2189 	 *
2190 	 * If that doesn't work out -- if all bytes in the longer string
2191 	 * positioned at or past the offset of the smaller string's (first)
2192 	 * terminating NUL are actually representative of NUL bytes in the
2193 	 * authoritative binary string (perhaps with some *terminating* NUL bytes
2194 	 * towards the end of the longer string iff it happens to still be small)
2195 	 * -- then an authoritative tie-breaker will happen, and do the right
2196 	 * thing: explicitly consider string length.
2197 	 */
2198 	if (sss->collate_c)
2199 		memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2200 	else
2201 	{
2202 		Size		bsize;
2203 
2204 		/*
2205 		 * We're not using the C collation, so fall back on strxfrm.
2206 		 */
2207 
2208 		/* By convention, we use buffer 1 to store and NUL-terminate */
2209 		if (len >= sss->buflen1)
2210 		{
2211 			pfree(sss->buf1);
2212 			sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2213 			sss->buf1 = palloc(sss->buflen1);
2214 		}
2215 
2216 		/* Might be able to reuse strxfrm() blob from last call */
2217 		if (sss->last_len1 == len && sss->cache_blob &&
2218 			memcmp(sss->buf1, authoritative_data, len) == 0)
2219 		{
2220 			memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2221 			/* No change affecting cardinality, so no hashing required */
2222 			goto done;
2223 		}
2224 
2225 		/* Just like strcoll(), strxfrm() expects a NUL-terminated string */
2226 		memcpy(sss->buf1, authoritative_data, len);
2227 		sss->buf1[len] = '\0';
2228 		sss->last_len1 = len;
2229 
2230 		for (;;)
2231 		{
2232 #ifdef HAVE_LOCALE_T
2233 			if (sss->locale)
2234 				bsize = strxfrm_l(sss->buf2, sss->buf1,
2235 								  sss->buflen2, sss->locale);
2236 			else
2237 #endif
2238 				bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2239 
2240 			sss->last_len2 = bsize;
2241 			if (bsize < sss->buflen2)
2242 				break;
2243 
2244 			/*
2245 			 * The C standard states that the contents of the buffer is now
2246 			 * unspecified.  Grow buffer, and retry.
2247 			 */
2248 			pfree(sss->buf2);
2249 			sss->buflen2 = Max(bsize + 1,
2250 							   Min(sss->buflen2 * 2, MaxAllocSize));
2251 			sss->buf2 = palloc(sss->buflen2);
2252 		}
2253 
2254 		/*
2255 		 * Every Datum byte is always compared.  This is safe because the
2256 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
2257 		 * misinterpreting any NUL bytes not intended to be interpreted as
2258 		 * logically representing termination.
2259 		 *
2260 		 * (Actually, even if there were NUL bytes in the blob it would be
2261 		 * okay.  See remarks on bytea case above.)
2262 		 */
2263 		memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2264 	}
2265 
2266 	/*
2267 	 * Maintain approximate cardinality of both abbreviated keys and original,
2268 	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
2269 	 * the worst case, where we do many string transformations for no saving
2270 	 * in full strcoll()-based comparisons.  These statistics are used by
2271 	 * varstr_abbrev_abort().
2272 	 *
2273 	 * First, Hash key proper, or a significant fraction of it.  Mix in length
2274 	 * in order to compensate for cases where differences are past
2275 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2276 	 */
2277 	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2278 								   Min(len, PG_CACHE_LINE_SIZE)));
2279 
2280 	if (len > PG_CACHE_LINE_SIZE)
2281 		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2282 
2283 	addHyperLogLog(&sss->full_card, hash);
2284 
2285 	/* Hash abbreviated key */
2286 #if SIZEOF_DATUM == 8
2287 	{
2288 		uint32		lohalf,
2289 					hihalf;
2290 
2291 		lohalf = (uint32) res;
2292 		hihalf = (uint32) (res >> 32);
2293 		hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2294 	}
2295 #else							/* SIZEOF_DATUM != 8 */
2296 	hash = DatumGetUInt32(hash_uint32((uint32) res));
2297 #endif
2298 
2299 	addHyperLogLog(&sss->abbr_card, hash);
2300 
2301 	/* Cache result, perhaps saving an expensive strxfrm() call next time */
2302 	sss->cache_blob = true;
2303 done:
2304 
2305 	/*
2306 	 * Byteswap on little-endian machines.
2307 	 *
2308 	 * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2309 	 * comparator) works correctly on all platforms.  If we didn't do this,
2310 	 * the comparator would have to call memcmp() with a pair of pointers to
2311 	 * the first byte of each abbreviated key, which is slower.
2312 	 */
2313 	res = DatumBigEndianToNative(res);
2314 
2315 	/* Don't leak memory here */
2316 	if (PointerGetDatum(authoritative) != original)
2317 		pfree(authoritative);
2318 
2319 	return res;
2320 }
2321 
2322 /*
2323  * Callback for estimating effectiveness of abbreviated key optimization, using
2324  * heuristic rules.  Returns value indicating if the abbreviation optimization
2325  * should be aborted, based on its projected effectiveness.
2326  */
2327 static bool
varstr_abbrev_abort(int memtupcount,SortSupport ssup)2328 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2329 {
2330 	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2331 	double		abbrev_distinct,
2332 				key_distinct;
2333 
2334 	Assert(ssup->abbreviate);
2335 
2336 	/* Have a little patience */
2337 	if (memtupcount < 100)
2338 		return false;
2339 
2340 	abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2341 	key_distinct = estimateHyperLogLog(&sss->full_card);
2342 
2343 	/*
2344 	 * Clamp cardinality estimates to at least one distinct value.  While
2345 	 * NULLs are generally disregarded, if only NULL values were seen so far,
2346 	 * that might misrepresent costs if we failed to clamp.
2347 	 */
2348 	if (abbrev_distinct <= 1.0)
2349 		abbrev_distinct = 1.0;
2350 
2351 	if (key_distinct <= 1.0)
2352 		key_distinct = 1.0;
2353 
2354 	/*
2355 	 * In the worst case all abbreviated keys are identical, while at the same
2356 	 * time there are differences within full key strings not captured in
2357 	 * abbreviations.
2358 	 */
2359 #ifdef TRACE_SORT
2360 	if (trace_sort)
2361 	{
2362 		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2363 
2364 		elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2365 			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2366 			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2367 			 sss->prop_card);
2368 	}
2369 #endif
2370 
2371 	/*
2372 	 * If the number of distinct abbreviated keys approximately matches the
2373 	 * number of distinct authoritative original keys, that's reason enough to
2374 	 * proceed.  We can win even with a very low cardinality set if most
2375 	 * tie-breakers only memcmp().  This is by far the most important
2376 	 * consideration.
2377 	 *
2378 	 * While comparisons that are resolved at the abbreviated key level are
2379 	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2380 	 * those two outcomes are so much cheaper than a full strcoll() once
2381 	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2382 	 * cardinality against the overall size of the set in order to more
2383 	 * accurately model costs.  Assume that an abbreviated comparison, and an
2384 	 * abbreviated comparison with a cheap memcmp()-based authoritative
2385 	 * resolution are equivalent.
2386 	 */
2387 	if (abbrev_distinct > key_distinct * sss->prop_card)
2388 	{
2389 		/*
2390 		 * When we have exceeded 10,000 tuples, decay required cardinality
2391 		 * aggressively for next call.
2392 		 *
2393 		 * This is useful because the number of comparisons required on
2394 		 * average increases at a linearithmic rate, and at roughly 10,000
2395 		 * tuples that factor will start to dominate over the linear costs of
2396 		 * string transformation (this is a conservative estimate).  The decay
2397 		 * rate is chosen to be a little less aggressive than halving -- which
2398 		 * (since we're called at points at which memtupcount has doubled)
2399 		 * would never see the cost model actually abort past the first call
2400 		 * following a decay.  This decay rate is mostly a precaution against
2401 		 * a sudden, violent swing in how well abbreviated cardinality tracks
2402 		 * full key cardinality.  The decay also serves to prevent a marginal
2403 		 * case from being aborted too late, when too much has already been
2404 		 * invested in string transformation.
2405 		 *
2406 		 * It's possible for sets of several million distinct strings with
2407 		 * mere tens of thousands of distinct abbreviated keys to still
2408 		 * benefit very significantly.  This will generally occur provided
2409 		 * each abbreviated key is a proxy for a roughly uniform number of the
2410 		 * set's full keys. If it isn't so, we hope to catch that early and
2411 		 * abort.  If it isn't caught early, by the time the problem is
2412 		 * apparent it's probably not worth aborting.
2413 		 */
2414 		if (memtupcount > 10000)
2415 			sss->prop_card *= 0.65;
2416 
2417 		return false;
2418 	}
2419 
2420 	/*
2421 	 * Abort abbreviation strategy.
2422 	 *
2423 	 * The worst case, where all abbreviated keys are identical while all
2424 	 * original strings differ will typically only see a regression of about
2425 	 * 10% in execution time for small to medium sized lists of strings.
2426 	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2427 	 * often expect very large improvements, particularly with sets of strings
2428 	 * of moderately high to high abbreviated cardinality.  There is little to
2429 	 * lose but much to gain, which our strategy reflects.
2430 	 */
2431 #ifdef TRACE_SORT
2432 	if (trace_sort)
2433 		elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2434 			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2435 			 memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2436 #endif
2437 
2438 	return true;
2439 }
2440 
2441 Datum
text_larger(PG_FUNCTION_ARGS)2442 text_larger(PG_FUNCTION_ARGS)
2443 {
2444 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2445 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2446 	text	   *result;
2447 
2448 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2449 
2450 	PG_RETURN_TEXT_P(result);
2451 }
2452 
2453 Datum
text_smaller(PG_FUNCTION_ARGS)2454 text_smaller(PG_FUNCTION_ARGS)
2455 {
2456 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2457 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2458 	text	   *result;
2459 
2460 	result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2461 
2462 	PG_RETURN_TEXT_P(result);
2463 }
2464 
2465 
2466 /*
2467  * The following operators support character-by-character comparison
2468  * of text datums, to allow building indexes suitable for LIKE clauses.
2469  * Note that the regular texteq/textne comparison operators, and regular
2470  * support functions 1 and 2 with "C" collation are assumed to be
2471  * compatible with these!
2472  */
2473 
2474 static int
internal_text_pattern_compare(text * arg1,text * arg2)2475 internal_text_pattern_compare(text *arg1, text *arg2)
2476 {
2477 	int			result;
2478 	int			len1,
2479 				len2;
2480 
2481 	len1 = VARSIZE_ANY_EXHDR(arg1);
2482 	len2 = VARSIZE_ANY_EXHDR(arg2);
2483 
2484 	result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2485 	if (result != 0)
2486 		return result;
2487 	else if (len1 < len2)
2488 		return -1;
2489 	else if (len1 > len2)
2490 		return 1;
2491 	else
2492 		return 0;
2493 }
2494 
2495 
2496 Datum
text_pattern_lt(PG_FUNCTION_ARGS)2497 text_pattern_lt(PG_FUNCTION_ARGS)
2498 {
2499 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2500 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2501 	int			result;
2502 
2503 	result = internal_text_pattern_compare(arg1, arg2);
2504 
2505 	PG_FREE_IF_COPY(arg1, 0);
2506 	PG_FREE_IF_COPY(arg2, 1);
2507 
2508 	PG_RETURN_BOOL(result < 0);
2509 }
2510 
2511 
2512 Datum
text_pattern_le(PG_FUNCTION_ARGS)2513 text_pattern_le(PG_FUNCTION_ARGS)
2514 {
2515 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2516 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2517 	int			result;
2518 
2519 	result = internal_text_pattern_compare(arg1, arg2);
2520 
2521 	PG_FREE_IF_COPY(arg1, 0);
2522 	PG_FREE_IF_COPY(arg2, 1);
2523 
2524 	PG_RETURN_BOOL(result <= 0);
2525 }
2526 
2527 
2528 Datum
text_pattern_ge(PG_FUNCTION_ARGS)2529 text_pattern_ge(PG_FUNCTION_ARGS)
2530 {
2531 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2532 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2533 	int			result;
2534 
2535 	result = internal_text_pattern_compare(arg1, arg2);
2536 
2537 	PG_FREE_IF_COPY(arg1, 0);
2538 	PG_FREE_IF_COPY(arg2, 1);
2539 
2540 	PG_RETURN_BOOL(result >= 0);
2541 }
2542 
2543 
2544 Datum
text_pattern_gt(PG_FUNCTION_ARGS)2545 text_pattern_gt(PG_FUNCTION_ARGS)
2546 {
2547 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2548 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2549 	int			result;
2550 
2551 	result = internal_text_pattern_compare(arg1, arg2);
2552 
2553 	PG_FREE_IF_COPY(arg1, 0);
2554 	PG_FREE_IF_COPY(arg2, 1);
2555 
2556 	PG_RETURN_BOOL(result > 0);
2557 }
2558 
2559 
2560 Datum
bttext_pattern_cmp(PG_FUNCTION_ARGS)2561 bttext_pattern_cmp(PG_FUNCTION_ARGS)
2562 {
2563 	text	   *arg1 = PG_GETARG_TEXT_PP(0);
2564 	text	   *arg2 = PG_GETARG_TEXT_PP(1);
2565 	int			result;
2566 
2567 	result = internal_text_pattern_compare(arg1, arg2);
2568 
2569 	PG_FREE_IF_COPY(arg1, 0);
2570 	PG_FREE_IF_COPY(arg2, 1);
2571 
2572 	PG_RETURN_INT32(result);
2573 }
2574 
2575 
2576 Datum
bttext_pattern_sortsupport(PG_FUNCTION_ARGS)2577 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2578 {
2579 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2580 	MemoryContext oldcontext;
2581 
2582 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2583 
2584 	/* Use generic string SortSupport, forcing "C" collation */
2585 	varstr_sortsupport(ssup, C_COLLATION_OID, false);
2586 
2587 	MemoryContextSwitchTo(oldcontext);
2588 
2589 	PG_RETURN_VOID();
2590 }
2591 
2592 
2593 /*-------------------------------------------------------------
2594  * byteaoctetlen
2595  *
2596  * get the number of bytes contained in an instance of type 'bytea'
2597  *-------------------------------------------------------------
2598  */
2599 Datum
byteaoctetlen(PG_FUNCTION_ARGS)2600 byteaoctetlen(PG_FUNCTION_ARGS)
2601 {
2602 	Datum		str = PG_GETARG_DATUM(0);
2603 
2604 	/* We need not detoast the input at all */
2605 	PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2606 }
2607 
2608 /*
2609  * byteacat -
2610  *	  takes two bytea* and returns a bytea* that is the concatenation of
2611  *	  the two.
2612  *
2613  * Cloned from textcat and modified as required.
2614  */
2615 Datum
byteacat(PG_FUNCTION_ARGS)2616 byteacat(PG_FUNCTION_ARGS)
2617 {
2618 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2619 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2620 
2621 	PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2622 }
2623 
2624 /*
2625  * bytea_catenate
2626  *	Guts of byteacat(), broken out so it can be used by other functions
2627  *
2628  * Arguments can be in short-header form, but not compressed or out-of-line
2629  */
2630 static bytea *
bytea_catenate(bytea * t1,bytea * t2)2631 bytea_catenate(bytea *t1, bytea *t2)
2632 {
2633 	bytea	   *result;
2634 	int			len1,
2635 				len2,
2636 				len;
2637 	char	   *ptr;
2638 
2639 	len1 = VARSIZE_ANY_EXHDR(t1);
2640 	len2 = VARSIZE_ANY_EXHDR(t2);
2641 
2642 	/* paranoia ... probably should throw error instead? */
2643 	if (len1 < 0)
2644 		len1 = 0;
2645 	if (len2 < 0)
2646 		len2 = 0;
2647 
2648 	len = len1 + len2 + VARHDRSZ;
2649 	result = (bytea *) palloc(len);
2650 
2651 	/* Set size of result string... */
2652 	SET_VARSIZE(result, len);
2653 
2654 	/* Fill data field of result string... */
2655 	ptr = VARDATA(result);
2656 	if (len1 > 0)
2657 		memcpy(ptr, VARDATA_ANY(t1), len1);
2658 	if (len2 > 0)
2659 		memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2660 
2661 	return result;
2662 }
2663 
/*
 * PG_STR_GET_BYTEA
 *	Feed the C string str_ through byteain() and yield the result as a
 *	bytea pointer.
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2666 
2667 /*
2668  * bytea_substr()
2669  * Return a substring starting at the specified position.
2670  * Cloned from text_substr and modified as required.
2671  *
2672  * Input:
2673  *	- string
2674  *	- starting position (is one-based)
2675  *	- string length (optional)
2676  *
2677  * If the starting position is zero or less, then return from the start of the string
2678  * adjusting the length to be consistent with the "negative start" per SQL.
2679  * If the length is less than zero, an ERROR is thrown. If no third argument
2680  * (length) is provided, the length to the end of the string is assumed.
2681  */
2682 Datum
bytea_substr(PG_FUNCTION_ARGS)2683 bytea_substr(PG_FUNCTION_ARGS)
2684 {
2685 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2686 									  PG_GETARG_INT32(1),
2687 									  PG_GETARG_INT32(2),
2688 									  false));
2689 }
2690 
2691 /*
2692  * bytea_substr_no_len -
2693  *	  Wrapper to avoid opr_sanity failure due to
2694  *	  one function accepting a different number of args.
2695  */
2696 Datum
bytea_substr_no_len(PG_FUNCTION_ARGS)2697 bytea_substr_no_len(PG_FUNCTION_ARGS)
2698 {
2699 	PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2700 									  PG_GETARG_INT32(1),
2701 									  -1,
2702 									  true));
2703 }
2704 
2705 static bytea *
bytea_substring(Datum str,int S,int L,bool length_not_specified)2706 bytea_substring(Datum str,
2707 				int S,
2708 				int L,
2709 				bool length_not_specified)
2710 {
2711 	int			S1;				/* adjusted start position */
2712 	int			L1;				/* adjusted substring length */
2713 
2714 	S1 = Max(S, 1);
2715 
2716 	if (length_not_specified)
2717 	{
2718 		/*
2719 		 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2720 		 * end of the string if we pass it a negative value for length.
2721 		 */
2722 		L1 = -1;
2723 	}
2724 	else
2725 	{
2726 		/* end position */
2727 		int			E = S + L;
2728 
2729 		/*
2730 		 * A negative value for L is the only way for the end position to be
2731 		 * before the start. SQL99 says to throw an error.
2732 		 */
2733 		if (E < S)
2734 			ereport(ERROR,
2735 					(errcode(ERRCODE_SUBSTRING_ERROR),
2736 					 errmsg("negative substring length not allowed")));
2737 
2738 		/*
2739 		 * A zero or negative value for the end position can happen if the
2740 		 * start was negative or one. SQL99 says to return a zero-length
2741 		 * string.
2742 		 */
2743 		if (E < 1)
2744 			return PG_STR_GET_BYTEA("");
2745 
2746 		L1 = E - S1;
2747 	}
2748 
2749 	/*
2750 	 * If the start position is past the end of the string, SQL99 says to
2751 	 * return a zero-length string -- DatumGetByteaPSlice() will do that for
2752 	 * us. Convert to zero-based starting position
2753 	 */
2754 	return DatumGetByteaPSlice(str, S1 - 1, L1);
2755 }
2756 
2757 /*
2758  * byteaoverlay
2759  *	Replace specified substring of first string with second
2760  *
2761  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2762  * This code is a direct implementation of what the standard says.
2763  */
2764 Datum
byteaoverlay(PG_FUNCTION_ARGS)2765 byteaoverlay(PG_FUNCTION_ARGS)
2766 {
2767 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2768 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2769 	int			sp = PG_GETARG_INT32(2);		/* substring start position */
2770 	int			sl = PG_GETARG_INT32(3);		/* substring length */
2771 
2772 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2773 }
2774 
2775 Datum
byteaoverlay_no_len(PG_FUNCTION_ARGS)2776 byteaoverlay_no_len(PG_FUNCTION_ARGS)
2777 {
2778 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2779 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2780 	int			sp = PG_GETARG_INT32(2);		/* substring start position */
2781 	int			sl;
2782 
2783 	sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2784 	PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2785 }
2786 
2787 static bytea *
bytea_overlay(bytea * t1,bytea * t2,int sp,int sl)2788 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2789 {
2790 	bytea	   *result;
2791 	bytea	   *s1;
2792 	bytea	   *s2;
2793 	int			sp_pl_sl;
2794 
2795 	/*
2796 	 * Check for possible integer-overflow cases.  For negative sp, throw a
2797 	 * "substring length" error because that's what should be expected
2798 	 * according to the spec's definition of OVERLAY().
2799 	 */
2800 	if (sp <= 0)
2801 		ereport(ERROR,
2802 				(errcode(ERRCODE_SUBSTRING_ERROR),
2803 				 errmsg("negative substring length not allowed")));
2804 	sp_pl_sl = sp + sl;
2805 	if (sp_pl_sl <= sl)
2806 		ereport(ERROR,
2807 				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2808 				 errmsg("integer out of range")));
2809 
2810 	s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2811 	s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2812 	result = bytea_catenate(s1, t2);
2813 	result = bytea_catenate(result, s2);
2814 
2815 	return result;
2816 }
2817 
2818 /*
2819  * byteapos -
2820  *	  Return the position of the specified substring.
2821  *	  Implements the SQL POSITION() function.
2822  * Cloned from textpos and modified as required.
2823  */
2824 Datum
byteapos(PG_FUNCTION_ARGS)2825 byteapos(PG_FUNCTION_ARGS)
2826 {
2827 	bytea	   *t1 = PG_GETARG_BYTEA_PP(0);
2828 	bytea	   *t2 = PG_GETARG_BYTEA_PP(1);
2829 	int			pos;
2830 	int			px,
2831 				p;
2832 	int			len1,
2833 				len2;
2834 	char	   *p1,
2835 			   *p2;
2836 
2837 	len1 = VARSIZE_ANY_EXHDR(t1);
2838 	len2 = VARSIZE_ANY_EXHDR(t2);
2839 
2840 	if (len2 <= 0)
2841 		PG_RETURN_INT32(1);		/* result for empty pattern */
2842 
2843 	p1 = VARDATA_ANY(t1);
2844 	p2 = VARDATA_ANY(t2);
2845 
2846 	pos = 0;
2847 	px = (len1 - len2);
2848 	for (p = 0; p <= px; p++)
2849 	{
2850 		if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2851 		{
2852 			pos = p + 1;
2853 			break;
2854 		};
2855 		p1++;
2856 	};
2857 
2858 	PG_RETURN_INT32(pos);
2859 }
2860 
2861 /*-------------------------------------------------------------
2862  * byteaGetByte
2863  *
2864  * this routine treats "bytea" as an array of bytes.
2865  * It returns the Nth byte (a number between 0 and 255).
2866  *-------------------------------------------------------------
2867  */
2868 Datum
byteaGetByte(PG_FUNCTION_ARGS)2869 byteaGetByte(PG_FUNCTION_ARGS)
2870 {
2871 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
2872 	int32		n = PG_GETARG_INT32(1);
2873 	int			len;
2874 	int			byte;
2875 
2876 	len = VARSIZE_ANY_EXHDR(v);
2877 
2878 	if (n < 0 || n >= len)
2879 		ereport(ERROR,
2880 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2881 				 errmsg("index %d out of valid range, 0..%d",
2882 						n, len - 1)));
2883 
2884 	byte = ((unsigned char *) VARDATA_ANY(v))[n];
2885 
2886 	PG_RETURN_INT32(byte);
2887 }
2888 
2889 /*-------------------------------------------------------------
2890  * byteaGetBit
2891  *
2892  * This routine treats a "bytea" type like an array of bits.
2893  * It returns the value of the Nth bit (0 or 1).
2894  *
2895  *-------------------------------------------------------------
2896  */
2897 Datum
byteaGetBit(PG_FUNCTION_ARGS)2898 byteaGetBit(PG_FUNCTION_ARGS)
2899 {
2900 	bytea	   *v = PG_GETARG_BYTEA_PP(0);
2901 	int32		n = PG_GETARG_INT32(1);
2902 	int			byteNo,
2903 				bitNo;
2904 	int			len;
2905 	int			byte;
2906 
2907 	len = VARSIZE_ANY_EXHDR(v);
2908 
2909 	/* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
2910 	if (n < 0 || n >= (int64) len * 8)
2911 		ereport(ERROR,
2912 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2913 				 errmsg("index %d out of valid range, 0..%d",
2914 						n, (int) Min((int64) len * 8 - 1, INT_MAX))));
2915 
2916 	byteNo = n / 8;
2917 	bitNo = n % 8;
2918 
2919 	byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
2920 
2921 	if (byte & (1 << bitNo))
2922 		PG_RETURN_INT32(1);
2923 	else
2924 		PG_RETURN_INT32(0);
2925 }
2926 
2927 /*-------------------------------------------------------------
2928  * byteaSetByte
2929  *
2930  * Given an instance of type 'bytea' creates a new one with
2931  * the Nth byte set to the given value.
2932  *
2933  *-------------------------------------------------------------
2934  */
2935 Datum
byteaSetByte(PG_FUNCTION_ARGS)2936 byteaSetByte(PG_FUNCTION_ARGS)
2937 {
2938 	bytea	   *v = PG_GETARG_BYTEA_P(0);
2939 	int32		n = PG_GETARG_INT32(1);
2940 	int32		newByte = PG_GETARG_INT32(2);
2941 	int			len;
2942 	bytea	   *res;
2943 
2944 	len = VARSIZE(v) - VARHDRSZ;
2945 
2946 	if (n < 0 || n >= len)
2947 		ereport(ERROR,
2948 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2949 				 errmsg("index %d out of valid range, 0..%d",
2950 						n, len - 1)));
2951 
2952 	/*
2953 	 * Make a copy of the original varlena.
2954 	 */
2955 	res = (bytea *) palloc(VARSIZE(v));
2956 	memcpy((char *) res, (char *) v, VARSIZE(v));
2957 
2958 	/*
2959 	 * Now set the byte.
2960 	 */
2961 	((unsigned char *) VARDATA(res))[n] = newByte;
2962 
2963 	PG_RETURN_BYTEA_P(res);
2964 }
2965 
2966 /*-------------------------------------------------------------
2967  * byteaSetBit
2968  *
2969  * Given an instance of type 'bytea' creates a new one with
2970  * the Nth bit set to the given value.
2971  *
2972  *-------------------------------------------------------------
2973  */
2974 Datum
byteaSetBit(PG_FUNCTION_ARGS)2975 byteaSetBit(PG_FUNCTION_ARGS)
2976 {
2977 	bytea	   *v = PG_GETARG_BYTEA_P(0);
2978 	int32		n = PG_GETARG_INT32(1);
2979 	int32		newBit = PG_GETARG_INT32(2);
2980 	bytea	   *res;
2981 	int			len;
2982 	int			oldByte,
2983 				newByte;
2984 	int			byteNo,
2985 				bitNo;
2986 
2987 	len = VARSIZE(v) - VARHDRSZ;
2988 
2989 	/* Do comparison arithmetic in int64 in case len exceeds INT_MAX/8 */
2990 	if (n < 0 || n >= (int64) len * 8)
2991 		ereport(ERROR,
2992 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
2993 				 errmsg("index %d out of valid range, 0..%d",
2994 						n, (int) Min((int64) len * 8 - 1, INT_MAX))));
2995 
2996 	byteNo = n / 8;
2997 	bitNo = n % 8;
2998 
2999 	/*
3000 	 * sanity check!
3001 	 */
3002 	if (newBit != 0 && newBit != 1)
3003 		ereport(ERROR,
3004 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3005 				 errmsg("new bit must be 0 or 1")));
3006 
3007 	/*
3008 	 * Make a copy of the original varlena.
3009 	 */
3010 	res = (bytea *) palloc(VARSIZE(v));
3011 	memcpy((char *) res, (char *) v, VARSIZE(v));
3012 
3013 	/*
3014 	 * Update the byte.
3015 	 */
3016 	oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3017 
3018 	if (newBit == 0)
3019 		newByte = oldByte & (~(1 << bitNo));
3020 	else
3021 		newByte = oldByte | (1 << bitNo);
3022 
3023 	((unsigned char *) VARDATA(res))[byteNo] = newByte;
3024 
3025 	PG_RETURN_BYTEA_P(res);
3026 }
3027 
3028 
3029 /* text_name()
3030  * Converts a text type to a Name type.
3031  */
3032 Datum
text_name(PG_FUNCTION_ARGS)3033 text_name(PG_FUNCTION_ARGS)
3034 {
3035 	text	   *s = PG_GETARG_TEXT_PP(0);
3036 	Name		result;
3037 	int			len;
3038 
3039 	len = VARSIZE_ANY_EXHDR(s);
3040 
3041 	/* Truncate oversize input */
3042 	if (len >= NAMEDATALEN)
3043 		len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3044 
3045 	/* We use palloc0 here to ensure result is zero-padded */
3046 	result = (Name) palloc0(NAMEDATALEN);
3047 	memcpy(NameStr(*result), VARDATA_ANY(s), len);
3048 
3049 	PG_RETURN_NAME(result);
3050 }
3051 
3052 /* name_text()
3053  * Converts a Name type to a text type.
3054  */
3055 Datum
name_text(PG_FUNCTION_ARGS)3056 name_text(PG_FUNCTION_ARGS)
3057 {
3058 	Name		s = PG_GETARG_NAME(0);
3059 
3060 	PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3061 }
3062 
3063 
3064 /*
3065  * textToQualifiedNameList - convert a text object to list of names
3066  *
3067  * This implements the input parsing needed by nextval() and other
3068  * functions that take a text parameter representing a qualified name.
3069  * We split the name at dots, downcase if not double-quoted, and
3070  * truncate names if they're too long.
3071  */
3072 List *
textToQualifiedNameList(text * textval)3073 textToQualifiedNameList(text *textval)
3074 {
3075 	char	   *rawname;
3076 	List	   *result = NIL;
3077 	List	   *namelist;
3078 	ListCell   *l;
3079 
3080 	/* Convert to C string (handles possible detoasting). */
3081 	/* Note we rely on being able to modify rawname below. */
3082 	rawname = text_to_cstring(textval);
3083 
3084 	if (!SplitIdentifierString(rawname, '.', &namelist))
3085 		ereport(ERROR,
3086 				(errcode(ERRCODE_INVALID_NAME),
3087 				 errmsg("invalid name syntax")));
3088 
3089 	if (namelist == NIL)
3090 		ereport(ERROR,
3091 				(errcode(ERRCODE_INVALID_NAME),
3092 				 errmsg("invalid name syntax")));
3093 
3094 	foreach(l, namelist)
3095 	{
3096 		char	   *curname = (char *) lfirst(l);
3097 
3098 		result = lappend(result, makeString(pstrdup(curname)));
3099 	}
3100 
3101 	pfree(rawname);
3102 	list_free(namelist);
3103 
3104 	return result;
3105 }
3106 
3107 /*
3108  * SplitIdentifierString --- parse a string containing identifiers
3109  *
3110  * This is the guts of textToQualifiedNameList, and is exported for use in
3111  * other situations such as parsing GUC variables.  In the GUC case, it's
3112  * important to avoid memory leaks, so the API is designed to minimize the
3113  * amount of stuff that needs to be allocated and freed.
3114  *
3115  * Inputs:
3116  *	rawstring: the input string; must be overwritable!	On return, it's
3117  *			   been modified to contain the separated identifiers.
3118  *	separator: the separator punctuation expected between identifiers
3119  *			   (typically '.' or ',').  Whitespace may also appear around
3120  *			   identifiers.
3121  * Outputs:
3122  *	namelist: filled with a palloc'd list of pointers to identifiers within
3123  *			  rawstring.  Caller should list_free() this even on error return.
3124  *
3125  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3126  *
3127  * Note that an empty string is considered okay here, though not in
3128  * textToQualifiedNameList.
3129  */
3130 bool
SplitIdentifierString(char * rawstring,char separator,List ** namelist)3131 SplitIdentifierString(char *rawstring, char separator,
3132 					  List **namelist)
3133 {
3134 	char	   *nextp = rawstring;
3135 	bool		done = false;
3136 
3137 	*namelist = NIL;
3138 
3139 	while (scanner_isspace(*nextp))
3140 		nextp++;				/* skip leading whitespace */
3141 
3142 	if (*nextp == '\0')
3143 		return true;			/* allow empty string */
3144 
3145 	/* At the top of the loop, we are at start of a new identifier. */
3146 	do
3147 	{
3148 		char	   *curname;
3149 		char	   *endp;
3150 
3151 		if (*nextp == '"')
3152 		{
3153 			/* Quoted name --- collapse quote-quote pairs, no downcasing */
3154 			curname = nextp + 1;
3155 			for (;;)
3156 			{
3157 				endp = strchr(nextp + 1, '"');
3158 				if (endp == NULL)
3159 					return false;		/* mismatched quotes */
3160 				if (endp[1] != '"')
3161 					break;		/* found end of quoted name */
3162 				/* Collapse adjacent quotes into one quote, and look again */
3163 				memmove(endp, endp + 1, strlen(endp));
3164 				nextp = endp;
3165 			}
3166 			/* endp now points at the terminating quote */
3167 			nextp = endp + 1;
3168 		}
3169 		else
3170 		{
3171 			/* Unquoted name --- extends to separator or whitespace */
3172 			char	   *downname;
3173 			int			len;
3174 
3175 			curname = nextp;
3176 			while (*nextp && *nextp != separator &&
3177 				   !scanner_isspace(*nextp))
3178 				nextp++;
3179 			endp = nextp;
3180 			if (curname == nextp)
3181 				return false;	/* empty unquoted name not allowed */
3182 
3183 			/*
3184 			 * Downcase the identifier, using same code as main lexer does.
3185 			 *
3186 			 * XXX because we want to overwrite the input in-place, we cannot
3187 			 * support a downcasing transformation that increases the string
3188 			 * length.  This is not a problem given the current implementation
3189 			 * of downcase_truncate_identifier, but we'll probably have to do
3190 			 * something about this someday.
3191 			 */
3192 			len = endp - curname;
3193 			downname = downcase_truncate_identifier(curname, len, false);
3194 			Assert(strlen(downname) <= len);
3195 			strncpy(curname, downname, len);	/* strncpy is required here */
3196 			pfree(downname);
3197 		}
3198 
3199 		while (scanner_isspace(*nextp))
3200 			nextp++;			/* skip trailing whitespace */
3201 
3202 		if (*nextp == separator)
3203 		{
3204 			nextp++;
3205 			while (scanner_isspace(*nextp))
3206 				nextp++;		/* skip leading whitespace for next */
3207 			/* we expect another name, so done remains false */
3208 		}
3209 		else if (*nextp == '\0')
3210 			done = true;
3211 		else
3212 			return false;		/* invalid syntax */
3213 
3214 		/* Now safe to overwrite separator with a null */
3215 		*endp = '\0';
3216 
3217 		/* Truncate name if it's overlength */
3218 		truncate_identifier(curname, strlen(curname), false);
3219 
3220 		/*
3221 		 * Finished isolating current name --- add it to list
3222 		 */
3223 		*namelist = lappend(*namelist, curname);
3224 
3225 		/* Loop back if we didn't reach end of string */
3226 	} while (!done);
3227 
3228 	return true;
3229 }
3230 
3231 
3232 /*
3233  * SplitDirectoriesString --- parse a string containing directory names
3234  *
3235  * This is similar to SplitIdentifierString, except that the parsing
3236  * rules are meant to handle pathnames instead of identifiers: there is
3237  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3238  * and we apply canonicalize_path() to each extracted string.  Because of the
3239  * last, the returned strings are separately palloc'd rather than being
3240  * pointers into rawstring --- but we still scribble on rawstring.
3241  *
3242  * Inputs:
3243  *	rawstring: the input string; must be modifiable!
3244  *	separator: the separator punctuation expected between directories
3245  *			   (typically ',' or ';').  Whitespace may also appear around
3246  *			   directories.
3247  * Outputs:
3248  *	namelist: filled with a palloc'd list of directory names.
3249  *			  Caller should list_free_deep() this even on error return.
3250  *
3251  * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3252  *
3253  * Note that an empty string is considered okay here.
3254  */
3255 bool
SplitDirectoriesString(char * rawstring,char separator,List ** namelist)3256 SplitDirectoriesString(char *rawstring, char separator,
3257 					   List **namelist)
3258 {
3259 	char	   *nextp = rawstring;
3260 	bool		done = false;
3261 
3262 	*namelist = NIL;
3263 
3264 	while (scanner_isspace(*nextp))
3265 		nextp++;				/* skip leading whitespace */
3266 
3267 	if (*nextp == '\0')
3268 		return true;			/* allow empty string */
3269 
3270 	/* At the top of the loop, we are at start of a new directory. */
3271 	do
3272 	{
3273 		char	   *curname;
3274 		char	   *endp;
3275 
3276 		if (*nextp == '"')
3277 		{
3278 			/* Quoted name --- collapse quote-quote pairs */
3279 			curname = nextp + 1;
3280 			for (;;)
3281 			{
3282 				endp = strchr(nextp + 1, '"');
3283 				if (endp == NULL)
3284 					return false;		/* mismatched quotes */
3285 				if (endp[1] != '"')
3286 					break;		/* found end of quoted name */
3287 				/* Collapse adjacent quotes into one quote, and look again */
3288 				memmove(endp, endp + 1, strlen(endp));
3289 				nextp = endp;
3290 			}
3291 			/* endp now points at the terminating quote */
3292 			nextp = endp + 1;
3293 		}
3294 		else
3295 		{
3296 			/* Unquoted name --- extends to separator or end of string */
3297 			curname = endp = nextp;
3298 			while (*nextp && *nextp != separator)
3299 			{
3300 				/* trailing whitespace should not be included in name */
3301 				if (!scanner_isspace(*nextp))
3302 					endp = nextp + 1;
3303 				nextp++;
3304 			}
3305 			if (curname == endp)
3306 				return false;	/* empty unquoted name not allowed */
3307 		}
3308 
3309 		while (scanner_isspace(*nextp))
3310 			nextp++;			/* skip trailing whitespace */
3311 
3312 		if (*nextp == separator)
3313 		{
3314 			nextp++;
3315 			while (scanner_isspace(*nextp))
3316 				nextp++;		/* skip leading whitespace for next */
3317 			/* we expect another name, so done remains false */
3318 		}
3319 		else if (*nextp == '\0')
3320 			done = true;
3321 		else
3322 			return false;		/* invalid syntax */
3323 
3324 		/* Now safe to overwrite separator with a null */
3325 		*endp = '\0';
3326 
3327 		/* Truncate path if it's overlength */
3328 		if (strlen(curname) >= MAXPGPATH)
3329 			curname[MAXPGPATH - 1] = '\0';
3330 
3331 		/*
3332 		 * Finished isolating current name --- add it to list
3333 		 */
3334 		curname = pstrdup(curname);
3335 		canonicalize_path(curname);
3336 		*namelist = lappend(*namelist, curname);
3337 
3338 		/* Loop back if we didn't reach end of string */
3339 	} while (!done);
3340 
3341 	return true;
3342 }
3343 
3344 
3345 /*
3346  * SplitGUCList --- parse a string containing identifiers or file names
3347  *
3348  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3349  * presuming whether the elements will be taken as identifiers or file names.
3350  * We assume the input has already been through flatten_set_variable_args(),
3351  * so that we need never downcase (if appropriate, that was done already).
3352  * Nor do we ever truncate, since we don't know the correct max length.
3353  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3354  * because any embedded whitespace should have led to double-quoting).
3355  * Otherwise the API is identical to SplitIdentifierString.
3356  *
3357  * XXX it's annoying to have so many copies of this string-splitting logic.
3358  * However, it's not clear that having one function with a bunch of option
3359  * flags would be much better.
3360  *
3361  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3362  * Be sure to update that if you have to change this.
3363  *
3364  * Inputs:
3365  *	rawstring: the input string; must be overwritable!	On return, it's
3366  *			   been modified to contain the separated identifiers.
3367  *	separator: the separator punctuation expected between identifiers
3368  *			   (typically '.' or ',').  Whitespace may also appear around
3369  *			   identifiers.
3370  * Outputs:
3371  *	namelist: filled with a palloc'd list of pointers to identifiers within
3372  *			  rawstring.  Caller should list_free() this even on error return.
3373  *
3374  * Returns true if okay, false if there is a syntax error in the string.
3375  */
3376 bool
SplitGUCList(char * rawstring,char separator,List ** namelist)3377 SplitGUCList(char *rawstring, char separator,
3378 			 List **namelist)
3379 {
3380 	char	   *nextp = rawstring;
3381 	bool		done = false;
3382 
3383 	*namelist = NIL;
3384 
3385 	while (scanner_isspace(*nextp))
3386 		nextp++;				/* skip leading whitespace */
3387 
3388 	if (*nextp == '\0')
3389 		return true;			/* allow empty string */
3390 
3391 	/* At the top of the loop, we are at start of a new identifier. */
3392 	do
3393 	{
3394 		char	   *curname;
3395 		char	   *endp;
3396 
3397 		if (*nextp == '"')
3398 		{
3399 			/* Quoted name --- collapse quote-quote pairs */
3400 			curname = nextp + 1;
3401 			for (;;)
3402 			{
3403 				endp = strchr(nextp + 1, '"');
3404 				if (endp == NULL)
3405 					return false;	/* mismatched quotes */
3406 				if (endp[1] != '"')
3407 					break;		/* found end of quoted name */
3408 				/* Collapse adjacent quotes into one quote, and look again */
3409 				memmove(endp, endp + 1, strlen(endp));
3410 				nextp = endp;
3411 			}
3412 			/* endp now points at the terminating quote */
3413 			nextp = endp + 1;
3414 		}
3415 		else
3416 		{
3417 			/* Unquoted name --- extends to separator or whitespace */
3418 			curname = nextp;
3419 			while (*nextp && *nextp != separator &&
3420 				   !scanner_isspace(*nextp))
3421 				nextp++;
3422 			endp = nextp;
3423 			if (curname == nextp)
3424 				return false;	/* empty unquoted name not allowed */
3425 		}
3426 
3427 		while (scanner_isspace(*nextp))
3428 			nextp++;			/* skip trailing whitespace */
3429 
3430 		if (*nextp == separator)
3431 		{
3432 			nextp++;
3433 			while (scanner_isspace(*nextp))
3434 				nextp++;		/* skip leading whitespace for next */
3435 			/* we expect another name, so done remains false */
3436 		}
3437 		else if (*nextp == '\0')
3438 			done = true;
3439 		else
3440 			return false;		/* invalid syntax */
3441 
3442 		/* Now safe to overwrite separator with a null */
3443 		*endp = '\0';
3444 
3445 		/*
3446 		 * Finished isolating current name --- add it to list
3447 		 */
3448 		*namelist = lappend(*namelist, curname);
3449 
3450 		/* Loop back if we didn't reach end of string */
3451 	} while (!done);
3452 
3453 	return true;
3454 }
3455 
3456 
3457 /*****************************************************************************
3458  *	Comparison Functions used for bytea
3459  *
3460  * Note: btree indexes need these routines not to leak memory; therefore,
3461  * be careful to free working copies of toasted datums.  Most places don't
3462  * need to be so careful.
3463  *****************************************************************************/
3464 
3465 Datum
byteaeq(PG_FUNCTION_ARGS)3466 byteaeq(PG_FUNCTION_ARGS)
3467 {
3468 	Datum		arg1 = PG_GETARG_DATUM(0);
3469 	Datum		arg2 = PG_GETARG_DATUM(1);
3470 	bool		result;
3471 	Size		len1,
3472 				len2;
3473 
3474 	/*
3475 	 * We can use a fast path for unequal lengths, which might save us from
3476 	 * having to detoast one or both values.
3477 	 */
3478 	len1 = toast_raw_datum_size(arg1);
3479 	len2 = toast_raw_datum_size(arg2);
3480 	if (len1 != len2)
3481 		result = false;
3482 	else
3483 	{
3484 		bytea	   *barg1 = DatumGetByteaPP(arg1);
3485 		bytea	   *barg2 = DatumGetByteaPP(arg2);
3486 
3487 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3488 						 len1 - VARHDRSZ) == 0);
3489 
3490 		PG_FREE_IF_COPY(barg1, 0);
3491 		PG_FREE_IF_COPY(barg2, 1);
3492 	}
3493 
3494 	PG_RETURN_BOOL(result);
3495 }
3496 
3497 Datum
byteane(PG_FUNCTION_ARGS)3498 byteane(PG_FUNCTION_ARGS)
3499 {
3500 	Datum		arg1 = PG_GETARG_DATUM(0);
3501 	Datum		arg2 = PG_GETARG_DATUM(1);
3502 	bool		result;
3503 	Size		len1,
3504 				len2;
3505 
3506 	/*
3507 	 * We can use a fast path for unequal lengths, which might save us from
3508 	 * having to detoast one or both values.
3509 	 */
3510 	len1 = toast_raw_datum_size(arg1);
3511 	len2 = toast_raw_datum_size(arg2);
3512 	if (len1 != len2)
3513 		result = true;
3514 	else
3515 	{
3516 		bytea	   *barg1 = DatumGetByteaPP(arg1);
3517 		bytea	   *barg2 = DatumGetByteaPP(arg2);
3518 
3519 		result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3520 						 len1 - VARHDRSZ) != 0);
3521 
3522 		PG_FREE_IF_COPY(barg1, 0);
3523 		PG_FREE_IF_COPY(barg2, 1);
3524 	}
3525 
3526 	PG_RETURN_BOOL(result);
3527 }
3528 
3529 Datum
bytealt(PG_FUNCTION_ARGS)3530 bytealt(PG_FUNCTION_ARGS)
3531 {
3532 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3533 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3534 	int			len1,
3535 				len2;
3536 	int			cmp;
3537 
3538 	len1 = VARSIZE_ANY_EXHDR(arg1);
3539 	len2 = VARSIZE_ANY_EXHDR(arg2);
3540 
3541 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3542 
3543 	PG_FREE_IF_COPY(arg1, 0);
3544 	PG_FREE_IF_COPY(arg2, 1);
3545 
3546 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3547 }
3548 
3549 Datum
byteale(PG_FUNCTION_ARGS)3550 byteale(PG_FUNCTION_ARGS)
3551 {
3552 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3553 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3554 	int			len1,
3555 				len2;
3556 	int			cmp;
3557 
3558 	len1 = VARSIZE_ANY_EXHDR(arg1);
3559 	len2 = VARSIZE_ANY_EXHDR(arg2);
3560 
3561 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3562 
3563 	PG_FREE_IF_COPY(arg1, 0);
3564 	PG_FREE_IF_COPY(arg2, 1);
3565 
3566 	PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3567 }
3568 
3569 Datum
byteagt(PG_FUNCTION_ARGS)3570 byteagt(PG_FUNCTION_ARGS)
3571 {
3572 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3573 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3574 	int			len1,
3575 				len2;
3576 	int			cmp;
3577 
3578 	len1 = VARSIZE_ANY_EXHDR(arg1);
3579 	len2 = VARSIZE_ANY_EXHDR(arg2);
3580 
3581 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3582 
3583 	PG_FREE_IF_COPY(arg1, 0);
3584 	PG_FREE_IF_COPY(arg2, 1);
3585 
3586 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3587 }
3588 
3589 Datum
byteage(PG_FUNCTION_ARGS)3590 byteage(PG_FUNCTION_ARGS)
3591 {
3592 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3593 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3594 	int			len1,
3595 				len2;
3596 	int			cmp;
3597 
3598 	len1 = VARSIZE_ANY_EXHDR(arg1);
3599 	len2 = VARSIZE_ANY_EXHDR(arg2);
3600 
3601 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3602 
3603 	PG_FREE_IF_COPY(arg1, 0);
3604 	PG_FREE_IF_COPY(arg2, 1);
3605 
3606 	PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3607 }
3608 
3609 Datum
byteacmp(PG_FUNCTION_ARGS)3610 byteacmp(PG_FUNCTION_ARGS)
3611 {
3612 	bytea	   *arg1 = PG_GETARG_BYTEA_PP(0);
3613 	bytea	   *arg2 = PG_GETARG_BYTEA_PP(1);
3614 	int			len1,
3615 				len2;
3616 	int			cmp;
3617 
3618 	len1 = VARSIZE_ANY_EXHDR(arg1);
3619 	len2 = VARSIZE_ANY_EXHDR(arg2);
3620 
3621 	cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3622 	if ((cmp == 0) && (len1 != len2))
3623 		cmp = (len1 < len2) ? -1 : 1;
3624 
3625 	PG_FREE_IF_COPY(arg1, 0);
3626 	PG_FREE_IF_COPY(arg2, 1);
3627 
3628 	PG_RETURN_INT32(cmp);
3629 }
3630 
3631 Datum
bytea_sortsupport(PG_FUNCTION_ARGS)3632 bytea_sortsupport(PG_FUNCTION_ARGS)
3633 {
3634 	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3635 	MemoryContext oldcontext;
3636 
3637 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3638 
3639 	/* Use generic string SortSupport, forcing "C" collation */
3640 	varstr_sortsupport(ssup, C_COLLATION_OID, false);
3641 
3642 	MemoryContextSwitchTo(oldcontext);
3643 
3644 	PG_RETURN_VOID();
3645 }
3646 
3647 /*
3648  * appendStringInfoText
3649  *
3650  * Append a text to str.
3651  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3652  */
3653 static void
appendStringInfoText(StringInfo str,const text * t)3654 appendStringInfoText(StringInfo str, const text *t)
3655 {
3656 	appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3657 }
3658 
3659 /*
3660  * replace_text
3661  * replace all occurrences of 'old_sub_str' in 'orig_str'
3662  * with 'new_sub_str' to form 'new_str'
3663  *
3664  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3665  * otherwise returns 'new_str'
3666  */
3667 Datum
replace_text(PG_FUNCTION_ARGS)3668 replace_text(PG_FUNCTION_ARGS)
3669 {
3670 	text	   *src_text = PG_GETARG_TEXT_PP(0);
3671 	text	   *from_sub_text = PG_GETARG_TEXT_PP(1);
3672 	text	   *to_sub_text = PG_GETARG_TEXT_PP(2);
3673 	int			src_text_len;
3674 	int			from_sub_text_len;
3675 	TextPositionState state;
3676 	text	   *ret_text;
3677 	int			start_posn;
3678 	int			curr_posn;
3679 	int			chunk_len;
3680 	char	   *start_ptr;
3681 	StringInfoData str;
3682 
3683 	text_position_setup(src_text, from_sub_text, &state);
3684 
3685 	/*
3686 	 * Note: we check the converted string length, not the original, because
3687 	 * they could be different if the input contained invalid encoding.
3688 	 */
3689 	src_text_len = state.len1;
3690 	from_sub_text_len = state.len2;
3691 
3692 	/* Return unmodified source string if empty source or pattern */
3693 	if (src_text_len < 1 || from_sub_text_len < 1)
3694 	{
3695 		text_position_cleanup(&state);
3696 		PG_RETURN_TEXT_P(src_text);
3697 	}
3698 
3699 	start_posn = 1;
3700 	curr_posn = text_position_next(1, &state);
3701 
3702 	/* When the from_sub_text is not found, there is nothing to do. */
3703 	if (curr_posn == 0)
3704 	{
3705 		text_position_cleanup(&state);
3706 		PG_RETURN_TEXT_P(src_text);
3707 	}
3708 
3709 	/* start_ptr points to the start_posn'th character of src_text */
3710 	start_ptr = VARDATA_ANY(src_text);
3711 
3712 	initStringInfo(&str);
3713 
3714 	do
3715 	{
3716 		CHECK_FOR_INTERRUPTS();
3717 
3718 		/* copy the data skipped over by last text_position_next() */
3719 		chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3720 		appendBinaryStringInfo(&str, start_ptr, chunk_len);
3721 
3722 		appendStringInfoText(&str, to_sub_text);
3723 
3724 		start_posn = curr_posn;
3725 		start_ptr += chunk_len;
3726 		start_posn += from_sub_text_len;
3727 		start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3728 
3729 		curr_posn = text_position_next(start_posn, &state);
3730 	}
3731 	while (curr_posn > 0);
3732 
3733 	/* copy trailing data */
3734 	chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3735 	appendBinaryStringInfo(&str, start_ptr, chunk_len);
3736 
3737 	text_position_cleanup(&state);
3738 
3739 	ret_text = cstring_to_text_with_len(str.data, str.len);
3740 	pfree(str.data);
3741 
3742 	PG_RETURN_TEXT_P(ret_text);
3743 }
3744 
3745 /*
3746  * check_replace_text_has_escape_char
3747  *
3748  * check whether replace_text contains escape char.
3749  */
3750 static bool
check_replace_text_has_escape_char(const text * replace_text)3751 check_replace_text_has_escape_char(const text *replace_text)
3752 {
3753 	const char *p = VARDATA_ANY(replace_text);
3754 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3755 
3756 	if (pg_database_encoding_max_length() == 1)
3757 	{
3758 		for (; p < p_end; p++)
3759 		{
3760 			if (*p == '\\')
3761 				return true;
3762 		}
3763 	}
3764 	else
3765 	{
3766 		for (; p < p_end; p += pg_mblen(p))
3767 		{
3768 			if (*p == '\\')
3769 				return true;
3770 		}
3771 	}
3772 
3773 	return false;
3774 }
3775 
3776 /*
3777  * appendStringInfoRegexpSubstr
3778  *
3779  * Append replace_text to str, substituting regexp back references for
3780  * \n escapes.  start_ptr is the start of the match in the source string,
3781  * at logical character position data_pos.
3782  */
3783 static void
appendStringInfoRegexpSubstr(StringInfo str,text * replace_text,regmatch_t * pmatch,char * start_ptr,int data_pos)3784 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
3785 							 regmatch_t *pmatch,
3786 							 char *start_ptr, int data_pos)
3787 {
3788 	const char *p = VARDATA_ANY(replace_text);
3789 	const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3790 	int			eml = pg_database_encoding_max_length();
3791 
3792 	for (;;)
3793 	{
3794 		const char *chunk_start = p;
3795 		int			so;
3796 		int			eo;
3797 
3798 		/* Find next escape char. */
3799 		if (eml == 1)
3800 		{
3801 			for (; p < p_end && *p != '\\'; p++)
3802 				 /* nothing */ ;
3803 		}
3804 		else
3805 		{
3806 			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3807 				 /* nothing */ ;
3808 		}
3809 
3810 		/* Copy the text we just scanned over, if any. */
3811 		if (p > chunk_start)
3812 			appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3813 
3814 		/* Done if at end of string, else advance over escape char. */
3815 		if (p >= p_end)
3816 			break;
3817 		p++;
3818 
3819 		if (p >= p_end)
3820 		{
3821 			/* Escape at very end of input.  Treat same as unexpected char */
3822 			appendStringInfoChar(str, '\\');
3823 			break;
3824 		}
3825 
3826 		if (*p >= '1' && *p <= '9')
3827 		{
3828 			/* Use the back reference of regexp. */
3829 			int			idx = *p - '0';
3830 
3831 			so = pmatch[idx].rm_so;
3832 			eo = pmatch[idx].rm_eo;
3833 			p++;
3834 		}
3835 		else if (*p == '&')
3836 		{
3837 			/* Use the entire matched string. */
3838 			so = pmatch[0].rm_so;
3839 			eo = pmatch[0].rm_eo;
3840 			p++;
3841 		}
3842 		else if (*p == '\\')
3843 		{
3844 			/* \\ means transfer one \ to output. */
3845 			appendStringInfoChar(str, '\\');
3846 			p++;
3847 			continue;
3848 		}
3849 		else
3850 		{
3851 			/*
3852 			 * If escape char is not followed by any expected char, just treat
3853 			 * it as ordinary data to copy.  (XXX would it be better to throw
3854 			 * an error?)
3855 			 */
3856 			appendStringInfoChar(str, '\\');
3857 			continue;
3858 		}
3859 
3860 		if (so != -1 && eo != -1)
3861 		{
3862 			/*
3863 			 * Copy the text that is back reference of regexp.  Note so and eo
3864 			 * are counted in characters not bytes.
3865 			 */
3866 			char	   *chunk_start;
3867 			int			chunk_len;
3868 
3869 			Assert(so >= data_pos);
3870 			chunk_start = start_ptr;
3871 			chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3872 			chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3873 			appendBinaryStringInfo(str, chunk_start, chunk_len);
3874 		}
3875 	}
3876 }
3877 
3878 #define REGEXP_REPLACE_BACKREF_CNT		10
3879 
3880 /*
3881  * replace_text_regexp
3882  *
3883  * replace text that matches to regexp in src_text to replace_text.
3884  *
3885  * Note: to avoid having to include regex.h in builtins.h, we declare
3886  * the regexp argument as void *, but really it's regex_t *.
3887  */
3888 text *
replace_text_regexp(text * src_text,void * regexp,text * replace_text,bool glob)3889 replace_text_regexp(text *src_text, void *regexp,
3890 					text *replace_text, bool glob)
3891 {
3892 	text	   *ret_text;
3893 	regex_t    *re = (regex_t *) regexp;
3894 	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
3895 	StringInfoData buf;
3896 	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
3897 	pg_wchar   *data;
3898 	size_t		data_len;
3899 	int			search_start;
3900 	int			data_pos;
3901 	char	   *start_ptr;
3902 	bool		have_escape;
3903 
3904 	initStringInfo(&buf);
3905 
3906 	/* Convert data string to wide characters. */
3907 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3908 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3909 
3910 	/* Check whether replace_text has escape char. */
3911 	have_escape = check_replace_text_has_escape_char(replace_text);
3912 
3913 	/* start_ptr points to the data_pos'th character of src_text */
3914 	start_ptr = (char *) VARDATA_ANY(src_text);
3915 	data_pos = 0;
3916 
3917 	search_start = 0;
3918 	while (search_start <= data_len)
3919 	{
3920 		int			regexec_result;
3921 
3922 		CHECK_FOR_INTERRUPTS();
3923 
3924 		regexec_result = pg_regexec(re,
3925 									data,
3926 									data_len,
3927 									search_start,
3928 									NULL,		/* no details */
3929 									REGEXP_REPLACE_BACKREF_CNT,
3930 									pmatch,
3931 									0);
3932 
3933 		if (regexec_result == REG_NOMATCH)
3934 			break;
3935 
3936 		if (regexec_result != REG_OKAY)
3937 		{
3938 			char		errMsg[100];
3939 
3940 			CHECK_FOR_INTERRUPTS();
3941 			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3942 			ereport(ERROR,
3943 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3944 					 errmsg("regular expression failed: %s", errMsg)));
3945 		}
3946 
3947 		/*
3948 		 * Copy the text to the left of the match position.  Note we are given
3949 		 * character not byte indexes.
3950 		 */
3951 		if (pmatch[0].rm_so - data_pos > 0)
3952 		{
3953 			int			chunk_len;
3954 
3955 			chunk_len = charlen_to_bytelen(start_ptr,
3956 										   pmatch[0].rm_so - data_pos);
3957 			appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3958 
3959 			/*
3960 			 * Advance start_ptr over that text, to avoid multiple rescans of
3961 			 * it if the replace_text contains multiple back-references.
3962 			 */
3963 			start_ptr += chunk_len;
3964 			data_pos = pmatch[0].rm_so;
3965 		}
3966 
3967 		/*
3968 		 * Copy the replace_text. Process back references when the
3969 		 * replace_text has escape characters.
3970 		 */
3971 		if (have_escape)
3972 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3973 										 start_ptr, data_pos);
3974 		else
3975 			appendStringInfoText(&buf, replace_text);
3976 
3977 		/* Advance start_ptr and data_pos over the matched text. */
3978 		start_ptr += charlen_to_bytelen(start_ptr,
3979 										pmatch[0].rm_eo - data_pos);
3980 		data_pos = pmatch[0].rm_eo;
3981 
3982 		/*
3983 		 * When global option is off, replace the first instance only.
3984 		 */
3985 		if (!glob)
3986 			break;
3987 
3988 		/*
3989 		 * Advance search position.  Normally we start the next search at the
3990 		 * end of the previous match; but if the match was of zero length, we
3991 		 * have to advance by one character, or we'd just find the same match
3992 		 * again.
3993 		 */
3994 		search_start = data_pos;
3995 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
3996 			search_start++;
3997 	}
3998 
3999 	/*
4000 	 * Copy the text to the right of the last match.
4001 	 */
4002 	if (data_pos < data_len)
4003 	{
4004 		int			chunk_len;
4005 
4006 		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4007 		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4008 	}
4009 
4010 	ret_text = cstring_to_text_with_len(buf.data, buf.len);
4011 	pfree(buf.data);
4012 	pfree(data);
4013 
4014 	return ret_text;
4015 }
4016 
4017 /*
4018  * split_text
4019  * parse input string
4020  * return ord item (1 based)
4021  * based on provided field separator
4022  */
4023 Datum
split_text(PG_FUNCTION_ARGS)4024 split_text(PG_FUNCTION_ARGS)
4025 {
4026 	text	   *inputstring = PG_GETARG_TEXT_PP(0);
4027 	text	   *fldsep = PG_GETARG_TEXT_PP(1);
4028 	int			fldnum = PG_GETARG_INT32(2);
4029 	int			inputstring_len;
4030 	int			fldsep_len;
4031 	TextPositionState state;
4032 	int			start_posn;
4033 	int			end_posn;
4034 	text	   *result_text;
4035 
4036 	/* field number is 1 based */
4037 	if (fldnum < 1)
4038 		ereport(ERROR,
4039 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4040 				 errmsg("field position must be greater than zero")));
4041 
4042 	text_position_setup(inputstring, fldsep, &state);
4043 
4044 	/*
4045 	 * Note: we check the converted string length, not the original, because
4046 	 * they could be different if the input contained invalid encoding.
4047 	 */
4048 	inputstring_len = state.len1;
4049 	fldsep_len = state.len2;
4050 
4051 	/* return empty string for empty input string */
4052 	if (inputstring_len < 1)
4053 	{
4054 		text_position_cleanup(&state);
4055 		PG_RETURN_TEXT_P(cstring_to_text(""));
4056 	}
4057 
4058 	/* empty field separator */
4059 	if (fldsep_len < 1)
4060 	{
4061 		text_position_cleanup(&state);
4062 		/* if first field, return input string, else empty string */
4063 		if (fldnum == 1)
4064 			PG_RETURN_TEXT_P(inputstring);
4065 		else
4066 			PG_RETURN_TEXT_P(cstring_to_text(""));
4067 	}
4068 
4069 	/* identify bounds of first field */
4070 	start_posn = 1;
4071 	end_posn = text_position_next(1, &state);
4072 
4073 	/* special case if fldsep not found at all */
4074 	if (end_posn == 0)
4075 	{
4076 		text_position_cleanup(&state);
4077 		/* if field 1 requested, return input string, else empty string */
4078 		if (fldnum == 1)
4079 			PG_RETURN_TEXT_P(inputstring);
4080 		else
4081 			PG_RETURN_TEXT_P(cstring_to_text(""));
4082 	}
4083 
4084 	while (end_posn > 0 && --fldnum > 0)
4085 	{
4086 		/* identify bounds of next field */
4087 		start_posn = end_posn + fldsep_len;
4088 		end_posn = text_position_next(start_posn, &state);
4089 	}
4090 
4091 	text_position_cleanup(&state);
4092 
4093 	if (fldnum > 0)
4094 	{
4095 		/* N'th field separator not found */
4096 		/* if last field requested, return it, else empty string */
4097 		if (fldnum == 1)
4098 			result_text = text_substring(PointerGetDatum(inputstring),
4099 										 start_posn,
4100 										 -1,
4101 										 true);
4102 		else
4103 			result_text = cstring_to_text("");
4104 	}
4105 	else
4106 	{
4107 		/* non-last field requested */
4108 		result_text = text_substring(PointerGetDatum(inputstring),
4109 									 start_posn,
4110 									 end_posn - start_posn,
4111 									 false);
4112 	}
4113 
4114 	PG_RETURN_TEXT_P(result_text);
4115 }
4116 
4117 /*
4118  * Convenience function to return true when two text params are equal.
4119  */
4120 static bool
text_isequal(text * txt1,text * txt2)4121 text_isequal(text *txt1, text *txt2)
4122 {
4123 	return DatumGetBool(DirectFunctionCall2(texteq,
4124 											PointerGetDatum(txt1),
4125 											PointerGetDatum(txt2)));
4126 }
4127 
4128 /*
4129  * text_to_array
4130  * parse input string and return text array of elements,
4131  * based on provided field separator
4132  */
4133 Datum
text_to_array(PG_FUNCTION_ARGS)4134 text_to_array(PG_FUNCTION_ARGS)
4135 {
4136 	return text_to_array_internal(fcinfo);
4137 }
4138 
4139 /*
4140  * text_to_array_null
4141  * parse input string and return text array of elements,
4142  * based on provided field separator and null string
4143  *
4144  * This is a separate entry point only to prevent the regression tests from
4145  * complaining about different argument sets for the same internal function.
4146  */
4147 Datum
text_to_array_null(PG_FUNCTION_ARGS)4148 text_to_array_null(PG_FUNCTION_ARGS)
4149 {
4150 	return text_to_array_internal(fcinfo);
4151 }
4152 
4153 /*
4154  * common code for text_to_array and text_to_array_null functions
4155  *
4156  * These are not strict so we have to test for null inputs explicitly.
4157  */
4158 static Datum
text_to_array_internal(PG_FUNCTION_ARGS)4159 text_to_array_internal(PG_FUNCTION_ARGS)
4160 {
4161 	text	   *inputstring;
4162 	text	   *fldsep;
4163 	text	   *null_string;
4164 	int			inputstring_len;
4165 	int			fldsep_len;
4166 	char	   *start_ptr;
4167 	text	   *result_text;
4168 	bool		is_null;
4169 	ArrayBuildState *astate = NULL;
4170 
4171 	/* when input string is NULL, then result is NULL too */
4172 	if (PG_ARGISNULL(0))
4173 		PG_RETURN_NULL();
4174 
4175 	inputstring = PG_GETARG_TEXT_PP(0);
4176 
4177 	/* fldsep can be NULL */
4178 	if (!PG_ARGISNULL(1))
4179 		fldsep = PG_GETARG_TEXT_PP(1);
4180 	else
4181 		fldsep = NULL;
4182 
4183 	/* null_string can be NULL or omitted */
4184 	if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4185 		null_string = PG_GETARG_TEXT_PP(2);
4186 	else
4187 		null_string = NULL;
4188 
4189 	if (fldsep != NULL)
4190 	{
4191 		/*
4192 		 * Normal case with non-null fldsep.  Use the text_position machinery
4193 		 * to search for occurrences of fldsep.
4194 		 */
4195 		TextPositionState state;
4196 		int			fldnum;
4197 		int			start_posn;
4198 		int			end_posn;
4199 		int			chunk_len;
4200 
4201 		text_position_setup(inputstring, fldsep, &state);
4202 
4203 		/*
4204 		 * Note: we check the converted string length, not the original,
4205 		 * because they could be different if the input contained invalid
4206 		 * encoding.
4207 		 */
4208 		inputstring_len = state.len1;
4209 		fldsep_len = state.len2;
4210 
4211 		/* return empty array for empty input string */
4212 		if (inputstring_len < 1)
4213 		{
4214 			text_position_cleanup(&state);
4215 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4216 		}
4217 
4218 		/*
4219 		 * empty field separator: return the input string as a one-element
4220 		 * array
4221 		 */
4222 		if (fldsep_len < 1)
4223 		{
4224 			text_position_cleanup(&state);
4225 			/* single element can be a NULL too */
4226 			is_null = null_string ? text_isequal(inputstring, null_string) : false;
4227 			PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID,
4228 												PointerGetDatum(inputstring),
4229 														 is_null, 1));
4230 		}
4231 
4232 		start_posn = 1;
4233 		/* start_ptr points to the start_posn'th character of inputstring */
4234 		start_ptr = VARDATA_ANY(inputstring);
4235 
4236 		for (fldnum = 1;; fldnum++)		/* field number is 1 based */
4237 		{
4238 			CHECK_FOR_INTERRUPTS();
4239 
4240 			end_posn = text_position_next(start_posn, &state);
4241 
4242 			if (end_posn == 0)
4243 			{
4244 				/* fetch last field */
4245 				chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4246 			}
4247 			else
4248 			{
4249 				/* fetch non-last field */
4250 				chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
4251 			}
4252 
4253 			/* must build a temp text datum to pass to accumArrayResult */
4254 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4255 			is_null = null_string ? text_isequal(result_text, null_string) : false;
4256 
4257 			/* stash away this field */
4258 			astate = accumArrayResult(astate,
4259 									  PointerGetDatum(result_text),
4260 									  is_null,
4261 									  TEXTOID,
4262 									  CurrentMemoryContext);
4263 
4264 			pfree(result_text);
4265 
4266 			if (end_posn == 0)
4267 				break;
4268 
4269 			start_posn = end_posn;
4270 			start_ptr += chunk_len;
4271 			start_posn += fldsep_len;
4272 			start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
4273 		}
4274 
4275 		text_position_cleanup(&state);
4276 	}
4277 	else
4278 	{
4279 		/*
4280 		 * When fldsep is NULL, each character in the inputstring becomes an
4281 		 * element in the result array.  The separator is effectively the
4282 		 * space between characters.
4283 		 */
4284 		inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4285 
4286 		/* return empty array for empty input string */
4287 		if (inputstring_len < 1)
4288 			PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4289 
4290 		start_ptr = VARDATA_ANY(inputstring);
4291 
4292 		while (inputstring_len > 0)
4293 		{
4294 			int			chunk_len = pg_mblen(start_ptr);
4295 
4296 			CHECK_FOR_INTERRUPTS();
4297 
4298 			/* must build a temp text datum to pass to accumArrayResult */
4299 			result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4300 			is_null = null_string ? text_isequal(result_text, null_string) : false;
4301 
4302 			/* stash away this field */
4303 			astate = accumArrayResult(astate,
4304 									  PointerGetDatum(result_text),
4305 									  is_null,
4306 									  TEXTOID,
4307 									  CurrentMemoryContext);
4308 
4309 			pfree(result_text);
4310 
4311 			start_ptr += chunk_len;
4312 			inputstring_len -= chunk_len;
4313 		}
4314 	}
4315 
4316 	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4317 										  CurrentMemoryContext));
4318 }
4319 
4320 /*
4321  * array_to_text
4322  * concatenate Cstring representation of input array elements
4323  * using provided field separator
4324  */
4325 Datum
array_to_text(PG_FUNCTION_ARGS)4326 array_to_text(PG_FUNCTION_ARGS)
4327 {
4328 	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4329 	char	   *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4330 
4331 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4332 }
4333 
4334 /*
4335  * array_to_text_null
4336  * concatenate Cstring representation of input array elements
4337  * using provided field separator and null string
4338  *
4339  * This version is not strict so we have to test for null inputs explicitly.
4340  */
4341 Datum
array_to_text_null(PG_FUNCTION_ARGS)4342 array_to_text_null(PG_FUNCTION_ARGS)
4343 {
4344 	ArrayType  *v;
4345 	char	   *fldsep;
4346 	char	   *null_string;
4347 
4348 	/* returns NULL when first or second parameter is NULL */
4349 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4350 		PG_RETURN_NULL();
4351 
4352 	v = PG_GETARG_ARRAYTYPE_P(0);
4353 	fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4354 
4355 	/* NULL null string is passed through as a null pointer */
4356 	if (!PG_ARGISNULL(2))
4357 		null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4358 	else
4359 		null_string = NULL;
4360 
4361 	PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4362 }
4363 
4364 /*
4365  * common code for array_to_text and array_to_text_null functions
4366  */
4367 static text *
array_to_text_internal(FunctionCallInfo fcinfo,ArrayType * v,const char * fldsep,const char * null_string)4368 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4369 					   const char *fldsep, const char *null_string)
4370 {
4371 	text	   *result;
4372 	int			nitems,
4373 			   *dims,
4374 				ndims;
4375 	Oid			element_type;
4376 	int			typlen;
4377 	bool		typbyval;
4378 	char		typalign;
4379 	StringInfoData buf;
4380 	bool		printed = false;
4381 	char	   *p;
4382 	bits8	   *bitmap;
4383 	int			bitmask;
4384 	int			i;
4385 	ArrayMetaState *my_extra;
4386 
4387 	ndims = ARR_NDIM(v);
4388 	dims = ARR_DIMS(v);
4389 	nitems = ArrayGetNItems(ndims, dims);
4390 
4391 	/* if there are no elements, return an empty string */
4392 	if (nitems == 0)
4393 		return cstring_to_text_with_len("", 0);
4394 
4395 	element_type = ARR_ELEMTYPE(v);
4396 	initStringInfo(&buf);
4397 
4398 	/*
4399 	 * We arrange to look up info about element type, including its output
4400 	 * conversion proc, only once per series of calls, assuming the element
4401 	 * type doesn't change underneath us.
4402 	 */
4403 	my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4404 	if (my_extra == NULL)
4405 	{
4406 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4407 													  sizeof(ArrayMetaState));
4408 		my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4409 		my_extra->element_type = ~element_type;
4410 	}
4411 
4412 	if (my_extra->element_type != element_type)
4413 	{
4414 		/*
4415 		 * Get info about element type, including its output conversion proc
4416 		 */
4417 		get_type_io_data(element_type, IOFunc_output,
4418 						 &my_extra->typlen, &my_extra->typbyval,
4419 						 &my_extra->typalign, &my_extra->typdelim,
4420 						 &my_extra->typioparam, &my_extra->typiofunc);
4421 		fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4422 					  fcinfo->flinfo->fn_mcxt);
4423 		my_extra->element_type = element_type;
4424 	}
4425 	typlen = my_extra->typlen;
4426 	typbyval = my_extra->typbyval;
4427 	typalign = my_extra->typalign;
4428 
4429 	p = ARR_DATA_PTR(v);
4430 	bitmap = ARR_NULLBITMAP(v);
4431 	bitmask = 1;
4432 
4433 	for (i = 0; i < nitems; i++)
4434 	{
4435 		Datum		itemvalue;
4436 		char	   *value;
4437 
4438 		/* Get source element, checking for NULL */
4439 		if (bitmap && (*bitmap & bitmask) == 0)
4440 		{
4441 			/* if null_string is NULL, we just ignore null elements */
4442 			if (null_string != NULL)
4443 			{
4444 				if (printed)
4445 					appendStringInfo(&buf, "%s%s", fldsep, null_string);
4446 				else
4447 					appendStringInfoString(&buf, null_string);
4448 				printed = true;
4449 			}
4450 		}
4451 		else
4452 		{
4453 			itemvalue = fetch_att(p, typbyval, typlen);
4454 
4455 			value = OutputFunctionCall(&my_extra->proc, itemvalue);
4456 
4457 			if (printed)
4458 				appendStringInfo(&buf, "%s%s", fldsep, value);
4459 			else
4460 				appendStringInfoString(&buf, value);
4461 			printed = true;
4462 
4463 			p = att_addlength_pointer(p, typlen, p);
4464 			p = (char *) att_align_nominal(p, typalign);
4465 		}
4466 
4467 		/* advance bitmap pointer if any */
4468 		if (bitmap)
4469 		{
4470 			bitmask <<= 1;
4471 			if (bitmask == 0x100)
4472 			{
4473 				bitmap++;
4474 				bitmask = 1;
4475 			}
4476 		}
4477 	}
4478 
4479 	result = cstring_to_text_with_len(buf.data, buf.len);
4480 	pfree(buf.data);
4481 
4482 	return result;
4483 }
4484 
4485 #define HEXBASE 16
4486 /*
4487  * Convert an int32 to a string containing a base 16 (hex) representation of
4488  * the number.
4489  */
4490 Datum
to_hex32(PG_FUNCTION_ARGS)4491 to_hex32(PG_FUNCTION_ARGS)
4492 {
4493 	uint32		value = (uint32) PG_GETARG_INT32(0);
4494 	char	   *ptr;
4495 	const char *digits = "0123456789abcdef";
4496 	char		buf[32];		/* bigger than needed, but reasonable */
4497 
4498 	ptr = buf + sizeof(buf) - 1;
4499 	*ptr = '\0';
4500 
4501 	do
4502 	{
4503 		*--ptr = digits[value % HEXBASE];
4504 		value /= HEXBASE;
4505 	} while (ptr > buf && value);
4506 
4507 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
4508 }
4509 
4510 /*
4511  * Convert an int64 to a string containing a base 16 (hex) representation of
4512  * the number.
4513  */
4514 Datum
to_hex64(PG_FUNCTION_ARGS)4515 to_hex64(PG_FUNCTION_ARGS)
4516 {
4517 	uint64		value = (uint64) PG_GETARG_INT64(0);
4518 	char	   *ptr;
4519 	const char *digits = "0123456789abcdef";
4520 	char		buf[32];		/* bigger than needed, but reasonable */
4521 
4522 	ptr = buf + sizeof(buf) - 1;
4523 	*ptr = '\0';
4524 
4525 	do
4526 	{
4527 		*--ptr = digits[value % HEXBASE];
4528 		value /= HEXBASE;
4529 	} while (ptr > buf && value);
4530 
4531 	PG_RETURN_TEXT_P(cstring_to_text(ptr));
4532 }
4533 
4534 /*
4535  * Create an md5 hash of a text string and return it as hex
4536  *
4537  * md5 produces a 16 byte (128 bit) hash; double it for hex
4538  */
4539 #define MD5_HASH_LEN  32
4540 
4541 Datum
md5_text(PG_FUNCTION_ARGS)4542 md5_text(PG_FUNCTION_ARGS)
4543 {
4544 	text	   *in_text = PG_GETARG_TEXT_PP(0);
4545 	size_t		len;
4546 	char		hexsum[MD5_HASH_LEN + 1];
4547 
4548 	/* Calculate the length of the buffer using varlena metadata */
4549 	len = VARSIZE_ANY_EXHDR(in_text);
4550 
4551 	/* get the hash result */
4552 	if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4553 		ereport(ERROR,
4554 				(errcode(ERRCODE_OUT_OF_MEMORY),
4555 				 errmsg("out of memory")));
4556 
4557 	/* convert to text and return it */
4558 	PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4559 }
4560 
4561 /*
4562  * Create an md5 hash of a bytea field and return it as a hex string:
4563  * 16-byte md5 digest is represented in 32 hex characters.
4564  */
4565 Datum
md5_bytea(PG_FUNCTION_ARGS)4566 md5_bytea(PG_FUNCTION_ARGS)
4567 {
4568 	bytea	   *in = PG_GETARG_BYTEA_PP(0);
4569 	size_t		len;
4570 	char		hexsum[MD5_HASH_LEN + 1];
4571 
4572 	len = VARSIZE_ANY_EXHDR(in);
4573 	if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4574 		ereport(ERROR,
4575 				(errcode(ERRCODE_OUT_OF_MEMORY),
4576 				 errmsg("out of memory")));
4577 
4578 	PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4579 }
4580 
4581 /*
4582  * Return the size of a datum, possibly compressed
4583  *
4584  * Works on any data type
4585  */
4586 Datum
pg_column_size(PG_FUNCTION_ARGS)4587 pg_column_size(PG_FUNCTION_ARGS)
4588 {
4589 	Datum		value = PG_GETARG_DATUM(0);
4590 	int32		result;
4591 	int			typlen;
4592 
4593 	/* On first call, get the input type's typlen, and save at *fn_extra */
4594 	if (fcinfo->flinfo->fn_extra == NULL)
4595 	{
4596 		/* Lookup the datatype of the supplied argument */
4597 		Oid			argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4598 
4599 		typlen = get_typlen(argtypeid);
4600 		if (typlen == 0)		/* should not happen */
4601 			elog(ERROR, "cache lookup failed for type %u", argtypeid);
4602 
4603 		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4604 													  sizeof(int));
4605 		*((int *) fcinfo->flinfo->fn_extra) = typlen;
4606 	}
4607 	else
4608 		typlen = *((int *) fcinfo->flinfo->fn_extra);
4609 
4610 	if (typlen == -1)
4611 	{
4612 		/* varlena type, possibly toasted */
4613 		result = toast_datum_size(value);
4614 	}
4615 	else if (typlen == -2)
4616 	{
4617 		/* cstring */
4618 		result = strlen(DatumGetCString(value)) + 1;
4619 	}
4620 	else
4621 	{
4622 		/* ordinary fixed-width type */
4623 		result = typlen;
4624 	}
4625 
4626 	PG_RETURN_INT32(result);
4627 }
4628 
4629 /*
4630  * string_agg - Concatenates values and returns string.
4631  *
4632  * Syntax: string_agg(value text, delimiter text) RETURNS text
4633  *
4634  * Note: Any NULL values are ignored. The first-call delimiter isn't
4635  * actually used at all, and on subsequent calls the delimiter precedes
4636  * the associated value.
4637  */
4638 
4639 /* subroutine to initialize state */
4640 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo)4641 makeStringAggState(FunctionCallInfo fcinfo)
4642 {
4643 	StringInfo	state;
4644 	MemoryContext aggcontext;
4645 	MemoryContext oldcontext;
4646 
4647 	if (!AggCheckCallContext(fcinfo, &aggcontext))
4648 	{
4649 		/* cannot be called directly because of internal-type argument */
4650 		elog(ERROR, "string_agg_transfn called in non-aggregate context");
4651 	}
4652 
4653 	/*
4654 	 * Create state in aggregate context.  It'll stay there across subsequent
4655 	 * calls.
4656 	 */
4657 	oldcontext = MemoryContextSwitchTo(aggcontext);
4658 	state = makeStringInfo();
4659 	MemoryContextSwitchTo(oldcontext);
4660 
4661 	return state;
4662 }
4663 
4664 Datum
string_agg_transfn(PG_FUNCTION_ARGS)4665 string_agg_transfn(PG_FUNCTION_ARGS)
4666 {
4667 	StringInfo	state;
4668 
4669 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4670 
4671 	/* Append the value unless null. */
4672 	if (!PG_ARGISNULL(1))
4673 	{
4674 		/* On the first time through, we ignore the delimiter. */
4675 		if (state == NULL)
4676 			state = makeStringAggState(fcinfo);
4677 		else if (!PG_ARGISNULL(2))
4678 			appendStringInfoText(state, PG_GETARG_TEXT_PP(2));	/* delimiter */
4679 
4680 		appendStringInfoText(state, PG_GETARG_TEXT_PP(1));		/* value */
4681 	}
4682 
4683 	/*
4684 	 * The transition type for string_agg() is declared to be "internal",
4685 	 * which is a pass-by-value type the same size as a pointer.
4686 	 */
4687 	PG_RETURN_POINTER(state);
4688 }
4689 
4690 Datum
string_agg_finalfn(PG_FUNCTION_ARGS)4691 string_agg_finalfn(PG_FUNCTION_ARGS)
4692 {
4693 	StringInfo	state;
4694 
4695 	/* cannot be called directly because of internal-type argument */
4696 	Assert(AggCheckCallContext(fcinfo, NULL));
4697 
4698 	state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4699 
4700 	if (state != NULL)
4701 		PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
4702 	else
4703 		PG_RETURN_NULL();
4704 }
4705 
4706 /*
4707  * Implementation of both concat() and concat_ws().
4708  *
4709  * sepstr is the separator string to place between values.
4710  * argidx identifies the first argument to concatenate (counting from zero).
4711  * Returns NULL if result should be NULL, else text value.
4712  */
4713 static text *
concat_internal(const char * sepstr,int argidx,FunctionCallInfo fcinfo)4714 concat_internal(const char *sepstr, int argidx,
4715 				FunctionCallInfo fcinfo)
4716 {
4717 	text	   *result;
4718 	StringInfoData str;
4719 	bool		first_arg = true;
4720 	int			i;
4721 
4722 	/*
4723 	 * concat(VARIADIC some-array) is essentially equivalent to
4724 	 * array_to_text(), ie concat the array elements with the given separator.
4725 	 * So we just pass the case off to that code.
4726 	 */
4727 	if (get_fn_expr_variadic(fcinfo->flinfo))
4728 	{
4729 		ArrayType  *arr;
4730 
4731 		/* Should have just the one argument */
4732 		Assert(argidx == PG_NARGS() - 1);
4733 
4734 		/* concat(VARIADIC NULL) is defined as NULL */
4735 		if (PG_ARGISNULL(argidx))
4736 			return NULL;
4737 
4738 		/*
4739 		 * Non-null argument had better be an array.  We assume that any call
4740 		 * context that could let get_fn_expr_variadic return true will have
4741 		 * checked that a VARIADIC-labeled parameter actually is an array.  So
4742 		 * it should be okay to just Assert that it's an array rather than
4743 		 * doing a full-fledged error check.
4744 		 */
4745 		Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
4746 
4747 		/* OK, safe to fetch the array value */
4748 		arr = PG_GETARG_ARRAYTYPE_P(argidx);
4749 
4750 		/*
4751 		 * And serialize the array.  We tell array_to_text to ignore null
4752 		 * elements, which matches the behavior of the loop below.
4753 		 */
4754 		return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4755 	}
4756 
4757 	/* Normal case without explicit VARIADIC marker */
4758 	initStringInfo(&str);
4759 
4760 	for (i = argidx; i < PG_NARGS(); i++)
4761 	{
4762 		if (!PG_ARGISNULL(i))
4763 		{
4764 			Datum		value = PG_GETARG_DATUM(i);
4765 			Oid			valtype;
4766 			Oid			typOutput;
4767 			bool		typIsVarlena;
4768 
4769 			/* add separator if appropriate */
4770 			if (first_arg)
4771 				first_arg = false;
4772 			else
4773 				appendStringInfoString(&str, sepstr);
4774 
4775 			/* call the appropriate type output function, append the result */
4776 			valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4777 			if (!OidIsValid(valtype))
4778 				elog(ERROR, "could not determine data type of concat() input");
4779 			getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4780 			appendStringInfoString(&str,
4781 								   OidOutputFunctionCall(typOutput, value));
4782 		}
4783 	}
4784 
4785 	result = cstring_to_text_with_len(str.data, str.len);
4786 	pfree(str.data);
4787 
4788 	return result;
4789 }
4790 
4791 /*
4792  * Concatenate all arguments. NULL arguments are ignored.
4793  */
4794 Datum
text_concat(PG_FUNCTION_ARGS)4795 text_concat(PG_FUNCTION_ARGS)
4796 {
4797 	text	   *result;
4798 
4799 	result = concat_internal("", 0, fcinfo);
4800 	if (result == NULL)
4801 		PG_RETURN_NULL();
4802 	PG_RETURN_TEXT_P(result);
4803 }
4804 
4805 /*
4806  * Concatenate all but first argument value with separators. The first
4807  * parameter is used as the separator. NULL arguments are ignored.
4808  */
4809 Datum
text_concat_ws(PG_FUNCTION_ARGS)4810 text_concat_ws(PG_FUNCTION_ARGS)
4811 {
4812 	char	   *sep;
4813 	text	   *result;
4814 
4815 	/* return NULL when separator is NULL */
4816 	if (PG_ARGISNULL(0))
4817 		PG_RETURN_NULL();
4818 	sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
4819 
4820 	result = concat_internal(sep, 1, fcinfo);
4821 	if (result == NULL)
4822 		PG_RETURN_NULL();
4823 	PG_RETURN_TEXT_P(result);
4824 }
4825 
4826 /*
4827  * Return first n characters in the string. When n is negative,
4828  * return all but last |n| characters.
4829  */
4830 Datum
text_left(PG_FUNCTION_ARGS)4831 text_left(PG_FUNCTION_ARGS)
4832 {
4833 	text	   *str = PG_GETARG_TEXT_PP(0);
4834 	const char *p = VARDATA_ANY(str);
4835 	int			len = VARSIZE_ANY_EXHDR(str);
4836 	int			n = PG_GETARG_INT32(1);
4837 	int			rlen;
4838 
4839 	if (n < 0)
4840 		n = pg_mbstrlen_with_len(p, len) + n;
4841 	rlen = pg_mbcharcliplen(p, len, n);
4842 
4843 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
4844 }
4845 
4846 /*
4847  * Return last n characters in the string. When n is negative,
4848  * return all but first |n| characters.
4849  */
4850 Datum
text_right(PG_FUNCTION_ARGS)4851 text_right(PG_FUNCTION_ARGS)
4852 {
4853 	text	   *str = PG_GETARG_TEXT_PP(0);
4854 	const char *p = VARDATA_ANY(str);
4855 	int			len = VARSIZE_ANY_EXHDR(str);
4856 	int			n = PG_GETARG_INT32(1);
4857 	int			off;
4858 
4859 	if (n < 0)
4860 		n = -n;
4861 	else
4862 		n = pg_mbstrlen_with_len(p, len) - n;
4863 	off = pg_mbcharcliplen(p, len, n);
4864 
4865 	PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4866 }
4867 
4868 /*
4869  * Return reversed string
4870  */
4871 Datum
text_reverse(PG_FUNCTION_ARGS)4872 text_reverse(PG_FUNCTION_ARGS)
4873 {
4874 	text	   *str = PG_GETARG_TEXT_PP(0);
4875 	const char *p = VARDATA_ANY(str);
4876 	int			len = VARSIZE_ANY_EXHDR(str);
4877 	const char *endp = p + len;
4878 	text	   *result;
4879 	char	   *dst;
4880 
4881 	result = palloc(len + VARHDRSZ);
4882 	dst = (char *) VARDATA(result) + len;
4883 	SET_VARSIZE(result, len + VARHDRSZ);
4884 
4885 	if (pg_database_encoding_max_length() > 1)
4886 	{
4887 		/* multibyte version */
4888 		while (p < endp)
4889 		{
4890 			int			sz;
4891 
4892 			sz = pg_mblen(p);
4893 			dst -= sz;
4894 			memcpy(dst, p, sz);
4895 			p += sz;
4896 		}
4897 	}
4898 	else
4899 	{
4900 		/* single byte version */
4901 		while (p < endp)
4902 			*(--dst) = *p++;
4903 	}
4904 
4905 	PG_RETURN_TEXT_P(result);
4906 }
4907 
4908 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * ADVANCE_PARSE_POINTER
 *
 * Pre-increment ptr, then raise an ERRCODE_INVALID_PARAMETER_VALUE error
 * ("unterminated format() type specifier") if the incremented pointer is at
 * or beyond end_ptr.  ptr is modified in place, so it must be an lvalue.
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon behaves as a single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
4922 
4923 /*
4924  * Returns a formatted string
4925  */
4926 Datum
text_format(PG_FUNCTION_ARGS)4927 text_format(PG_FUNCTION_ARGS)
4928 {
4929 	text	   *fmt;
4930 	StringInfoData str;
4931 	const char *cp;
4932 	const char *start_ptr;
4933 	const char *end_ptr;
4934 	text	   *result;
4935 	int			arg;
4936 	bool		funcvariadic;
4937 	int			nargs;
4938 	Datum	   *elements = NULL;
4939 	bool	   *nulls = NULL;
4940 	Oid			element_type = InvalidOid;
4941 	Oid			prev_type = InvalidOid;
4942 	Oid			prev_width_type = InvalidOid;
4943 	FmgrInfo	typoutputfinfo;
4944 	FmgrInfo	typoutputinfo_width;
4945 
4946 	/* When format string is null, immediately return null */
4947 	if (PG_ARGISNULL(0))
4948 		PG_RETURN_NULL();
4949 
4950 	/* If argument is marked VARIADIC, expand array into elements */
4951 	if (get_fn_expr_variadic(fcinfo->flinfo))
4952 	{
4953 		ArrayType  *arr;
4954 		int16		elmlen;
4955 		bool		elmbyval;
4956 		char		elmalign;
4957 		int			nitems;
4958 
4959 		/* Should have just the one argument */
4960 		Assert(PG_NARGS() == 2);
4961 
4962 		/* If argument is NULL, we treat it as zero-length array */
4963 		if (PG_ARGISNULL(1))
4964 			nitems = 0;
4965 		else
4966 		{
4967 			/*
4968 			 * Non-null argument had better be an array.  We assume that any
4969 			 * call context that could let get_fn_expr_variadic return true
4970 			 * will have checked that a VARIADIC-labeled parameter actually is
4971 			 * an array.  So it should be okay to just Assert that it's an
4972 			 * array rather than doing a full-fledged error check.
4973 			 */
4974 			Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
4975 
4976 			/* OK, safe to fetch the array value */
4977 			arr = PG_GETARG_ARRAYTYPE_P(1);
4978 
4979 			/* Get info about array element type */
4980 			element_type = ARR_ELEMTYPE(arr);
4981 			get_typlenbyvalalign(element_type,
4982 								 &elmlen, &elmbyval, &elmalign);
4983 
4984 			/* Extract all array elements */
4985 			deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
4986 							  &elements, &nulls, &nitems);
4987 		}
4988 
4989 		nargs = nitems + 1;
4990 		funcvariadic = true;
4991 	}
4992 	else
4993 	{
4994 		/* Non-variadic case, we'll process the arguments individually */
4995 		nargs = PG_NARGS();
4996 		funcvariadic = false;
4997 	}
4998 
4999 	/* Setup for main loop. */
5000 	fmt = PG_GETARG_TEXT_PP(0);
5001 	start_ptr = VARDATA_ANY(fmt);
5002 	end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5003 	initStringInfo(&str);
5004 	arg = 1;					/* next argument position to print */
5005 
5006 	/* Scan format string, looking for conversion specifiers. */
5007 	for (cp = start_ptr; cp < end_ptr; cp++)
5008 	{
5009 		int			argpos;
5010 		int			widthpos;
5011 		int			flags;
5012 		int			width;
5013 		Datum		value;
5014 		bool		isNull;
5015 		Oid			typid;
5016 
5017 		/*
5018 		 * If it's not the start of a conversion specifier, just copy it to
5019 		 * the output buffer.
5020 		 */
5021 		if (*cp != '%')
5022 		{
5023 			appendStringInfoCharMacro(&str, *cp);
5024 			continue;
5025 		}
5026 
5027 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5028 
5029 		/* Easy case: %% outputs a single % */
5030 		if (*cp == '%')
5031 		{
5032 			appendStringInfoCharMacro(&str, *cp);
5033 			continue;
5034 		}
5035 
5036 		/* Parse the optional portions of the format specifier */
5037 		cp = text_format_parse_format(cp, end_ptr,
5038 									  &argpos, &widthpos,
5039 									  &flags, &width);
5040 
5041 		/*
5042 		 * Next we should see the main conversion specifier.  Whether or not
5043 		 * an argument position was present, it's known that at least one
5044 		 * character remains in the string at this point.  Experience suggests
5045 		 * that it's worth checking that that character is one of the expected
5046 		 * ones before we try to fetch arguments, so as to produce the least
5047 		 * confusing response to a mis-formatted specifier.
5048 		 */
5049 		if (strchr("sIL", *cp) == NULL)
5050 			ereport(ERROR,
5051 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5052 					 errmsg("unrecognized format() type specifier \"%c\"",
5053 							*cp),
5054 					 errhint("For a single \"%%\" use \"%%%%\".")));
5055 
5056 		/* If indirect width was specified, get its value */
5057 		if (widthpos >= 0)
5058 		{
5059 			/* Collect the specified or next argument position */
5060 			if (widthpos > 0)
5061 				arg = widthpos;
5062 			if (arg >= nargs)
5063 				ereport(ERROR,
5064 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5065 						 errmsg("too few arguments for format()")));
5066 
5067 			/* Get the value and type of the selected argument */
5068 			if (!funcvariadic)
5069 			{
5070 				value = PG_GETARG_DATUM(arg);
5071 				isNull = PG_ARGISNULL(arg);
5072 				typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5073 			}
5074 			else
5075 			{
5076 				value = elements[arg - 1];
5077 				isNull = nulls[arg - 1];
5078 				typid = element_type;
5079 			}
5080 			if (!OidIsValid(typid))
5081 				elog(ERROR, "could not determine data type of format() input");
5082 
5083 			arg++;
5084 
5085 			/* We can treat NULL width the same as zero */
5086 			if (isNull)
5087 				width = 0;
5088 			else if (typid == INT4OID)
5089 				width = DatumGetInt32(value);
5090 			else if (typid == INT2OID)
5091 				width = DatumGetInt16(value);
5092 			else
5093 			{
5094 				/* For less-usual datatypes, convert to text then to int */
5095 				char	   *str;
5096 
5097 				if (typid != prev_width_type)
5098 				{
5099 					Oid			typoutputfunc;
5100 					bool		typIsVarlena;
5101 
5102 					getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5103 					fmgr_info(typoutputfunc, &typoutputinfo_width);
5104 					prev_width_type = typid;
5105 				}
5106 
5107 				str = OutputFunctionCall(&typoutputinfo_width, value);
5108 
5109 				/* pg_atoi will complain about bad data or overflow */
5110 				width = pg_atoi(str, sizeof(int), '\0');
5111 
5112 				pfree(str);
5113 			}
5114 		}
5115 
5116 		/* Collect the specified or next argument position */
5117 		if (argpos > 0)
5118 			arg = argpos;
5119 		if (arg >= nargs)
5120 			ereport(ERROR,
5121 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5122 					 errmsg("too few arguments for format()")));
5123 
5124 		/* Get the value and type of the selected argument */
5125 		if (!funcvariadic)
5126 		{
5127 			value = PG_GETARG_DATUM(arg);
5128 			isNull = PG_ARGISNULL(arg);
5129 			typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5130 		}
5131 		else
5132 		{
5133 			value = elements[arg - 1];
5134 			isNull = nulls[arg - 1];
5135 			typid = element_type;
5136 		}
5137 		if (!OidIsValid(typid))
5138 			elog(ERROR, "could not determine data type of format() input");
5139 
5140 		arg++;
5141 
5142 		/*
5143 		 * Get the appropriate typOutput function, reusing previous one if
5144 		 * same type as previous argument.  That's particularly useful in the
5145 		 * variadic-array case, but often saves work even for ordinary calls.
5146 		 */
5147 		if (typid != prev_type)
5148 		{
5149 			Oid			typoutputfunc;
5150 			bool		typIsVarlena;
5151 
5152 			getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5153 			fmgr_info(typoutputfunc, &typoutputfinfo);
5154 			prev_type = typid;
5155 		}
5156 
5157 		/*
5158 		 * And now we can format the value.
5159 		 */
5160 		switch (*cp)
5161 		{
5162 			case 's':
5163 			case 'I':
5164 			case 'L':
5165 				text_format_string_conversion(&str, *cp, &typoutputfinfo,
5166 											  value, isNull,
5167 											  flags, width);
5168 				break;
5169 			default:
5170 				/* should not get here, because of previous check */
5171 				ereport(ERROR,
5172 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5173 						 errmsg("unrecognized format() type specifier \"%c\"",
5174 								*cp),
5175 						 errhint("For a single \"%%\" use \"%%%%\".")));
5176 				break;
5177 		}
5178 	}
5179 
5180 	/* Don't need deconstruct_array results anymore. */
5181 	if (elements != NULL)
5182 		pfree(elements);
5183 	if (nulls != NULL)
5184 		pfree(nulls);
5185 
5186 	/* Generate results. */
5187 	result = cstring_to_text_with_len(str.data, str.len);
5188 	pfree(str.data);
5189 
5190 	PG_RETURN_TEXT_P(result);
5191 }
5192 
5193 /*
5194  * Parse contiguous digits as a decimal number.
5195  *
5196  * Returns true if some digits could be parsed.
5197  * The value is returned into *value, and *ptr is advanced to the next
5198  * character to be parsed.
5199  *
5200  * Note parsing invariant: at least one character is known available before
5201  * string end (end_ptr) at entry, and this is still true at exit.
5202  */
5203 static bool
text_format_parse_digits(const char ** ptr,const char * end_ptr,int * value)5204 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5205 {
5206 	bool		found = false;
5207 	const char *cp = *ptr;
5208 	int			val = 0;
5209 
5210 	while (*cp >= '0' && *cp <= '9')
5211 	{
5212 		int			newval = val * 10 + (*cp - '0');
5213 
5214 		if (newval / 10 != val) /* overflow? */
5215 			ereport(ERROR,
5216 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5217 					 errmsg("number is out of range")));
5218 		val = newval;
5219 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5220 		found = true;
5221 	}
5222 
5223 	*ptr = cp;
5224 	*value = val;
5225 
5226 	return found;
5227 }
5228 
5229 /*
5230  * Parse a format specifier (generally following the SUS printf spec).
5231  *
5232  * We have already advanced over the initial '%', and we are looking for
5233  * [argpos][flags][width]type (but the type character is not consumed here).
5234  *
5235  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5236  * Output parameters:
5237  *	argpos: argument position for value to be printed.  -1 means unspecified.
5238  *	widthpos: argument position for width.  Zero means the argument position
5239  *			was unspecified (ie, take the next arg) and -1 means no width
5240  *			argument (width was omitted or specified as a constant).
5241  *	flags: bitmask of flags.
5242  *	width: directly-specified width value.  Zero means the width was omitted
5243  *			(note it's not necessary to distinguish this case from an explicit
5244  *			zero width value).
5245  *
5246  * The function result is the next character position to be parsed, ie, the
5247  * location where the type character is/should be.
5248  *
5249  * Note parsing invariant: at least one character is known available before
5250  * string end (end_ptr) at entry, and this is still true at exit.
5251  */
5252 static const char *
text_format_parse_format(const char * start_ptr,const char * end_ptr,int * argpos,int * widthpos,int * flags,int * width)5253 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5254 						 int *argpos, int *widthpos,
5255 						 int *flags, int *width)
5256 {
5257 	const char *cp = start_ptr;
5258 	int			n;
5259 
5260 	/* set defaults for output parameters */
5261 	*argpos = -1;
5262 	*widthpos = -1;
5263 	*flags = 0;
5264 	*width = 0;
5265 
5266 	/* try to identify first number */
5267 	if (text_format_parse_digits(&cp, end_ptr, &n))
5268 	{
5269 		if (*cp != '$')
5270 		{
5271 			/* Must be just a width and a type, so we're done */
5272 			*width = n;
5273 			return cp;
5274 		}
5275 		/* The number was argument position */
5276 		*argpos = n;
5277 		/* Explicit 0 for argument index is immediately refused */
5278 		if (n == 0)
5279 			ereport(ERROR,
5280 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5281 					 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5282 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5283 	}
5284 
5285 	/* Handle flags (only minus is supported now) */
5286 	while (*cp == '-')
5287 	{
5288 		*flags |= TEXT_FORMAT_FLAG_MINUS;
5289 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5290 	}
5291 
5292 	if (*cp == '*')
5293 	{
5294 		/* Handle indirect width */
5295 		ADVANCE_PARSE_POINTER(cp, end_ptr);
5296 		if (text_format_parse_digits(&cp, end_ptr, &n))
5297 		{
5298 			/* number in this position must be closed by $ */
5299 			if (*cp != '$')
5300 				ereport(ERROR,
5301 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5302 				  errmsg("width argument position must be ended by \"$\"")));
5303 			/* The number was width argument position */
5304 			*widthpos = n;
5305 			/* Explicit 0 for argument index is immediately refused */
5306 			if (n == 0)
5307 				ereport(ERROR,
5308 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5309 						 errmsg("format specifies argument 0, but arguments are numbered from 1")));
5310 			ADVANCE_PARSE_POINTER(cp, end_ptr);
5311 		}
5312 		else
5313 			*widthpos = 0;		/* width's argument position is unspecified */
5314 	}
5315 	else
5316 	{
5317 		/* Check for direct width specification */
5318 		if (text_format_parse_digits(&cp, end_ptr, &n))
5319 			*width = n;
5320 	}
5321 
5322 	/* cp should now be pointing at type character */
5323 	return cp;
5324 }
5325 
5326 /*
5327  * Format a %s, %I, or %L conversion
5328  */
5329 static void
text_format_string_conversion(StringInfo buf,char conversion,FmgrInfo * typOutputInfo,Datum value,bool isNull,int flags,int width)5330 text_format_string_conversion(StringInfo buf, char conversion,
5331 							  FmgrInfo *typOutputInfo,
5332 							  Datum value, bool isNull,
5333 							  int flags, int width)
5334 {
5335 	char	   *str;
5336 
5337 	/* Handle NULL arguments before trying to stringify the value. */
5338 	if (isNull)
5339 	{
5340 		if (conversion == 's')
5341 			text_format_append_string(buf, "", flags, width);
5342 		else if (conversion == 'L')
5343 			text_format_append_string(buf, "NULL", flags, width);
5344 		else if (conversion == 'I')
5345 			ereport(ERROR,
5346 					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5347 			errmsg("null values cannot be formatted as an SQL identifier")));
5348 		return;
5349 	}
5350 
5351 	/* Stringify. */
5352 	str = OutputFunctionCall(typOutputInfo, value);
5353 
5354 	/* Escape. */
5355 	if (conversion == 'I')
5356 	{
5357 		/* quote_identifier may or may not allocate a new string. */
5358 		text_format_append_string(buf, quote_identifier(str), flags, width);
5359 	}
5360 	else if (conversion == 'L')
5361 	{
5362 		char	   *qstr = quote_literal_cstr(str);
5363 
5364 		text_format_append_string(buf, qstr, flags, width);
5365 		/* quote_literal_cstr() always allocates a new string */
5366 		pfree(qstr);
5367 	}
5368 	else
5369 		text_format_append_string(buf, str, flags, width);
5370 
5371 	/* Cleanup. */
5372 	pfree(str);
5373 }
5374 
5375 /*
5376  * Append str to buf, padding as directed by flags/width
5377  */
5378 static void
text_format_append_string(StringInfo buf,const char * str,int flags,int width)5379 text_format_append_string(StringInfo buf, const char *str,
5380 						  int flags, int width)
5381 {
5382 	bool		align_to_left = false;
5383 	int			len;
5384 
5385 	/* fast path for typical easy case */
5386 	if (width == 0)
5387 	{
5388 		appendStringInfoString(buf, str);
5389 		return;
5390 	}
5391 
5392 	if (width < 0)
5393 	{
5394 		/* Negative width: implicit '-' flag, then take absolute value */
5395 		align_to_left = true;
5396 		/* -INT_MIN is undefined */
5397 		if (width <= INT_MIN)
5398 			ereport(ERROR,
5399 					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5400 					 errmsg("number is out of range")));
5401 		width = -width;
5402 	}
5403 	else if (flags & TEXT_FORMAT_FLAG_MINUS)
5404 		align_to_left = true;
5405 
5406 	len = pg_mbstrlen(str);
5407 	if (align_to_left)
5408 	{
5409 		/* left justify */
5410 		appendStringInfoString(buf, str);
5411 		if (len < width)
5412 			appendStringInfoSpaces(buf, width - len);
5413 	}
5414 	else
5415 	{
5416 		/* right justify */
5417 		if (len < width)
5418 			appendStringInfoSpaces(buf, width - len);
5419 		appendStringInfoString(buf, str);
5420 	}
5421 }
5422 
5423 /*
5424  * text_format_nv - nonvariadic wrapper for text_format function.
5425  *
5426  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5427  * which checks that all built-in functions that share the implementing C
5428  * function take the same number of arguments.
5429  */
5430 Datum
text_format_nv(PG_FUNCTION_ARGS)5431 text_format_nv(PG_FUNCTION_ARGS)
5432 {
5433 	return text_format(fcinfo);
5434 }
5435 
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are identical.  Bytes are compared starting from
 * the last one and working backwards; a len of zero or less compares
 * nothing and yields true.
 *
 * Faster than memcmp(), for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			i;

	for (i = len; i > 0; i--)
	{
		if (s1[i - 1] != s2[i - 1])
			return false;
	}

	return true;
}
5451 
5452 /* Expand each Levenshtein distance variant */
5453 #include "levenshtein.c"
5454 #define LEVENSHTEIN_LESS_EQUAL
5455 #include "levenshtein.c"
5456