1 /*
2  * psql - the PostgreSQL interactive terminal
3  *
4  * Copyright (c) 2000-2020, PostgreSQL Global Development Group
5  *
6  * src/bin/psql/stringutils.c
7  */
8 #include "postgres_fe.h"
9 
10 #include <ctype.h>
11 
12 #include "common.h"
13 #include "stringutils.h"
14 
/*
 * Byte length of the character starting at 's' as reported by PQmblen() for
 * encoding 'e', capped by strnlen() so that the result never reaches past
 * the string's terminating null.  A caller that advances by this amount
 * therefore cannot step over the terminator even when PQmblen() reports
 * more bytes than actually remain.
 *
 * Note: 's' is evaluated twice, so it must not have side effects.
 */
#define PQmblenBounded(s, e)  strnlen((s), PQmblen((s), (e)))
16 
17 
/*
 * terminate_token
 *
 * Helper for strtokx(): end the token that stops just before 'p' and return
 * the position from which the next call should resume scanning.
 *
 * If 'p' is already at the end of the string there is nothing to do.
 * Otherwise a null has to be stored at 'p'.  When *p belongs to the
 * whitespace set it can simply be overwritten; if not, the remainder of the
 * string (terminator included) is first shifted up by one byte so that no
 * data is lost.  The buffer 'p' points into must have room for that byte.
 */
static char *
terminate_token(char *p, const char *whitespace)
{
	if (*p == '\0')
		return p;				/* at end of string, so no extra work */

	if (!strchr(whitespace, *p))
		memmove(p + 1, p, strlen(p) + 1);
	*p = '\0';

	return p + 1;
}


/*
 * Replacement for strtok() (a.k.a. poor man's flex)
 *
 * Splits a string into tokens, returning one token per call, then NULL
 * when no more tokens exist in the given string.
 *
 * The calling convention is similar to that of strtok, but with more
 * frammishes.
 *
 * s -			string to parse, if NULL continue parsing the last string
 * whitespace - set of whitespace characters that separate tokens
 * delim -		set of non-whitespace separator characters (or NULL)
 * quote -		set of characters that can quote a token (NULL if none)
 * escape -		character that can quote quotes (0 if none)
 * e_strings -	if true, treat E'...' syntax as a valid token
 * del_quotes - if true, strip quotes from the returned token, else return
 *				it exactly as found in the string
 * encoding -	the active character-set encoding
 *
 * Characters in 'delim', if any, will be returned as single-character
 * tokens unless part of a quoted token.
 *
 * Double occurrences of the quoting character are always taken to represent
 * a single quote character in the data.  If escape isn't 0, then escape
 * followed by anything (except \0) is a data character too.
 *
 * The combination of e_strings and del_quotes both true is not currently
 * handled.  This could be fixed but it's not needed anywhere at the moment.
 *
 * Note that the string s is _not_ overwritten in this implementation: the
 * function works on a private copy kept in static storage.  Consequently
 * the function is not reentrant, and a returned token stays valid only
 * until the next call that passes a non-NULL s or that returns NULL (both
 * release the private copy).
 *
 * NB: it's okay to vary delim, quote, and escape from one call to the
 * next on a single source string, but changing whitespace is a bad idea
 * since you might lose data.
 */
char *
strtokx(const char *s,
		const char *whitespace,
		const char *delim,
		const char *quote,
		char escape,
		bool e_strings,
		bool del_quotes,
		int encoding)
{
	static char *storage = NULL;	/* store the local copy of the users
									 * string here */
	static char *string = NULL; /* pointer into storage where to continue on
								 * next call */

	/* variously abused variables: */
	size_t		offset;			/* size_t: that is what str[c]spn return */
	char	   *start;
	char	   *p;

	if (s)
	{
		char	   *fresh;

		/*
		 * We may need extra space to insert delimiter nulls for adjacent
		 * tokens.  2X the space is a gross overestimate, but it's unlikely
		 * that this code will be used on huge strings anyway.
		 *
		 * Copy the new string before releasing the old buffer, so that a
		 * caller may pass a pointer into a still-valid token from the
		 * previous string without us reading freed memory.
		 */
		fresh = pg_malloc(2 * strlen(s) + 1);
		strcpy(fresh, s);

		free(storage);
		storage = fresh;
		string = storage;
	}

	if (!storage)
		return NULL;

	/* skip leading whitespace */
	offset = strspn(string, whitespace);
	start = &string[offset];

	/* end of string reached? */
	if (*start == '\0')
	{
		/* technically we don't need to free here, but we're nice */
		free(storage);
		storage = NULL;
		string = NULL;
		return NULL;
	}

	/* test if delimiter character */
	if (delim && strchr(delim, *start))
	{
		/*
		 * A delimiter is a one-character token.  Terminate it right after
		 * that character; this may shift the rest of the string, which is
		 * why extra space was allocated above.
		 */
		string = terminate_token(start + 1, whitespace);

		return start;
	}

	/* check for E string */
	p = start;
	if (e_strings &&
		(*p == 'E' || *p == 'e') &&
		p[1] == '\'')
	{
		quote = "'";
		escape = '\\';			/* if std strings before, not any more */
		p++;
	}

	/* test if quoting character */
	if (quote && strchr(quote, *p))
	{
		/* okay, we have a quoted token, now scan for the closer */
		char		thisquote = *p++;

		for (; *p; p += PQmblenBounded(p, encoding))
		{
			if (*p == escape && p[1] != '\0')
				p++;			/* process escaped anything */
			else if (*p == thisquote && p[1] == thisquote)
				p++;			/* process doubled quote */
			else if (*p == thisquote)
			{
				p++;			/* skip trailing quote */
				break;
			}
		}

		/* p is now just past the closing quote, or at end of string */
		string = terminate_token(p, whitespace);

		/* Clean up the token if caller wants that */
		if (del_quotes)
			strip_quotes(start, thisquote, escape, encoding);

		return start;
	}

	/*
	 * Otherwise no quoting character.  Scan till next whitespace, delimiter
	 * or quote.  NB: at this point, *start is known not to be '\0',
	 * whitespace, delim, or quote, so we will consume at least one character.
	 */
	offset = strcspn(start, whitespace);

	if (delim)
	{
		size_t		offset2 = strcspn(start, delim);

		if (offset > offset2)
			offset = offset2;
	}

	if (quote)
	{
		size_t		offset2 = strcspn(start, quote);

		if (offset > offset2)
			offset = offset2;
	}

	p = start + offset;

	string = terminate_token(p, whitespace);

	return start;
}
229 
230 
/*
 * strip_quotes
 *
 * Undo the quoting of the string at *source, rewriting it in place.
 *
 * A leading 'quote' character and a trailing one are dropped.  Inside the
 * string, two consecutive 'quote' characters collapse to one.  If 'escape'
 * is not 0, an 'escape' followed by any character other than \0 is dropped
 * and the character after it is kept literally.
 *
 * Characters are copied in units of PQmblenBounded() bytes for 'encoding'.
 * The result is never longer than the input, so it always fits.
 */
void
strip_quotes(char *source, char quote, char escape, int encoding)
{
	char	   *rd;				/* next byte to examine */
	char	   *wr;				/* next byte to fill in */

	Assert(source != NULL);
	Assert(quote != '\0');

	rd = source;
	wr = source;

	/* An opening quote, if present, is not part of the data. */
	if (*rd != '\0' && *rd == quote)
		rd++;

	while (*rd != '\0')
	{
		char		cur = *rd;
		int			nbytes;

		/* A quote that is the very last character is the closing quote. */
		if (cur == quote && rd[1] == '\0')
			break;

		/*
		 * For a doubled quote, or an escape with something after it, step
		 * over the first character so that only the second one is copied.
		 */
		if ((cur == quote && rd[1] == quote) ||
			(cur == escape && rd[1] != '\0'))
			rd++;

		/* Copy one whole character, however many bytes it occupies. */
		for (nbytes = PQmblenBounded(rd, encoding); nbytes != 0; nbytes--)
			*wr++ = *rd++;
	}

	*wr = '\0';
}
274 
275 
/*
 * quote_if_needed
 *
 * Inverse of strip_quotes().  Produces a quoted and escaped rendition of
 * "source", or NULL when the string already stands for itself and quoting
 * was not forced.
 *
 * source -			string to parse
 * entails_quote -	set of characters whose presence requires outer quotes
 * quote -			doubled within string, affixed to both ends
 * escape -			doubled within string
 * force_quote -	if true, quote the output even if it doesn't "need" it
 * encoding -		the active character-set encoding
 *
 * A non-NULL result is malloc'd and must be freed by the caller.
 *
 * Do not use this as a substitute for PQescapeStringConn().  Use it for
 * strings to be parsed by strtokx() or psql_scan_slash_option().
 */
char *
quote_if_needed(const char *source, const char *entails_quote,
				char quote, char escape, bool force_quote,
				int encoding)
{
	bool		quoting_required = force_quote;
	const char *in;
	char	   *result;
	char	   *out;

	Assert(source != NULL);
	Assert(quote != '\0');

	in = source;

	/*
	 * Worst case: every input byte is doubled, plus the two enclosing
	 * quotes and the terminating null.
	 */
	result = pg_malloc(2 * strlen(in) + 3); /* excess */
	out = result;

	/* The quoted form is built unconditionally; it is discarded if unused. */
	*out++ = quote;

	while (*in)
	{
		char		lead = *in;
		int			nbytes;

		if (lead == quote || lead == escape)
		{
			/* Emit one extra copy here; the loop below emits the original. */
			*out++ = lead;
			quoting_required = true;
		}
		else if (strchr(entails_quote, lead))
			quoting_required = true;

		/* Copy one whole character, however many bytes it occupies. */
		for (nbytes = PQmblenBounded(in, encoding); nbytes != 0; nbytes--)
			*out++ = *in++;
	}

	*out++ = quote;
	*out = '\0';

	if (quoting_required)
		return result;

	/* The input needs no decoration, so signal that with NULL. */
	free(result);
	return NULL;
}
345