1 /*-------------------------------------------------------------------------
2  *
3  * regprefix.c
4  *	  Extract a common prefix, if any, from a compiled regex.
5  *
6  *
7  * Portions Copyright (c) 2012-2018, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1998, 1999 Henry Spencer
9  *
10  * IDENTIFICATION
11  *	  src/backend/regex/regprefix.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "regex/regguts.h"
17 
18 
19 /*
20  * forward declarations
21  */
22 static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 		   chr *string, size_t *slength);
24 
25 
26 /*
27  * pg_regprefix - get common prefix for regular expression
28  *
29  * Returns one of:
30  *	REG_NOMATCH: there is no common prefix of strings matching the regex
31  *	REG_PREFIX: there is a common prefix of strings matching the regex
32  *	REG_EXACT: all strings satisfying the regex must match the same string
33  *	or a REG_XXX error code
34  *
35  * In the non-failure cases, *string is set to a malloc'd string containing
36  * the common prefix or exact value, of length *slength (measured in chrs
37  * not bytes!).
38  *
39  * This function does not analyze all complex cases (such as lookaround
40  * constraints) exactly.  Therefore it is possible that some strings matching
41  * the reported prefix or exact-match string do not satisfy the regex.  But
42  * it should never be the case that a string satisfying the regex does not
43  * match the reported prefix or exact-match string.
44  */
45 int
pg_regprefix(regex_t * re,chr ** string,size_t * slength)46 pg_regprefix(regex_t *re,
47 			 chr **string,
48 			 size_t *slength)
49 {
50 	struct guts *g;
51 	struct cnfa *cnfa;
52 	int			st;
53 
54 	/* sanity checks */
55 	if (string == NULL || slength == NULL)
56 		return REG_INVARG;
57 	*string = NULL;				/* initialize for failure cases */
58 	*slength = 0;
59 	if (re == NULL || re->re_magic != REMAGIC)
60 		return REG_INVARG;
61 	if (re->re_csize != sizeof(chr))
62 		return REG_MIXED;
63 
64 	/* Initialize locale-dependent support */
65 	pg_set_regex_collation(re->re_collation);
66 
67 	/* setup */
68 	g = (struct guts *) re->re_guts;
69 	if (g->info & REG_UIMPOSSIBLE)
70 		return REG_NOMATCH;
71 
72 	/*
73 	 * This implementation considers only the search NFA for the topmost regex
74 	 * tree node.  Therefore, constraints such as backrefs are not fully
75 	 * applied, which is allowed per the function's API spec.
76 	 */
77 	assert(g->tree != NULL);
78 	cnfa = &g->tree->cnfa;
79 
80 	/*
81 	 * Since a correct NFA should never contain any exit-free loops, it should
82 	 * not be possible for our traversal to return to a previously visited NFA
83 	 * state.  Hence we need at most nstates chrs in the output string.
84 	 */
85 	*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
86 	if (*string == NULL)
87 		return REG_ESPACE;
88 
89 	/* do it */
90 	st = findprefix(cnfa, &g->cmap, *string, slength);
91 
92 	assert(*slength <= cnfa->nstates);
93 
94 	/* clean up */
95 	if (st != REG_PREFIX && st != REG_EXACT)
96 	{
97 		FREE(*string);
98 		*string = NULL;
99 		*slength = 0;
100 	}
101 
102 	return st;
103 }
104 
105 /*
106  * findprefix - extract common prefix from cNFA
107  *
108  * Results are returned into the preallocated chr array string[], with
109  * *slength (which must be preset to zero) incremented for each chr.
110  */
111 static int						/* regprefix return code */
findprefix(struct cnfa * cnfa,struct colormap * cm,chr * string,size_t * slength)112 findprefix(struct cnfa *cnfa,
113 		   struct colormap *cm,
114 		   chr *string,
115 		   size_t *slength)
116 {
117 	int			st;
118 	int			nextst;
119 	color		thiscolor;
120 	chr			c;
121 	struct carc *ca;
122 
123 	/*
124 	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
125 	 * anchored left.  If we have both BOS and BOL, they must go to the same
126 	 * next state.
127 	 */
128 	st = cnfa->pre;
129 	nextst = -1;
130 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
131 	{
132 		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
133 		{
134 			if (nextst == -1)
135 				nextst = ca->to;
136 			else if (nextst != ca->to)
137 				return REG_NOMATCH;
138 		}
139 		else
140 			return REG_NOMATCH;
141 	}
142 	if (nextst == -1)
143 		return REG_NOMATCH;
144 
145 	/*
146 	 * Scan through successive states, stopping as soon as we find one with
147 	 * more than one acceptable transition character (either multiple colors
148 	 * on out-arcs, or a color with more than one member chr).
149 	 *
150 	 * We could find a state with multiple out-arcs that are all labeled with
151 	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
152 	 * In that case we add the chr "c" to the output string but then exit the
153 	 * loop with nextst == -1.  This leaves a little bit on the table: if the
154 	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
155 	 * to the prefix.  But chasing multiple parallel state chains doesn't seem
156 	 * worth the trouble.
157 	 */
158 	do
159 	{
160 		st = nextst;
161 		nextst = -1;
162 		thiscolor = COLORLESS;
163 		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
164 		{
165 			/* We can ignore BOS/BOL arcs */
166 			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
167 				continue;
168 			/* ... but EOS/EOL arcs terminate the search, as do LACONs */
169 			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
170 				ca->co >= cnfa->ncolors)
171 			{
172 				thiscolor = COLORLESS;
173 				break;
174 			}
175 			if (thiscolor == COLORLESS)
176 			{
177 				/* First plain outarc */
178 				thiscolor = ca->co;
179 				nextst = ca->to;
180 			}
181 			else if (thiscolor == ca->co)
182 			{
183 				/* Another plain outarc for same color */
184 				nextst = -1;
185 			}
186 			else
187 			{
188 				/* More than one plain outarc color terminates the search */
189 				thiscolor = COLORLESS;
190 				break;
191 			}
192 		}
193 		/* Done if we didn't find exactly one color on plain outarcs */
194 		if (thiscolor == COLORLESS)
195 			break;
196 		/* The color must be a singleton */
197 		if (cm->cd[thiscolor].nschrs != 1)
198 			break;
199 		/* Must not have any high-color-map entries */
200 		if (cm->cd[thiscolor].nuchrs != 0)
201 			break;
202 
203 		/*
204 		 * Identify the color's sole member chr and add it to the prefix
205 		 * string.  In general the colormap data structure doesn't provide a
206 		 * way to find color member chrs, except by trying GETCOLOR() on each
207 		 * possible chr value, which won't do at all.  However, for the cases
208 		 * we care about it should be sufficient to test the "firstchr" value,
209 		 * that is the first chr ever added to the color.  There are cases
210 		 * where this might no longer be a member of the color (so we do need
211 		 * to test), but none of them are likely to arise for a character that
212 		 * is a member of a common prefix.  If we do hit such a corner case,
213 		 * we just fall out without adding anything to the prefix string.
214 		 */
215 		c = cm->cd[thiscolor].firstchr;
216 		if (GETCOLOR(cm, c) != thiscolor)
217 			break;
218 
219 		string[(*slength)++] = c;
220 
221 		/* Advance to next state, but only if we have a unique next state */
222 	} while (nextst != -1);
223 
224 	/*
225 	 * If we ended at a state that only has EOS/EOL outarcs leading to the
226 	 * "post" state, then we have an exact-match string.  Note this is true
227 	 * even if the string is of zero length.
228 	 */
229 	nextst = -1;
230 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
231 	{
232 		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
233 		{
234 			if (nextst == -1)
235 				nextst = ca->to;
236 			else if (nextst != ca->to)
237 			{
238 				nextst = -1;
239 				break;
240 			}
241 		}
242 		else
243 		{
244 			nextst = -1;
245 			break;
246 		}
247 	}
248 	if (nextst == cnfa->post)
249 		return REG_EXACT;
250 
251 	/*
252 	 * Otherwise, if we were unable to identify any prefix characters, say
253 	 * NOMATCH --- the pattern is anchored left, but doesn't specify any
254 	 * particular first character.
255 	 */
256 	if (*slength > 0)
257 		return REG_PREFIX;
258 
259 	return REG_NOMATCH;
260 }
261