1 #include <stdio.h>
2 #include <string.h>
3 #include <unistd.h>
4 #include <ctype.h>
5 #include <sys/time.h>
6 #include "lib/mlrutil.h"
7 #include "lib/mlrregex.h"
8 #include "lib/mlr_globals.h"
9 #include "lib/free_flags.h"
10 
11 // ----------------------------------------------------------------
12 // Succeeds or aborts the process. cflag REG_EXTENDED is already included.
13 //
14 // Reason for the double-backslashing routine: Miller DSL literals are unbackslashed, e.g. the
15 // two-character sequence "\t" is converted to a tab character, and users need to type "\\t" to get
16 // a backslash followed by a t. Well and good, but the system regex library handles backslashes not
17 // quite as I want. Namely, without double-backslashing,
18 //
19 //   echo 'x=a\tb' | mlr put '$x=sub($x,"\\t","TAB")'
20 //
21 // (note: not echo -e, but just plain echo) outputs
22 //
23 //   a\TABb
24 //
25 // while
26 //
27 //   echo 'x=a\tb' | mlr put '$x=sub($x,"\\\\t","TAB")'
28 //
29 // outputs
30 //
31 //   aTABb
32 //
33 // Using double-backslashing, backslashes can be escaped as the regex library requires, before I call regcomp:
34 //
35 //   echo 'x=a\tb' | mlr put '$x=sub($x,"\\t","TAB")'
36 //
37 // outputs
38 //
39 //   aTABb
40 //
41 // as desired.
42 
regcomp_or_die(regex_t * pregex,char * regex_string,int cflags)43 regex_t* regcomp_or_die(regex_t* pregex, char* regex_string, int cflags) {
44 	cflags |= REG_EXTENDED;
45 	char* doubly_backslashed = mlr_alloc_double_backslash(regex_string);
46 	int rc = regcomp(pregex, doubly_backslashed, cflags);
47 	free(doubly_backslashed);
48 	if (rc != 0) {
49 		size_t nbytes = regerror(rc, pregex, NULL, 0);
50 		char* errbuf = malloc(nbytes);
51 		(void)regerror(rc, pregex, errbuf, nbytes);
52 		fprintf(stderr, "%s: could not compile regex \"%s\" : %s\n",
53 			MLR_GLOBALS.bargv0, regex_string, errbuf);
54 		exit(1);
55 	}
56 	return pregex;
57 }
58 
59 // Always uses cflags with REG_EXTENDED.
60 // If the regex_string is of the form a.*b, compiles it using cflags without REG_ICASE.
61 // If the regex_string is of the form "a.*b", compiles a.*b using cflags without REG_ICASE.
62 // If the regex_string is of the form "a.*b"i, compiles a.*b using cflags with REG_ICASE.
regcomp_or_die_quoted(regex_t * pregex,char * orig_regex_string,int cflags)63 regex_t* regcomp_or_die_quoted(regex_t* pregex, char* orig_regex_string, int cflags) {
64 	cflags |= REG_EXTENDED;
65 	if (string_starts_with(orig_regex_string, "\"")) {
66 		char* regex_string = mlr_strdup_or_die(orig_regex_string);
67 		int len = 0;
68 		if (string_ends_with(regex_string, "\"", &len)) {
69 			regex_string[len-1] = 0;
70 		} else if (string_ends_with(regex_string, "\"i", &len)) {
71 			regex_string[len-2] = 0;
72 			cflags |= REG_ICASE;
73 		} else {
74 			fprintf(stderr, "%s: imbalanced double-quote in regex [%s].\n",
75 				MLR_GLOBALS.bargv0, regex_string);
76 			exit(1);
77 		}
78 		regcomp_or_die(pregex, regex_string+1, cflags);
79 		free(regex_string);
80 	} else {
81 		regcomp_or_die(pregex, orig_regex_string, cflags);
82 	}
83 	return pregex;
84 }
85 
86 // Returns TRUE for match, FALSE for no match, and aborts the process if
87 // regexec returns anything else.
regmatch_or_die(const regex_t * pregex,const char * restrict match_string,size_t nmatchmax,regmatch_t pmatch[restrict])88 int regmatch_or_die(const regex_t* pregex, const char* restrict match_string,
89 	size_t nmatchmax, regmatch_t pmatch[restrict])
90 {
91 	int rc = regexec(pregex, match_string, nmatchmax, pmatch, 0);
92 	if (rc == 0) {
93 		return TRUE;
94 	} else if (rc == REG_NOMATCH) {
95 		return FALSE;
96 	} else {
97 		size_t nbytes = regerror(rc, pregex, NULL, 0);
98 		char* errbuf = malloc(nbytes);
99 		(void)regerror(rc, pregex, errbuf, nbytes);
100 		printf("regexec failure: %s\n", errbuf);
101 		exit(1);
102 	}
103 }
104 
105 // Capture-group example:
106 // sed: $ echo '<<abcdefg>>'|sed 's/ab\(.\)d\(..\)g/AYEBEE\1DEE\2GEE/' gives <<AYEBEEcDEEefGEE>>
107 // mlr: echo 'x=<<abcdefg>>' | mlr put '$x = sub($x, "ab(.)d(..)g", "AYEBEE\1DEE\2GEE")' x=<<AYEBEEcDEEefGEE>>
108 
// Replaces the first match of pregex in input with replacement, expanding
// "\0" through "\9" in replacement to the corresponding capture (\0 is the
// entire match).
//
// input:         NUL-terminated text to search; not modified.
// pregex:        compiled regex.
// psb:           string builder used to assemble the output.
// replacement:   replacement template; a backslash followed by a digit is a
//                capture reference, every other character is copied literally.
// pmatched:      set to TRUE/FALSE according to whether the regex matched.
// pall_captured: optional (may be NULL). Set to TRUE up front, then to FALSE
//                if the template references a capture slot that is unset.
//
// Returns a copy of input (mlr_strdup_or_die) when there is no match,
// otherwise the string produced by sb_finish(psb).
char* regex_sub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement,
	int* pmatched, int *pall_captured)
{
	// Capture-groups \1 through \9 supported, along with entire-string match \0.
	// A true constant keeps this a fixed-size array rather than a VLA.
	enum { NMATCHMAX = 10 };
	regmatch_t matches[NMATCHMAX];
	if (pall_captured)
		*pall_captured = TRUE;

	*pmatched = regmatch_or_die(pregex, input, NMATCHMAX, matches);
	if (!*pmatched)
		return mlr_strdup_or_die(input);

	// Text before the match. The call sites here pass the last wanted index
	// as the upper bound (presumably inclusive -- confirm in sb_append_chars).
	sb_append_chars(psb, input, 0, matches[0].rm_so-1);

	char* p = replacement;
	while (*p) {
		// <ctype.h> functions need an unsigned-char value (or EOF); passing a
		// negative plain char is undefined behaviour.
		if (p[0] == '\\' && isdigit((unsigned char)p[1])) {
			int idx = p[1] - '0'; // 0..9, always within matches[]
			regmatch_t* pmatch = &matches[idx];
			if (pmatch->rm_so == -1) {
				// Unset capture: contributes the empty string, so append nothing.
				if (pall_captured)
					*pall_captured = FALSE;
			} else {
				sb_append_chars(psb, input, pmatch->rm_so, pmatch->rm_eo-1);
			}
			p += 2;
		} else {
			sb_append_char(psb, *p);
			p++;
		}
	}

	// Text after the match. NOTE(review): the upper bound is strlen(input),
	// i.e. the terminator's index; kept exactly as it was.
	sb_append_chars(psb, input, matches[0].rm_eo, strlen(input));

	return sb_finish(psb);
}
147 
// Replaces every match of pregex in input with replacement, expanding "\0"
// through "\9" in replacement to the corresponding capture of the match being
// replaced (\0 is the entire match).
//
// input:         NUL-terminated text to search; never modified or freed here.
// pregex:        compiled regex.
// psb:           string builder used to assemble each intermediate string.
// replacement:   replacement template; a backslash followed by a digit is a
//                capture reference, every other character is copied literally.
// pmatched:      set to TRUE if at least one match was replaced, else FALSE.
// pall_captured: optional (may be NULL). Set to TRUE up front, then to FALSE
//                if the template references a capture slot that is unset.
// pfree_flags:   set to FREE_ENTRY_VALUE on every return path visible here
//                (the result is always a fresh allocation, never input itself).
//
// Matching resumes just past each inserted replacement, so replacement text
// is never re-scanned. A zero-length match (e.g. from "x*" or "$") is replaced
// once, after which scanning steps over one input character -- or stops, at
// end of string -- so that the loop always makes progress.
char* regex_gsub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement,
	int *pmatched, int* pall_captured, char* pfree_flags)
{
	// Slots \0 through \9. A true constant keeps this a fixed-size array, not a VLA.
	enum { NMATCHMAX = 10 };
	regmatch_t matches[NMATCHMAX];
	*pmatched = FALSE;
	if (pall_captured)
		*pall_captured = TRUE;
	*pfree_flags = NO_FREE;

	int   match_start = 0;         // offset in current_input where the next search begins
	char* current_input = input;   // input itself, or the latest rebuilt string

	while (TRUE) {
		// Offsets in matches[] are relative to &current_input[match_start].
		int matched = regmatch_or_die(pregex, &current_input[match_start], NMATCHMAX, matches);
		if (!matched) {
			if (input == current_input) {
				// Nothing was ever replaced: hand back a copy of the input.
				*pfree_flags = FREE_ENTRY_VALUE;
				return mlr_strdup_or_die(current_input);
			} else {
				return current_input;
			}
		}
		*pmatched = TRUE;
		int match_was_empty = (matches[0].rm_so == matches[0].rm_eo);

		// Everything before this match, including earlier replacements.
		sb_append_chars(psb, current_input, 0, match_start + matches[0].rm_so-1);

		char* p = replacement;
		int len1 = psb->used_length;
		while (*p) {
			// <ctype.h> functions need an unsigned-char value (or EOF); passing
			// a negative plain char is undefined behaviour.
			if (p[0] == '\\' && isdigit((unsigned char)p[1])) {
				int idx = p[1] - '0'; // 0..9, always within matches[]
				regmatch_t* pmatch = &matches[idx];
				if (pmatch->rm_so == -1) {
					// Unset capture: contributes the empty string, so append nothing.
					if (pall_captured)
						*pall_captured = FALSE;
				} else {
					sb_append_chars(psb, &current_input[match_start], pmatch->rm_so, pmatch->rm_eo-1);
				}
				p += 2;
			} else {
				sb_append_char(psb, *p);
				p++;
			}
		}

		// Length of the expanded replacement just appended.
		int replen = psb->used_length - len1;
		// Everything after this match. NOTE(review): the upper bound is
		// strlen(current_input), i.e. the terminator's index; kept as it was.
		sb_append_chars(psb, current_input, match_start + matches[0].rm_eo, strlen(current_input));

		char* next_input = sb_finish(psb);
		// Free the previous intermediate string, but never the caller's input.
		if (*pfree_flags & FREE_ENTRY_VALUE)
			free(current_input);
		current_input = next_input;
		*pfree_flags = FREE_ENTRY_VALUE;

		// Resume immediately after the replacement in the rebuilt string.
		match_start += matches[0].rm_so + replen;

		if (match_was_empty) {
			// An empty match consumed nothing, so searching again from the same
			// place would match again forever. Stop at end of string; otherwise
			// step over one character, which stays in the output unchanged.
			if (current_input[match_start] == '\0')
				return current_input;
			match_start++;
		}
	}
}
207 
208 // ----------------------------------------------------------------
// Returns a newly allocated copy of the first substring of input matched by
// pregex, or NULL when the regex does not match. Only the whole-match slot is
// requested from regexec; capture groups are not consulted.
char* regextract(char* input, regex_t* pregex) {
	regmatch_t whole_match[1];

	if (!regmatch_or_die(pregex, input, 1, whole_match))
		return NULL;

	int match_length = whole_match[0].rm_eo - whole_match[0].rm_so;
	return mlr_alloc_string_from_char_range(input + whole_match[0].rm_so, match_length);
}
221 
222 // ----------------------------------------------------------------
// Like regextract, but with a fallback: returns a newly allocated copy of the
// first substring of input matched by pregex, or a newly allocated copy of
// default_value when the regex does not match.
char* regextract_or_else(char* input, regex_t* pregex, char* default_value) {
	regmatch_t whole_match[1];

	if (regmatch_or_die(pregex, input, 1, whole_match)) {
		char* match_begin = input + whole_match[0].rm_so;
		int match_length = whole_match[0].rm_eo - whole_match[0].rm_so;
		return mlr_alloc_string_from_char_range(match_begin, match_length);
	}

	return mlr_strdup_or_die(default_value);
}
235 
236 // ----------------------------------------------------------------
237 // Slot 0 is the entire matched input string.
238 // Slots 1 and up are substring matches for parenthesized capture expressions (if any).
239 // Example regex "a(.*)e" with input string "abcde": slot 1 points to "bcd" and match_count = 2.
240 // Slot 2 has rm_so == -1.
241 // (If all allocated slots have matches then there is no slot with -1's.)
242 
243 // Input "abcde"
244 // Regex "a(.*)e"
245 // matches[0].rm_so =  0, matches[0].rm_eo =  5
246 // matches[1].rm_so =  1, matches[1].rm_eo =  4
247 // matches[2].rm_so = -1, matches[2].rm_eo = -1
248 //
249 // pregex_captures->length = 2
250 // pregex_captures->strings[0] = "abcde"
251 // pregex_captures->strings[1] = "bcd"
252 //
253 // Note that even if there is no match, a non-null zero-length regex-captures array is returned (by reference).
254 // This is important: see the comments in mapper_put for details.
255 
// Copies the regexec results in matches[] into *ppregex_captures as strings.
//
// ppregex_captures: in/out. If *ppregex_captures is NULL a new array is
//                   allocated; otherwise the existing one is resized via
//                   string_array_realloc. Either way it ends up with
//                   match_count slots (possibly zero).
// input:            the string the offsets in matches[] refer to.
// matches:          regexec output, nmatchmax entries.
// nmatchmax:        number of entries in matches[].
//
// match_count is one more than the index of the last slot with rm_so != -1.
// Unset slots before that point are stored as empty strings.
void save_regex_captures(string_array_t** ppregex_captures, char* input, regmatch_t matches[], int nmatchmax) {
	// In the fully occupied case there are no slots with -1's. With optional
	// captures, an unset slot may be followed by a set one, so every slot
	// must be checked rather than stopping at the first -1.
	int match_count = 0;
	for (int i = 0; i < nmatchmax; i++) {
		if (matches[i].rm_so != -1) {
			match_count = i + 1;
		}
	}

	if (*ppregex_captures != NULL)
		string_array_realloc(*ppregex_captures, match_count);
	else
		*ppregex_captures = string_array_alloc(match_count);
	string_array_t* pregex_captures = *ppregex_captures;

	if (match_count >= 1) {
		for (int i = 0; i < match_count; i++) {
			if (matches[i].rm_so == -1) {
				// Unset interior slot: store an empty string without forming
				// the out-of-bounds pointer &input[-1].
				pregex_captures->strings[i] = mlr_alloc_string_from_char_range(input, 0);
			} else {
				int len = matches[i].rm_eo - matches[i].rm_so;
				pregex_captures->strings[i] = mlr_alloc_string_from_char_range(&input[matches[i].rm_so], len);
			}
		}
		pregex_captures->strings_need_freeing = TRUE;
	}
}
280 
281 // ----------------------------------------------------------------
282 // Using the above example:
283 // Input "abcde"
284 // Regex "a(.*)e"
285 //
286 // pregex_captures->length = 2
287 // pregex_captures->strings[0] = "abcde"
288 // pregex_captures->strings[1] = "bcd"
289 //
290 // "\0" should be replaced with "abcde".
291 // "\1" should be replaced with "bcd".
292 // "\2" through "\9" should be replaced with "".
293 
// Expands "\0" through "\9" in input using previously saved regex captures.
//
// input:           NUL-terminated template; not modified.
// pregex_captures: saved captures. A reference whose index is at or beyond
//                  pregex_captures->length expands to the empty string. If
//                  this pointer is NULL, input is returned untouched.
// pwas_allocated:  set to TRUE when the return value is a newly built string,
//                  FALSE when the return value is input itself.
//
// Returns input itself when it contains no backslash-digit sequence (or when
// pregex_captures is NULL); otherwise the string built by the string builder.
char* interpolate_regex_captures(char* input, string_array_t* pregex_captures, int* pwas_allocated) {
	*pwas_allocated = FALSE;

	// Nothing to interpolate from; avoid dereferencing NULL below.
	if (pregex_captures == NULL)
		return input;

	string_builder_t* psb = sb_alloc(32);

	char* p = input;
	while (*p) {
		// <ctype.h> functions need an unsigned-char value (or EOF); passing a
		// negative plain char is undefined behaviour.
		if (p[0] == '\\' && isdigit((unsigned char)p[1])) {
			*pwas_allocated = TRUE;
			int idx = p[1] - '0';
			// Out-of-range references expand to nothing.
			if (idx < pregex_captures->length)
				sb_append_string(psb, pregex_captures->strings[idx]);
			p += 2;
		} else {
			sb_append_char(psb, *p);
			p++;
		}
	}

	if (*pwas_allocated) {
		// Order kept as it was: take the built string first, then release the
		// builder (presumably sb_free leaves the finished string alone -- confirm).
		char* output = sb_finish(psb);
		sb_free(psb);
		return output;
	} else {
		// No capture reference was seen: discard the builder and return input.
		sb_free(psb);
		return input;
	}
}
322