1 #include <stdio.h>
2 #include <string.h>
3 #include <unistd.h>
4 #include <ctype.h>
5 #include <sys/time.h>
6 #include "lib/mlrutil.h"
7 #include "lib/mlrregex.h"
8 #include "lib/mlr_globals.h"
9 #include "lib/free_flags.h"
10
11 // ----------------------------------------------------------------
12 // Succeeds or aborts the process. cflag REG_EXTENDED is already included.
13 //
14 // Reason for the double-backslashing routine: Miller DSL literals are unbackslashed, e.g. the
15 // two-character sequence "\t" is converted to a tab character, and users need to type "\\t" to get
16 // a backslash followed by a t. Well and good, but the system regex library handles backslashes not
17 // quite as I want. Namely, without double-backslashing,
18 //
19 // echo 'x=a\tb' | mlr put '$x=sub($x,"\\t","TAB")'
20 //
21 // (note: not echo -e, but just plain echo) outputs
22 //
23 // a\TABb
24 //
25 // while
26 //
27 // echo 'x=a\tb' | mlr put '$x=sub($x,"\\\\t","TAB")'
28 //
29 // outputs
30 //
31 // aTABb
32 //
33 // Using double-backslashing, backslashes can be escaped as the regex library requires, before I call regcomp:
34 //
35 // echo 'x=a\tb' | mlr put '$x=sub($x,"\\t","TAB")'
36 //
37 // outputs
38 //
39 // aTABb
40 //
41 // as desired.
42
regcomp_or_die(regex_t * pregex,char * regex_string,int cflags)43 regex_t* regcomp_or_die(regex_t* pregex, char* regex_string, int cflags) {
44 cflags |= REG_EXTENDED;
45 char* doubly_backslashed = mlr_alloc_double_backslash(regex_string);
46 int rc = regcomp(pregex, doubly_backslashed, cflags);
47 free(doubly_backslashed);
48 if (rc != 0) {
49 size_t nbytes = regerror(rc, pregex, NULL, 0);
50 char* errbuf = malloc(nbytes);
51 (void)regerror(rc, pregex, errbuf, nbytes);
52 fprintf(stderr, "%s: could not compile regex \"%s\" : %s\n",
53 MLR_GLOBALS.bargv0, regex_string, errbuf);
54 exit(1);
55 }
56 return pregex;
57 }
58
59 // Always uses cflags with REG_EXTENDED.
60 // If the regex_string is of the form a.*b, compiles it using cflags without REG_ICASE.
61 // If the regex_string is of the form "a.*b", compiles a.*b using cflags without REG_ICASE.
62 // If the regex_string is of the form "a.*b"i, compiles a.*b using cflags with REG_ICASE.
regcomp_or_die_quoted(regex_t * pregex,char * orig_regex_string,int cflags)63 regex_t* regcomp_or_die_quoted(regex_t* pregex, char* orig_regex_string, int cflags) {
64 cflags |= REG_EXTENDED;
65 if (string_starts_with(orig_regex_string, "\"")) {
66 char* regex_string = mlr_strdup_or_die(orig_regex_string);
67 int len = 0;
68 if (string_ends_with(regex_string, "\"", &len)) {
69 regex_string[len-1] = 0;
70 } else if (string_ends_with(regex_string, "\"i", &len)) {
71 regex_string[len-2] = 0;
72 cflags |= REG_ICASE;
73 } else {
74 fprintf(stderr, "%s: imbalanced double-quote in regex [%s].\n",
75 MLR_GLOBALS.bargv0, regex_string);
76 exit(1);
77 }
78 regcomp_or_die(pregex, regex_string+1, cflags);
79 free(regex_string);
80 } else {
81 regcomp_or_die(pregex, orig_regex_string, cflags);
82 }
83 return pregex;
84 }
85
86 // Returns TRUE for match, FALSE for no match, and aborts the process if
87 // regexec returns anything else.
regmatch_or_die(const regex_t * pregex,const char * restrict match_string,size_t nmatchmax,regmatch_t pmatch[restrict])88 int regmatch_or_die(const regex_t* pregex, const char* restrict match_string,
89 size_t nmatchmax, regmatch_t pmatch[restrict])
90 {
91 int rc = regexec(pregex, match_string, nmatchmax, pmatch, 0);
92 if (rc == 0) {
93 return TRUE;
94 } else if (rc == REG_NOMATCH) {
95 return FALSE;
96 } else {
97 size_t nbytes = regerror(rc, pregex, NULL, 0);
98 char* errbuf = malloc(nbytes);
99 (void)regerror(rc, pregex, errbuf, nbytes);
100 printf("regexec failure: %s\n", errbuf);
101 exit(1);
102 }
103 }
104
105 // Capture-group example:
106 // sed: $ echo '<<abcdefg>>'|sed 's/ab\(.\)d\(..\)g/AYEBEE\1DEE\2GEE/' gives <<AYEBEEcDEEefGEE>>
107 // mlr: echo 'x=<<abcdefg>>' | mlr put '$x = sub($x, "ab(.)d(..)g", "AYEBEE\1DEE\2GEE")' x=<<AYEBEEcDEEefGEE>>
108
// Replaces the first match of the regex in input with replacement, expanding "\0" through
// "\9" in the replacement to the corresponding capture groups.
//
// * input:         string to search; not modified.
// * pregex:        compiled regex.
// * psb:           string builder used to assemble the output when there is a match.
// * replacement:   replacement text, optionally containing "\0".."\9".
// * pmatched:      set to the TRUE/FALSE result of the match.
// * pall_captured: if non-null, set to TRUE up front and then to FALSE if the replacement
//                  refers to a capture group which did not participate in the match.
//
// Returns a copy of input (via mlr_strdup_or_die) if there is no match, else the result of
// sb_finish on psb.
char* regex_sub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement,
	int* pmatched, int *pall_captured)
{
	// Capture-groups \1 through \9 supported, along with entire-string match \0
	regmatch_t matches[10];
	const size_t nmatchmax = sizeof(matches) / sizeof(matches[0]);

	if (pall_captured)
		*pall_captured = TRUE;

	*pmatched = regmatch_or_die(pregex, input, nmatchmax, matches);
	if (!*pmatched)
		return mlr_strdup_or_die(input);

	// Everything before the match. The index arguments to sb_append_chars are used here as
	// an inclusive range, hence the -1.
	sb_append_chars(psb, input, 0, matches[0].rm_so-1);

	for (char* p = replacement; *p; ) {
		// The cast is needed since passing a negative char value to isdigit is undefined.
		if (p[0] == '\\' && isdigit((unsigned char)p[1])) {
			int idx = p[1] - '0';
			if (matches[idx].rm_so == -1) {
				// Non-participating group: contributes the empty string.
				if (pall_captured)
					*pall_captured = FALSE;
			} else {
				sb_append_chars(psb, input, matches[idx].rm_so, matches[idx].rm_eo-1);
			}
			p += 2;
		} else {
			sb_append_char(psb, *p);
			p++;
		}
	}

	// Everything after the match.
	sb_append_chars(psb, input, matches[0].rm_eo, strlen(input));

	return sb_finish(psb);
}
147
// Replaces every match of the regex in input with replacement, expanding "\0" through "\9"
// in the replacement to the corresponding capture groups of each match.
//
// * input:         string to search; not modified and not freed here.
// * pregex:        compiled regex.
// * psb:           string builder used to assemble each intermediate result.
// * replacement:   replacement text, optionally containing "\0".."\9".
// * pmatched:      set to TRUE if there was at least one match, else FALSE.
// * pall_captured: set to FALSE if the replacement refers to a capture group which did not
//                  participate in some match, else TRUE.
// * pfree_flags:   set to FREE_ENTRY_VALUE on every return path: the return value is either
//                  a copy of the input or a string obtained from sb_finish.
//
// Each substitution produces a new string; the scan then resumes in that new string just
// past the replacement text, so replacement output is never itself re-matched.
char* regex_gsub(char* input, regex_t* pregex, string_builder_t* psb, char* replacement,
	int *pmatched, int* pall_captured, char* pfree_flags)
{
	// Capture-groups \1 through \9 supported, along with entire-string match \0
	regmatch_t matches[10];
	const size_t nmatchmax = sizeof(matches) / sizeof(matches[0]);

	*pmatched = FALSE;
	*pall_captured = TRUE;
	*pfree_flags = NO_FREE;

	int match_start = 0;
	char* current_input = input;

	while (TRUE) {
		// Match offsets reported by regexec are relative to this position.
		char* search_from = &current_input[match_start];

		if (!regmatch_or_die(pregex, search_from, nmatchmax, matches)) {
			if (input == current_input) {
				// No substitutions at all: hand back a copy so the caller can free uniformly.
				*pfree_flags = FREE_ENTRY_VALUE;
				return mlr_strdup_or_die(current_input);
			}
			return current_input;
		}
		*pmatched = TRUE;

		int match_was_empty = (matches[0].rm_so == matches[0].rm_eo);

		// Everything before the match. The index arguments to sb_append_chars are used here
		// as an inclusive range, hence the -1.
		sb_append_chars(psb, current_input, 0, match_start + matches[0].rm_so-1);

		int len1 = psb->used_length;
		for (char* p = replacement; *p; ) {
			// The cast is needed since passing a negative char value to isdigit is undefined.
			if (p[0] == '\\' && isdigit((unsigned char)p[1])) {
				int idx = p[1] - '0';
				if (matches[idx].rm_so == -1) {
					// Non-participating group: contributes the empty string.
					*pall_captured = FALSE;
				} else {
					sb_append_chars(psb, search_from, matches[idx].rm_so, matches[idx].rm_eo-1);
				}
				p += 2;
			} else {
				sb_append_char(psb, *p);
				p++;
			}
		}
		// Length of the expanded replacement, needed to know where to resume scanning.
		int replen = psb->used_length - len1;

		// Everything after the match.
		sb_append_chars(psb, current_input, match_start + matches[0].rm_eo, strlen(current_input));

		char* next_input = sb_finish(psb);
		// Intermediate results are owned here; the caller's original input is not.
		if (*pfree_flags & FREE_ENTRY_VALUE)
			free(current_input);
		current_input = next_input;
		*pfree_flags = FREE_ENTRY_VALUE;

		match_start += matches[0].rm_so + replen;

		// A zero-length match consumes no input, so resuming at the same spot would match
		// there again indefinitely. Step over one character, or stop if at end of string.
		if (match_was_empty) {
			if (current_input[match_start] == '\0')
				return current_input;
			match_start++;
		}
	}
}
207
208 // ----------------------------------------------------------------
regextract(char * input,regex_t * pregex)209 char* regextract(char* input, regex_t* pregex) {
210 const size_t nmatchmax = 1;
211 regmatch_t matches[nmatchmax];
212
213 int matched = regmatch_or_die(pregex, input, nmatchmax, matches);
214 if (!matched) {
215 return NULL;
216 }
217 regmatch_t* pmatch = &matches[0];
218 int len = pmatch->rm_eo - pmatch->rm_so;
219 return mlr_alloc_string_from_char_range(&input[pmatch->rm_so], len);
220 }
221
222 // ----------------------------------------------------------------
// Returns a newly allocated copy of the first substring of input which matches the regex,
// or a copy (via mlr_strdup_or_die) of default_value if there is no match.
char* regextract_or_else(char* input, regex_t* pregex, char* default_value) {
	// Only the entire-match slot is needed; capture groups are not consulted.
	regmatch_t whole_match[1];

	if (!regmatch_or_die(pregex, input, 1, whole_match))
		return mlr_strdup_or_die(default_value);

	regoff_t start = whole_match[0].rm_so;
	int len = whole_match[0].rm_eo - start;
	return mlr_alloc_string_from_char_range(&input[start], len);
}
235
236 // ----------------------------------------------------------------
237 // Slot 0 is the entire matched input string.
238 // Slots 1 and up are substring matches for parenthesized capture expressions (if any).
239 // Example regex "a(.*)e" with input string "abcde": slot 1 points to "bcd" and match_count = 2.
240 // Slot 2 has rm_so == -1.
241 // (If all allocated slots have matches then there is no slot with -1's.)
242
243 // Input "abcde"
244 // Regex "a(.*)e"
245 // matches[0].rm_so = 0, matches[0].rm_eo = 5
246 // matches[1].rm_so = 1, matches[1].rm_eo = 4
247 // matches[2].rm_so = -1, matches[2].rm_eo = -1
248 //
249 // pregex_captures->length = 2
250 // pregex_captures->strings[0] = "abcde"
251 // pregex_captures->strings[1] = "bcd"
252 //
253 // Note that even if there is no match, a non-null zero-length regex-captures array is returned (by reference).
254 // This is important: see the comments in mapper_put for details.
255
// Copies the matched substrings described by matches[] out of input and into
// *ppregex_captures.
//
// * ppregex_captures: if it points to NULL, a new array is allocated via string_array_alloc;
//                     otherwise the existing array is resized via string_array_realloc.
// * input:            the string the match offsets refer to.
// * matches:          match slots as filled in by regexec; slot 0 is the entire match.
// * nmatchmax:        number of slots in matches[].
//
// The resulting array length is one more than the index of the last slot with rm_so != -1,
// or zero if there is no such slot.
void save_regex_captures(string_array_t** ppregex_captures, char* input, regmatch_t matches[], int nmatchmax) {
	int match_count = 0;
	// In fully occupied case, there will be no slots with -1's.
	// Using optional regex captures, one slot may have rm_so == rm_eo == -1 (i.e. trivial) while a subsequent slot
	// may be non-trivial. So we need to check all slots.
	for (int i = 0; i < nmatchmax; i++) {
		if (matches[i].rm_so != -1)
			match_count = i + 1;
	}

	if (*ppregex_captures != NULL)
		string_array_realloc(*ppregex_captures, match_count);
	else
		*ppregex_captures = string_array_alloc(match_count);
	string_array_t* pregex_captures = *ppregex_captures;

	if (match_count >= 1) {
		for (int i = 0; i < match_count; i++) {
			if (matches[i].rm_so == -1) {
				// Non-participating group ahead of a participating one: store a zero-length
				// range, taken from the start of input so that no pointer to before the
				// start of input (input - 1) is ever formed.
				pregex_captures->strings[i] = mlr_alloc_string_from_char_range(input, 0);
			} else {
				int len = matches[i].rm_eo - matches[i].rm_so;
				pregex_captures->strings[i] = mlr_alloc_string_from_char_range(&input[matches[i].rm_so], len);
			}
		}
		pregex_captures->strings_need_freeing = TRUE;
	}
}
280
281 // ----------------------------------------------------------------
282 // Using the above example:
283 // Input "abcde"
284 // Regex "a(.*)e"
285 //
286 // pregex_captures->length = 2
287 // pregex_captures->strings[0] = "abcde"
288 // pregex_captures->strings[1] = "bcd"
289 //
290 // "\0" should be replaced with "abcde".
291 // "\1" should be replaced with "bcd".
292 // "\2" through "\9" should be replaced with "".
293
// Expands "\0" through "\9" in input using previously saved regex captures.
//
// * input:           text possibly containing "\0".."\9" sequences.
// * pregex_captures: saved captures; must be non-null. Indices at or beyond its length
//                    expand to the empty string.
// * pwas_allocated:  set to TRUE if at least one "\d" sequence was found, in which case the
//                    return value is a new string from sb_finish; else set to FALSE, in
//                    which case the return value is input itself.
char* interpolate_regex_captures(char* input, string_array_t* pregex_captures, int* pwas_allocated) {
	*pwas_allocated = FALSE;

	string_builder_t* psb = sb_alloc(32);

	for (char* p = input; *p; ) {
		// The cast is needed since passing a negative char value to isdigit is undefined.
		if (p[0] == '\\' && isdigit((unsigned char)p[1])) {
			*pwas_allocated = TRUE;
			int idx = p[1] - '0';
			if (idx < pregex_captures->length)
				sb_append_string(psb, pregex_captures->strings[idx]);
			p += 2;
		} else {
			sb_append_char(psb, *p);
			p++;
		}
	}

	// Without any "\d" sequences the built-up copy would be identical to the input, so it is
	// discarded and the input is returned as-is.
	char* output = input;
	if (*pwas_allocated)
		output = sb_finish(psb);
	sb_free(psb);
	return output;
}
322