1 /* rgxg - ReGular eXpression Generator
2  *
3  * Copyright (c) 2010-2013,2020 Hannes von Haugwitz
4  *
5  * This software is provided 'as-is', without any express or implied
6  * warranty. In no event will the authors be held liable for any damages
7  * arising from the use of this software.
8  *
9  * Permission is granted to anyone to use this software for any purpose,
10  * including commercial applications, and to alter it and redistribute
11  * it freely, subject to the following restrictions:
12  *
13  *     1. The origin of this software must not be misrepresented; you
14  *     must not claim that you wrote the original software. If you use
15  *     this software in a product, an acknowledgment in the product
16  *     documentation would be appreciated but is not required.
17  *
18  *     2. Altered source versions must be plainly marked as such, and
19  *     must not be misrepresented as being the original software.
20  *
21  *     3. This notice may not be removed or altered from any source
22  *     distribution.
23  */
24 
25 /* needed for rgxg_number_* */
26 #include "rgxg/number.h"
27 
28 /* needed for EASY_CHAR */
29 #include "common_macros.h"
30 
31 /* needed for internal_plain_number_base10 */
32 #include "internal_utils.h"
33 
34 /* needed for LLONG_MAX */
35 #include <limits.h>
36 
37 /* needed for NULL */
38 #include <stdlib.h>
39 
40 /* pre-compile limits */
41 static long long limits[] = {
42     0L, LLONG_MAX, LLONG_MAX/2, LLONG_MAX/3, LLONG_MAX/4, LLONG_MAX/5,
43     LLONG_MAX/6, LLONG_MAX/7, LLONG_MAX/8, LLONG_MAX/9, LLONG_MAX/10,
44     LLONG_MAX/11, LLONG_MAX/12, LLONG_MAX/13, LLONG_MAX/14, LLONG_MAX/15,
45     LLONG_MAX/16, LLONG_MAX/17, LLONG_MAX/18, LLONG_MAX/19, LLONG_MAX/20,
46     LLONG_MAX/21, LLONG_MAX/22, LLONG_MAX/23, LLONG_MAX/24, LLONG_MAX/25,
47     LLONG_MAX/26, LLONG_MAX/27, LLONG_MAX/28, LLONG_MAX/29, LLONG_MAX/30,
48     LLONG_MAX/31, LLONG_MAX/32
49 };
50 
rgxg_number_of_digits_long_long(long long number,int base)51 static size_t rgxg_number_of_digits_long_long(long long number, int base) {
52     long long power = base;
53     long long limit = limits[base];
54     size_t n = 1;
55 
56     while(number >= power) {
57         n++;
58         if (power > limit) break;
59         power *= base;
60     }
61 
62     return n;
63 }
64 
rgxg_power(int base,int exp)65 static long long rgxg_power(int base, int exp) {
66     long long r = 1;
67     while (exp--) { r *=base; }
68     return r;
69 }
70 
rgxg_base32_char(long long number,int upper)71 static char rgxg_base32_char(long long number, int upper) {
72     /* 'A'-10 = '7', 'a'-10='W' */
73     return number+(number < 10 ? '0' : (upper ? '7' : 'W'));
74 }
75 
rgxg_base32_range(long long first,long long last,char * regex,rgxg_options_t options)76 static int rgxg_base32_range(long long first, long long
77         last, char* regex, rgxg_options_t options) {
78     /* size 4 is enough as long base <=36 */
79     long long stk[4];
80     int top, brackets;
81     int n = 0;
82 
83     top = -1;
84     brackets=first-last || (last > 9 && !(RGXG_NOUPPERCASE&options || RGXG_NOLOWERCASE&options));
85     if (brackets) { EASY_CHAR('[') }
86     stk[++top] = first;
87     stk[++top] = last;
88     do {
89         last = stk[top--];
90         first = stk[top--];
91         switch (last-first) {
92             case 0:
93                 if (first <= 9 || !(RGXG_NOUPPERCASE&options)) {
94                     EASY_CHAR(rgxg_base32_char(first, 1))
95                 }
96                 if (first > 9 && !(RGXG_NOLOWERCASE&options)) {
97                     EASY_CHAR(rgxg_base32_char(first, 0))
98                 }
99                 break;
100             case 1:
101                 if (first <= 9 || !(RGXG_NOUPPERCASE&options)) {
102                     EASY_CHAR(rgxg_base32_char(first, 1))
103                 }
104                 if (first > 9 && !(RGXG_NOLOWERCASE&options)) {
105                     EASY_CHAR(rgxg_base32_char(first, 0))
106                 }
107 
108                 if (last <= 9 || !(RGXG_NOUPPERCASE&options)) {
109                     EASY_CHAR(rgxg_base32_char(last, 1))
110                 }
111                 if (last > 9 && !(RGXG_NOLOWERCASE&options)) {
112                     EASY_CHAR(rgxg_base32_char(last, 0))
113                 }
114                 break;
115             default:
116                 if (first <= 9 && last >= 10) {
117                     stk[++top] = 10;
118                     stk[++top] = last;
119                     stk[++top] = first;
120                     stk[++top] = 9;
121                 } else {
122                     if (first <= 9 || !(RGXG_NOUPPERCASE&options)) {
123                         EASY_CHAR(rgxg_base32_char(first, 1))
124                         EASY_CHAR('-')
125                         EASY_CHAR(rgxg_base32_char(last, 1))
126                     }
127                     if (first > 9 && !(RGXG_NOLOWERCASE&options)) {
128                         EASY_CHAR(rgxg_base32_char(first, 0))
129                         EASY_CHAR('-')
130                         EASY_CHAR(rgxg_base32_char(last, 0))
131                     }
132                 }
133                 break;
134         }
135     } while (top > 0);
136 
137     if (brackets) { EASY_CHAR(']') }
138 
139     return n;
140 }
141 
142 #define EASY_IF_ERROR_ELSE(condition, error) \
143     if (condition) { \
144         n = error; \
145     } else
146 
rgxg_number_single(long long number,int base,char * regex,rgxg_options_t options)147 static int rgxg_number_single(long long number, int base, char* regex, rgxg_options_t
148         options) {
149     int n;
150     long long m;
151 
152     n = 0;
153     m = number;
154     do {
155         n += (m%base <= 9 || RGXG_NOUPPERCASE&options || RGXG_NOLOWERCASE&options) ? 1 : 4;
156         m /= base;
157     } while (m);
158 
159     if (regex) {
160         int pos = n;
161         do {
162             m = number%base;
163             if (m <= 9 || RGXG_NOUPPERCASE&options || RGXG_NOLOWERCASE&options) {
164                 regex[--pos] = rgxg_base32_char(m, RGXG_NOLOWERCASE&options);
165             } else {
166                 regex[--pos] = ']';
167                 regex[--pos] = rgxg_base32_char(m, 0);
168                 regex[--pos] = rgxg_base32_char(m, 1);
169                 regex[--pos] = '[';
170             }
171             number /= base;
172         } while (number);
173     }
174 
175     return n;
176 }
177 
178 #define EASY_NUMBER_ASSIGNMENT(number) \
179     n += rgxg_number_single(number, base, (regex ? regex+n : NULL), options);
180 
181 #define EASY_BASE32_RANGE_ASSIGNMENT(first, last) \
182     n += rgxg_base32_range(first, last, (regex ? regex+n : NULL), options);
183 
184 #define EASY_LEADING_ZEROS \
185 if (number_of_leading_zeros < ((RGXG_LEADINGZERO&options) ? 5 : 4)) { \
186     /* 0000000000 -> 0{10} (0?0?0?0? -> 0{0,4}) but 0000 -> 0000 (0?0?0? -> 0?0?0?) */ \
187     while (number_of_leading_zeros-- > 0) { \
188         EASY_CHAR('0') \
189         if (RGXG_VARLEADINGZERO&options) { EASY_CHAR('?') } \
190     } \
191 } else { \
192     EASY_CHAR('0') \
193     EASY_CHAR('{') \
194     if (RGXG_VARLEADINGZERO&options) { \
195         EASY_CHAR('0') \
196         EASY_CHAR(',') \
197     } \
198     n += internal_plain_number_base10(number_of_leading_zeros, (regex ? regex+n : NULL)); \
199     EASY_CHAR('}') \
200 }
201 
rgxg_number_range(long long first,long long last,int base,size_t min_length,char * regex,rgxg_options_t options)202 int rgxg_number_range (long long first, long long last, int base,
203         size_t min_length, char *regex, rgxg_options_t options) {
204     long long prefix, prefix_range_first, prefix_range_last,
205          current_last, power_max, power_max_plus_one;
206     int n, min, max, number_of_leading_zeros, parenthesis;
207     size_t max_num_of_digits;
208 
209     EASY_VALIDATE_MUTEXOPTIONS(RGXG_LEADINGZERO, RGXG_VARLEADINGZERO)
210     EASY_VALIDATE_MUTEXOPTIONS(RGXG_NOUPPERCASE, RGXG_NOLOWERCASE)
211     EASY_IF_ERROR_ELSE(base < 2 || base > 32, RGXG_ERROR_BASE)
212     EASY_IF_ERROR_ELSE(first < 0 || last < 0, RGXG_ERROR_NEGARG)
213     EASY_IF_ERROR_ELSE(first > last , RGXG_ERROR_RANGE) {
214         n = 0;
215         parenthesis = 0;
216         max_num_of_digits = 0;
217         if ((RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options) {
218             max_num_of_digits = rgxg_number_of_digits_long_long(last, base);
219             if (max_num_of_digits < min_length) {
220                 max_num_of_digits = min_length;
221             }
222         }
223         current_last = last;
224 
225         do {
226             max = 0;
227             prefix = current_last/base;
228             prefix_range_first = first/base != current_last/base ? 0 : first%base;
229             prefix_range_last = current_last%base;
230 
231             power_max = rgxg_power(base, max);
232             power_max_plus_one = power_max * base;
233 
234             while (prefix_range_last == base-1 && (prefix*power_max_plus_one-1 > first) ) {
235                 ++max;
236                 current_last /= base;
237                 prefix = current_last/base;
238                 power_max = power_max_plus_one;
239                 power_max_plus_one *= base;
240                 prefix_range_first =
241                     prefix*power_max_plus_one+!prefix*power_max > first
242                     ? ((((RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options) && first == 0) ? 0 : !prefix)
243                     : first/power_max%base+(prefix*power_max_plus_one+(first/power_max%base)*power_max != first);
244                 prefix_range_last = current_last%base;
245             }
246             current_last = prefix*power_max_plus_one + prefix_range_first*power_max;
247             min = max;
248             if (max && prefix_range_first == 1 && prefix_range_last == base-1 && !((RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options)) {
249                 while (current_last/base && current_last/base >= first) {
250                     min--;
251                     current_last /= base;
252                 }
253             }
254             current_last--;
255 
256             if (!parenthesis && !(RGXG_NOOUTERPARENS&options) && first <= current_last ) {
257                 if (!(max == 1 && first ==0 && ((current_last == base -1
258                     && min == 1) || (current_last == 0 && min == 0)))
259                     ) { /* no parenthesis around 1?[0-9] and [1-9]?[0-9] */
260                     EASY_CHAR('(')
261                     parenthesis = 1;
262                 }
263             }
264 
265             if ((RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options) {
266                 number_of_leading_zeros = max_num_of_digits-(prefix ? rgxg_number_of_digits_long_long(prefix, base): 0)-max-1;
267                 EASY_LEADING_ZEROS
268             }
269 
270             if (prefix) {
271                 EASY_NUMBER_ASSIGNMENT(prefix)
272             }
273             if (prefix_range_first == 0 && prefix_range_last == base-1) { /* 55[0-9][0-9] => 55[0-9]{2} */
274                 max++; min++;
275             } else {
276                 EASY_BASE32_RANGE_ASSIGNMENT(prefix_range_first, prefix_range_last)
277             }
278             if (max) { /* handle multiple [0-9] */
279                 if (max == 1 && first ==0 && ((current_last == base -1
280                     && min == 1) || (current_last == 0 && min == 0))
281                     ) { /* 1[0-9]|[0-9] => 1?[0-9] and [1-9][0-9]?|0 -> [1-9]?[0-9] */
282                     EASY_CHAR('?')
283                     EASY_BASE32_RANGE_ASSIGNMENT(0, base-1)
284                     current_last = -1;
285                 } else {
286                     if ((RGXG_VARLEADINGZERO&options) && first == 0 &&
287                             prefix == 0 && prefix_range_first == 0) {
288                         if (prefix_range_last != base-1) {
289                             EASY_CHAR('?')
290                         }
291                         min = 1;
292                     }
293                     EASY_BASE32_RANGE_ASSIGNMENT(0, base-1)
294                     if (max == 1 && min == 0) { /* 5[0-9]{0,1} => 5[0-9]? */
295                         EASY_CHAR('?')
296                     } else if (max > 1) {
297                         EASY_CHAR('{')
298                         if (min != max) {
299                             n += internal_plain_number_base10(min, (regex ? regex+n : NULL));
300                             EASY_CHAR(',')
301                         }
302                         n += internal_plain_number_base10(max, (regex ? regex+n : NULL));
303                         EASY_CHAR('}')
304                     }
305                 }
306             }
307             if (first <= current_last) {
308                 EASY_CHAR('|')
309             }
310         } while (first <= current_last);
311         if (parenthesis) {
312             EASY_CHAR(')')
313         }
314         if (!(RGXG_NONULLBYTE&options) && regex) { regex[n] = '\0'; }
315     }
316     return n;
317 }
318 
rgxg_number_greaterequal(long long number,int base,size_t min_length,char * regex,rgxg_options_t options)319 int rgxg_number_greaterequal (long long number, int base,
320         size_t min_length, char *regex, rgxg_options_t options) {
321     int n, no_power_of_base, min, max, number_of_leading_zeros;
322     size_t count;
323     long long boundary;
324 
325     EASY_VALIDATE_MUTEXOPTIONS(RGXG_LEADINGZERO, RGXG_VARLEADINGZERO)
326     EASY_VALIDATE_MUTEXOPTIONS(RGXG_NOUPPERCASE, RGXG_NOLOWERCASE)
327     EASY_IF_ERROR_ELSE(base < 2 || base > 32, RGXG_ERROR_BASE)
328     EASY_IF_ERROR_ELSE(number < 0, RGXG_ERROR_NEGARG)
329     EASY_IF_ERROR_ELSE(number > rgxg_power(base, rgxg_number_of_digits_long_long(limits[base], base)), RGXG_ERROR_ARG2BIG) {
330         n = 0;
331         count = rgxg_number_of_digits_long_long(number ,base);
332 
333         boundary = rgxg_power(base,count-1);
334         no_power_of_base = (number != boundary) || (RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options; /* ([1-9][0-9]{3,}|[1-9][0-9]{2}) => [1-9][0-9]{2,}  but ([1-9][0-9]+|0[1-9]) => ([1-9][0-9]+|0[1-9]) */
335 
336         if (!(RGXG_NOOUTERPARENS&options) && no_power_of_base) {
337             EASY_CHAR('(')
338         }
339         count -= (!no_power_of_base);
340 
341         max = (RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options && min_length > count+1 ? min_length-1 : count;
342         if (number) { min = count; } else { min = max; };
343         for (int i = max ; i >= min; --i) {
344             if (i < max) { EASY_CHAR('|') }
345             number_of_leading_zeros = max-i;
346             EASY_LEADING_ZEROS
347             EASY_BASE32_RANGE_ASSIGNMENT(1, base-1)
348             EASY_BASE32_RANGE_ASSIGNMENT(0, base-1)
349             if (i == 0 && i == max) { /* [1-9][0-9]{0,} => [1-9][0-9]* */
350                 EASY_CHAR('*')
351             } else if (i == 1 && i == max) { /* [1-9][0-9]{1,} => [1-9][0-9]+ */
352                 EASY_CHAR('+')
353             } else if (i > 1) {
354                 EASY_CHAR('{')
355                 n += internal_plain_number_base10(i, (regex ? regex+n : NULL));
356                 if (i == max) { EASY_CHAR(',') }
357                 EASY_CHAR('}')
358             }
359         }
360         if (no_power_of_base) {
361             boundary *= base;
362             EASY_CHAR('|')
363             if (number > 0 || !((RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options)) { count = max; }
364             n += rgxg_number_range(number, boundary-1, base, count+1,
365                     (regex ? regex+n : NULL), RGXG_NONULLBYTE|RGXG_NOOUTERPARENS|options);
366             if (number == 0 && (RGXG_LEADINGZERO|RGXG_VARLEADINGZERO)&options) {
367                 if (max > 1) {
368                     EASY_CHAR('{')
369                     if ((RGXG_VARLEADINGZERO)&options) {
370                         EASY_CHAR('1')
371                         EASY_CHAR(',')
372                     }
373                     n += internal_plain_number_base10(max, (regex ? regex+n : NULL));
374                     EASY_CHAR('}')
375                 }
376             }
377             if (!(RGXG_NOOUTERPARENS&options)) {
378                 EASY_CHAR(')')
379             }
380         }
381         if (!(RGXG_NONULLBYTE&options) && regex) { regex[n] = '\0'; }
382     }
383 
384     return n;
385 }
386