1 /*
2     massip-parse
3 
4     This module parses IPv4 and IPv6 addresses.
5 
6     It's not a typical parser. It's optimized around parsing large
7     files containing millions of addresses and ranges using a
8     "state-machine parser".
9 */
10 #include "massip.h"
11 #include "massip-parse.h"
12 #include "massip-rangesv4.h"
13 #include "massip-rangesv6.h"
14 #include "logger.h"
15 #include "util-bool.h"
16 #include "util-malloc.h"
17 #include "string_s.h"
18 #include "unusedparm.h"
19 
20 #include <string.h>
21 
22 struct massip_parser
23 {
24     unsigned long long line_number;
25     unsigned long long char_number;
26     unsigned state;
27     unsigned tmp;
28     unsigned char digit_count;
29     unsigned addr;
30     unsigned begin;
31     unsigned end;
32     struct {
33         ipv6address _begin;
34         ipv6address _end;
35         unsigned short tmp[8];
36         unsigned char index;
37         unsigned char ellision_index;
38         unsigned is_bracket:1;
39         unsigned is_second:1;
40     } ipv6;
41 };
42 
43 /***************************************************************************
44  ***************************************************************************/
45 static struct massip_parser *
_parser_init(struct massip_parser * p)46 _parser_init(struct massip_parser *p)
47 {
48     memset(p, 0, sizeof(*p));
49     p->line_number = 1;
50     p->ipv6.ellision_index = 8;
51     return p;
52 }
53 
/***************************************************************************
 * Counterpart of _parser_init(). The parser owns no resources, so this
 * does nothing; it exists so that callers have a symmetric init/destroy
 * pair to call.
 ***************************************************************************/
static void
_parser_destroy(struct massip_parser *p)
{
    /* Nothing to release; just silence the unused-parameter warning */
    UNUSEDPARM(p);
}
61 
62 /***************************************************************************
63  ***************************************************************************/
64 static void
_parser_err(struct massip_parser * p,unsigned long long * line_number,unsigned long long * charindex)65 _parser_err(struct massip_parser *p, unsigned long long *line_number, unsigned long long *charindex)
66 {
67     *line_number = p->line_number;
68     *charindex = p->char_number;
69 }
70 
71 /**
72  * Called before parsing the first address in a pair, and also
73  * after the first address, to prepare for parsing the next
74  * address
75  */
76 static void
_init_next_address(struct massip_parser * p,int is_second)77 _init_next_address(struct massip_parser *p, int is_second)
78 {
79     p->tmp = 0;
80     p->ipv6.ellision_index = 8;
81     p->ipv6.index = 0;
82     p->ipv6.is_bracket = 0;
83     p->digit_count = 0;
84     p->ipv6.is_second = is_second;
85 }
86 
87 
88 
89 static unsigned
_parser_finish_ipv6(struct massip_parser * p)90 _parser_finish_ipv6(struct massip_parser *p)
91 {
92     unsigned index = p->ipv6.index;
93     unsigned ellision = p->ipv6.ellision_index;
94 
95 
96     /* We must have seen 8 numbers, or an ellision */
97     if (index < 8 && ellision >= 8)
98         return 1;
99 
100     /* Handle ellision */
101     memmove(
102         &p->ipv6.tmp[8-(index-ellision)],
103         &p->ipv6.tmp[ellision],
104         sizeof(p->ipv6.tmp[0]) * (index-ellision)
105         );
106     memset(
107         &p->ipv6.tmp[ellision],
108         0,
109         sizeof(p->ipv6.tmp[0]) * (8 - index)
110     );
111 
112     /* Copy over to begin/end. We parse the address as a series of 16-bit
113      * integers, but return the result as two 64-bit integers */
114     {
115         ipv6address a;
116         a.hi = (uint64_t)p->ipv6.tmp[0] << 48ULL
117                 | (uint64_t)p->ipv6.tmp[1] << 32ULL
118                 | (uint64_t)p->ipv6.tmp[2] << 16ULL
119                 | (uint64_t)p->ipv6.tmp[3] << 0ULL;
120         a.lo = (uint64_t)p->ipv6.tmp[4] << 48ULL
121                 | (uint64_t)p->ipv6.tmp[5] << 32ULL
122                 | (uint64_t)p->ipv6.tmp[6] << 16ULL
123                 | (uint64_t)p->ipv6.tmp[7] << 0ULL;
124         if (p->ipv6.is_second)
125             p->ipv6._end = a;
126         else {
127             p->ipv6._begin = a;
128 
129             /* Set this here in case there is no 'end' address */
130             p->ipv6._end = a;
131         }
132     }
133 
134     /* Reset the parser to start parsing the next address */
135     _init_next_address(p, 1);
136 
137     return 0;
138 }
139 
140 /***************************************************************************
141  * We store the IPv6 addresses that we are building inside the 'state'
142  * of the state-machine. This function copies them out of the opaque
143  * state into discrete values.
144  ***************************************************************************/
145 static void
_parser_get_ipv6(struct massip_parser * state,ipv6address * begin,ipv6address * end)146 _parser_get_ipv6(struct massip_parser *state, ipv6address *begin, ipv6address *end)
147 {
148     *begin = state->ipv6._begin;
149     *end = state->ipv6._end;
150 }
151 
/**
 * The states of the address parser.
 *
 * The ORDER of these values matters and must not be changed:
 *  - LINE_START must be zero, because a zeroed parser starts there and
 *    the parser resets itself with a literal 0.
 *  - NUMBER0..NUMBER3 and SECOND0..SECOND3 must each be consecutive,
 *    because the parser advances from one octet to the next with 'state++'.
 */
enum parser_state_t {
    /* Between addresses */
    LINE_START,
    ADDR_START,
    COMMENT,

    /* The four octets of the first IPv4 address */
    NUMBER0,
    NUMBER1,
    NUMBER2,
    NUMBER3,
    NUMBER_ERR,

    /* The four octets of the second IPv4 address of a range */
    SECOND0,
    SECOND1,
    SECOND2,
    SECOND3,
    SECOND_ERR,

    /* The prefix length following "a.b.c.d/" */
    IPV4_CIDR_NUM,

    /* The 2nd and 3rd bytes of a UTF-8 encoded dash */
    UNIDASH1,
    UNIDASH2,

    /* IPv6 addresses */
    IPV6_BEGIN,
    IPV6_COLON,
    IPV6_CIDR,
    IPV6_CIDR_NUM,
    IPV6_NEXT,
    IPV6_END,

    ERROR
};
164 
165 /***************************************************************************
166  * When we start parsing an address, we don't know whether it's going to
167  * be IPv4 or IPv6. We assume IPv4, but when we hit a condition indicating
168  * that it's IPv6 instead, we need change the temporary number we
169  * are working on from decimal to hex, then move from the middle of
170  * parsing an IPv4 address to the middle of parsing an IPv6 address.
171  ***************************************************************************/
172 static int
_switch_to_ipv6(struct massip_parser * p,int old_state)173 _switch_to_ipv6(struct massip_parser *p, int old_state)
174 {
175     unsigned num = p->tmp;
176 
177     num = ((num/1000)%10) * 16 * 16 * 16
178         + ((num/100)%10) * 16 * 16
179         + ((num/10)%10) * 16
180         + (num % 10);
181 
182     //printf("%u -> 0x%x\n", p->tmp, num);
183     p->tmp = num;
184     return old_state;
185 }
186 
187 
/**
 * One row per octet position of a dotted IPv4 address: a first, second and
 * third digit, then the separator that follows. The 'IPV4e_' values repeat
 * the same layout a second time.
 * NOTE(review): nothing in the visible part of this file refers to these
 * values; presumably 'e' stands for the 'end' address of a range -- confirm
 * before relying on that.
 */
enum {
    IPV4_n,
    IPV4_nn,
    IPV4_nnn,
    IPV4_nnn_,

    IPV4_nnn_n,
    IPV4_nnn_nn,
    IPV4_nnn_nnn,
    IPV4_nnn_nnn_,

    IPV4_nnn_nnn_n,
    IPV4_nnn_nnn_nn,
    IPV4_nnn_nnn_nnn,
    IPV4_nnn_nnn_nnn_,

    IPV4_nnn_nnn_nnn_n,
    IPV4_nnn_nnn_nnn_nn,
    IPV4_nnn_nnn_nnn_nnn,
    IPV4_nnn_nnn_nnn_nnn_,

    IPV4e_n,
    IPV4e_nn,
    IPV4e_nnn,
    IPV4e_nnn_,

    IPV4e_nnn_n,
    IPV4e_nnn_nn,
    IPV4e_nnn_nnn,
    IPV4e_nnn_nnn_,

    IPV4e_nnn_nnn_n,
    IPV4e_nnn_nnn_nn,
    IPV4e_nnn_nnn_nnn,
    IPV4e_nnn_nnn_nnn_,

    IPV4e_nnn_nnn_nnn_n,
    IPV4e_nnn_nnn_nnn_nn,
    IPV4e_nnn_nnn_nnn_nnn,
    IPV4e_nnn_nnn_nnn_nnn_,
};
200 
201 
/**
 * Applies a CIDR prefix length to an IPv4 address to create the begin/end
 * addresses of the corresponding range.
 *
 * @param begin
 *      An in/out parameter. On entry, any address within the range. On
 *      return, the host bits have been cleared, giving the first address
 *      of the range.
 * @param end
 *      An out parameter. Receives the last address of the range, meaning
 *      the first address with all of the host bits set, so that
 *      192.168.1.0/24 has an ending address of 192.168.1.255.
 * @param bitcount
 *      The number of leading bits that make up the network prefix. The
 *      caller in this file only passes values in the range [0..32].
 */
static void
_ipv4_apply_cidr(unsigned *begin, unsigned *end, unsigned bitcount)
{
    /* The mask is built by shifting a 64-bit value and keeping the low 32
     * bits. This yields an all-zero mask for a bitcount of 0 without ever
     * shifting a 32-bit value by 32 bits. */
    const unsigned long long wide = 0xFFFFFFFF00000000ULL >> bitcount;
    const unsigned network_bits = (unsigned)wide;
    const unsigned host_bits = (unsigned)~wide;
    const unsigned first = *begin & network_bits;

    *begin = first;
    *end = first | host_bits;
}
217 
218 /**
219  * Given an address 'being' and a 'prefix', return the 'begin' and 'end' address of the range.
220  * @param begin
221  *      An in/out parameter. This may have some extra bits somewhere in the range.
222  *      These will be masked off and set to zero when the function returns.
223  * @param end
224  *      An out prameter. This will be set to the last address of the range, meaning
225  *      that all the trailing bits will be set to '1'.
226  * @parame prefix
227  *      The number of bits of the prefix, from [0..128]. If the value is 0,
228  *      then the 'begin' address will be set to all zeroes and the 'end'
229  *      address will be set to all ones. If the value is 128,
230  *      the 'begin' address is unchanged and hte 'end' address
231  *      is set to the same as 'begin'.
232  */
233 static void
_ipv6_apply_cidr(ipv6address * begin,ipv6address * end,unsigned prefix)234 _ipv6_apply_cidr(ipv6address *begin, ipv6address *end, unsigned prefix)
235 {
236     ipv6address mask;
237 
238     /* For bad prefixes, make sure we return an invalid address */
239     if (prefix > 128) {
240         static const ipv6address invalid = {~0ULL, ~0ULL};
241         *begin = invalid;
242         *end = invalid;
243         return;
244     };
245 
246     /* Create the mask from the prefix */
247     if (prefix > 64)
248         mask.hi = ~0ULL;
249     else if (prefix == 0)
250         mask.hi = 0;
251     else
252         mask.hi = ~0ULL << (64 - prefix);
253 
254     if (prefix > 64)
255         mask.lo = ~0ULL << (128 - prefix);
256     else
257         mask.lo = 0;
258 
259     /* Mask off any non-zero bits from the start
260      * TODO print warning */
261     begin->hi &= mask.hi;
262     begin->lo &= mask.lo;
263 
264     /* Set all suffix bits to 1, so that 192.168.1.0/24 has
265      * an ending address of 192.168.1.255. */
266     end->hi = begin->hi | ~mask.hi;
267     end->lo = begin->lo | ~mask.lo;
268 }
269 
270 /***************************************************************************
271  * Parse the next IPv4/IPv6 address from a text stream, using a
272  * 'state-machine parser'.
273  ***************************************************************************/
274 static enum {Still_Working, Found_Error, Found_IPv4, Found_IPv6}
_parser_next(struct massip_parser * p,const char * buf,size_t * r_offset,size_t length,unsigned * r_begin,unsigned * r_end)275 _parser_next(struct massip_parser *p, const char *buf, size_t *r_offset, size_t length,
276                 unsigned *r_begin, unsigned *r_end)
277 {
278     size_t i;
279     enum parser_state_t state = p->state;
280     int result = Still_Working;
281 
282     /* The 'offset' parameter is optional. If NULL, then set it to zero */
283     if (r_offset)
284         i = *r_offset;
285     else
286         i = 0;
287 
288     /* For all bytes in this chunk. This loop will exit early once
289      * we've found a complete IP address. */
290     while (i < length) {
291         unsigned char c = buf[i++];
292 
293         p->char_number++;
294         switch (state) {
295             case LINE_START:
296             case ADDR_START:
297                 _init_next_address(p, 0);
298                 switch (c) {
299                     case ' ': case '\t': case '\r':
300                         /* ignore leading whitespace */
301                         continue;
302                     case '\n':
303                         p->line_number++;
304                         p->char_number = 0;
305                         continue;
306                     case '#': case ';': case '/': case '-':
307                         state = COMMENT;
308                         continue;
309 
310                     case '0': case '1': case '2': case '3': case '4':
311                     case '5': case '6': case '7': case '8': case '9':
312                         p->tmp = (c - '0');
313                         p->digit_count = 1;
314                         state = NUMBER0;
315                         break;
316                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
317                         p->tmp = (c - 'a' + 10);
318                         p->digit_count = 1;
319                         state = IPV6_BEGIN;
320                         break;
321                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
322                         p->tmp = (c - 'A' + 10);
323                         p->digit_count = 1;
324                         state = IPV6_BEGIN;
325                         break;
326                     case ':':
327                         p->ipv6.tmp[p->ipv6.index++] = 0;
328                         state = IPV6_COLON;
329                         break;
330                     case '[':
331                         p->ipv6.is_bracket = 1;
332                         state = IPV6_BEGIN;
333                         break;
334                     default:
335                         state = ERROR;
336                         length = i; /* break out of loop */
337                         break;
338                 }
339                 break;
340             case IPV6_CIDR:
341                 p->digit_count = 0;
342                 p->tmp = 0;
343                 switch (c) {
344                     case '0': case '1': case '2': case '3': case '4':
345                     case '5': case '6': case '7': case '8': case '9':
346                         p->tmp = (c - '0');
347                         p->digit_count = 1;
348                         state = IPV6_CIDR_NUM;
349                         break;
350                     default:
351                         state = ERROR;
352                         length = i; /* break out of loop */
353                         break;
354                 }
355                 break;
356 
357             case IPV6_COLON:
358                 p->digit_count = 0;
359                 p->tmp = 0;
360                 if (c == ':') {
361                     if (p->ipv6.ellision_index < 8) {
362                         state = ERROR;
363                         length = i;
364                     } else {
365                         p->ipv6.ellision_index = p->ipv6.index;
366                         state = IPV6_COLON;
367                     }
368                     break;
369                 }
370                 state = IPV6_BEGIN;
371 
372                 /* drop down */
373             case IPV6_BEGIN:
374             case IPV6_NEXT:
375                 switch (c) {
376                     case '0': case '1': case '2': case '3': case '4':
377                     case '5': case '6': case '7': case '8': case '9':
378                         if (p->digit_count >= 4) {
379                             state = ERROR;
380                             length = i;
381                         } else {
382                             p->tmp = p->tmp * 16 + (c - '0');
383                             p->digit_count++;
384                         }
385                         break;
386                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
387                         if (p->digit_count >= 4) {
388                             state = ERROR;
389                             length = i;
390                         } else {
391                             p->tmp = p->tmp * 16 + (c - 'a' + 10);
392                             p->digit_count++;
393                         }
394                         break;
395                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
396                         if (p->digit_count >= 4) {
397                             state = ERROR;
398                             length = i;
399                         } else {
400                             p->tmp = p->tmp * 16 + (c - 'A' + 10);
401                             p->digit_count++;
402                         }
403                         break;
404                     case ':':
405                         if (p->ipv6.index >= 8) {
406                             state = ERROR;
407                             length = i;
408                         } else {
409                             p->ipv6.tmp[p->ipv6.index++] = (unsigned short)p->tmp;
410                             state = IPV6_COLON;
411                         }
412                         break;
413                     case ']':
414                         if (!p->ipv6.is_bracket) {
415                             state = ERROR;
416                             length = i;
417                         } else {
418                             state = IPV6_END;
419                         }
420                         break;
421                     case '[':
422                         if (p->ipv6.is_bracket) {
423                             state = ERROR;
424                             length = i;
425                         } else {
426                             p->ipv6.is_bracket = 1;
427                         }
428                         break;
429                     case '/':
430                     case ' ':
431                     case '\t':
432                     case '\r':
433                     case '\n':
434                     case ',':
435                     case '-':
436                         i--; /* push back */
437                         state = IPV6_END;
438                         continue;
439                     default:
440                         state = ERROR;
441                         length = i;
442                         break;
443                 }
444                 break;
445 
446             case IPV6_END:
447                 /* Finish off the trailing number */
448                 p->ipv6.tmp[p->ipv6.index++] = (unsigned short)p->tmp;
449 
450                 /* Do the final processing of this IPv6 address and
451                  * and prepair for the next one */
452                 if (_parser_finish_ipv6(p) != 0) {
453                     state = ERROR;
454                     length = i;
455                     continue;
456                 }
457 
458                 /* Now decide the next state, whether this is a single
459                  * address, an address range, or a CIDR address */
460                 switch (c) {
461                     case '/':
462                         result = Still_Working;
463                         state = IPV6_CIDR;
464                         break;
465                     case '-':
466                         result = Still_Working;
467                         state = IPV6_NEXT;
468                         break;
469                     case '\n':
470                         p->line_number++;
471                         p->char_number = 0;
472                         /* drop down */
473                     case ' ':
474                     case '\t':
475                     case '\r':
476                     case ',':
477                         result = Found_IPv6;
478                         state = 0;
479                         length = i; /* shortend the end to break out of loop */
480                         break;
481                     default:
482                         state = ERROR;
483                         length = i;
484                         break;
485                 }
486                 break;
487             case COMMENT:
488                 if (c == '\n') {
489                     state = LINE_START;
490                     p->line_number++;
491                     p->char_number = 0;
492                 } else
493                     state = COMMENT;
494                 break;
495             case IPV6_CIDR_NUM:
496                 switch (c) {
497                     case '0': case '1': case '2': case '3': case '4':
498                     case '5': case '6': case '7': case '8': case '9':
499                         if (p->digit_count == 4) {
500                             state = ERROR;
501                             length = i; /* break out of loop */
502                         } else {
503                             p->digit_count++;
504                             p->tmp = p->tmp * 10 + (c - '0');
505                             if (p->tmp > 128) {
506                                 state = ERROR;
507                                 length = i;
508                             }
509                             continue;
510                         }
511                         break;
512                     case ':':
513                     case ',':
514                     case ' ':
515                     case '\t':
516                     case '\r':
517                     case '\n':
518                         {
519                             _ipv6_apply_cidr(&p->ipv6._begin, &p->ipv6._end, p->tmp);
520 
521                             state = ADDR_START;
522                             length = i; /* break out of loop */
523                             if (c == '\n') {
524                                 p->line_number++;
525                                 p->char_number = 0;
526                             }
527                             *r_begin = p->begin;
528                             *r_end = p->end;
529                             result = Found_IPv6;
530                         }
531                         break;
532                     default:
533                         state = ERROR;
534                         length = i; /* break out of loop */
535                         break;
536                 }
537                 break;
538             case IPV4_CIDR_NUM:
539                 switch (c) {
540                     case '0': case '1': case '2': case '3': case '4':
541                     case '5': case '6': case '7': case '8': case '9':
542                         if (p->digit_count == 3) {
543                             state = ERROR;
544                             length = i; /* break out of loop */
545                         } else {
546                             p->digit_count++;
547                             p->tmp = p->tmp * 10 + (c - '0');
548                             if (p->tmp > 32) {
549                                 state = ERROR;
550                                 length = i;
551                             }
552                             continue;
553                         }
554                         break;
555                     case ':':
556                     case ',':
557                     case ' ':
558                     case '\t':
559                     case '\r':
560                     case '\n':
561                         {
562                             _ipv4_apply_cidr(&p->begin, &p->end, p->tmp);
563                             state = ADDR_START;
564                             length = i; /* break out of loop */
565                             if (c == '\n') {
566                                 p->line_number++;
567                                 p->char_number = 0;
568                             }
569                             *r_begin = p->begin;
570                             *r_end = p->end;
571                             result = Found_IPv4;
572                         }
573                         break;
574                     default:
575                         state = ERROR;
576                         length = i; /* break out of loop */
577                         break;
578                 }
579                 break;
580 
581             case UNIDASH1:
582                 if (c == 0x80)
583                     state = UNIDASH2;
584                 else {
585                     state = ERROR;
586                     length = i; /* break out of loop */
587                 }
588                 break;
589             case UNIDASH2:
590                 /* This covers:
591                  * U+2010 HYPHEN
592                  * U+2011 NON-BREAKING HYPHEN
593                  * U+2012 FIGURE DASH
594                  * U+2013 EN DASH
595                  * U+2014 EM DASH
596                  * U+2015 HORIZONTAL BAR
597                  */
598                 if (c < 0x90 || 0x95 < c) {
599                     state = ERROR;
600                     length = i; /* break out of loop */
601                 } else {
602                     c = '-';
603                     state = NUMBER3;
604                     /* drop down */
605                 }
606 
607 
608             case NUMBER0:
609             case NUMBER1:
610             case NUMBER2:
611             case NUMBER3:
612             case SECOND0:
613             case SECOND1:
614             case SECOND2:
615             case SECOND3:
616                 switch (c) {
617                     case '.':
618                         p->addr = (p->addr << 8) | p->tmp;
619                         p->tmp = 0;
620                         p->digit_count = 0;
621                         if (state == NUMBER3 || state == SECOND3) {
622                             length = i;
623                             state = ERROR;
624                         } else
625                             state++;
626                         break;
627                     case '0': case '1': case '2': case '3': case '4':
628                     case '5': case '6': case '7': case '8': case '9':
629                         p->digit_count++;
630                         p->tmp = p->tmp * 10 + (c - '0');
631                         if (p->tmp > 255 || p->digit_count > 3) {
632                             if (state == NUMBER0) {
633                                 /* Assume that we've actually got an
634                                  * IPv6 number */
635                                 _switch_to_ipv6(p, state);
636                                 state = IPV6_BEGIN;
637                             } else {
638                                 state = ERROR;
639                                 length = i;
640                             }
641                         }
642                         continue;
643                         break;
644                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
645                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
646                         if (state == NUMBER0 || state == SECOND0) {
647                             /* Assume that we've actually got an
648                              * IPv6 number */
649                             _switch_to_ipv6(p, state);
650                             state = IPV6_BEGIN;
651                             i--; /* go back one character */
652                         } else {
653                             state = ERROR;
654                             length = i; /* break out of loop */
655                         }
656                         break;
657                     case 0xe2:
658                         if (state == NUMBER3) {
659                             state = UNIDASH1;
660                         } else {
661                             state = ERROR;
662                             length = i; /* break out of loop */
663                         }
664                         break;
665                     case '-':
666                     case 0x96: /* long dash, comes from copy/pasting into exclude files */
667                         if (state == NUMBER3) {
668                             p->begin = (p->addr << 8) | p->tmp;
669                             p->tmp = 0;
670                             p->digit_count = 0;
671                             p->addr = 0;
672                             state = SECOND0;
673                         } else {
674                             state = NUMBER_ERR;
675                             length = i;
676                         }
677                         break;
678                     case '/':
679                         if (state == NUMBER3) {
680                             p->begin = (p->addr << 8) | p->tmp;
681                             p->tmp = 0;
682                             p->digit_count = 0;
683                             p->addr = 0;
684                             state = IPV4_CIDR_NUM;
685                         } else {
686                             state = NUMBER_ERR;
687                             length = i; /* break out of loop */
688                         }
689                         break;
690                     case ':':
691                         if (state == NUMBER0) {
692                             /* Assume this is an IPv6 address instead of an IPv4 address */
693                             _switch_to_ipv6(p, state);
694                             state = IPV6_BEGIN;
695                             i--;
696                             break;
697                         }
698                     case ',':
699                     case ' ':
700                     case '\t':
701                     case '\r':
702                     case '\n':
703                         if (state == NUMBER3) {
704                             p->begin = (p->addr << 8) | p->tmp;
705                             p->end = p->begin;
706                             p->tmp = 0;
707                             p->digit_count = 0;
708                             p->addr = 0;
709                             state = ADDR_START;
710                             length = i; /* break out of loop */
711                             if (c == '\n') {
712                                 p->line_number++;
713                                 p->char_number = 0;
714                             }
715                             *r_begin = p->begin;
716                             *r_end = p->end;
717                             result = Found_IPv4;
718                         } else if (state == SECOND3) {
719                             p->end = (p->addr << 8) | p->tmp;
720                             p->tmp = 0;
721                             p->digit_count = 0;
722                             p->addr = 0;
723                             state = ADDR_START;
724                             length = i; /* break out of loop */
725                             if (c == '\n') {
726                                 p->line_number++;
727                                 p->char_number = 0;
728                             }
729                             *r_begin = p->begin;
730                             *r_end = p->end;
731                             result = Found_IPv4;
732                         } else {
733                             state = NUMBER_ERR;
734                             length = i;
735                         }
736                         break;
737                     default:
738                         state = ERROR;
739                         length = i; /* break out of loop */
740                         break;
741                 }
742                 break;
743 
744             default:
745             case ERROR:
746             case NUMBER_ERR:
747             case SECOND_ERR:
748                 state = ERROR;
749                 length = i; /* break */
750                 break;
751         }
752     }
753 
754     /* The 'offset' parameter is optional. If NULL, then
755      * we don't return a value */
756     if (r_offset)
757         *r_offset = i;
758 
759     p->state = state;
760     if (state == ERROR || state == NUMBER_ERR || state == SECOND_ERR)
761         result = Found_Error;
762     return result;
763 }
764 
765 
766 /***************************************************************************
767  * Test errors. We should get exactly which line-number and which character
768  * in the line caused the error
769  ***************************************************************************/
770 static int
rangefile_test_error(const char * buf,unsigned long long in_line_number,unsigned long long in_char_number,unsigned which_test)771 rangefile_test_error(const char *buf, unsigned long long in_line_number, unsigned long long in_char_number, unsigned which_test)
772 {
773     size_t length = strlen(buf);
774     size_t offset = 0;
775     struct massip_parser p[1];
776     unsigned out_begin = 0xa3a3a3a3;
777     unsigned out_end  = 0xa3a3a3a3;
778     unsigned long long out_line_number;
779     unsigned long long out_char_number;
780     int x;
781 
782     /* test the entire buffer */
783     _parser_init(p);
784     x = _parser_next(p, buf, &offset, length, &out_begin, &out_end);
785     if (x != Found_Error)
786         goto fail;
787     _parser_err(p, &out_line_number, &out_char_number);
788     if (in_line_number != out_line_number || in_char_number != out_char_number)
789         goto fail;
790 
791     /* test one byte at a time */
792     _parser_destroy(p);
793     _parser_init(p);
794     offset = 0;
795     out_begin = 0xa3a3a3a3;
796     out_end  = 0xa3a3a3a3;
797 
798     x = 0;
799     while (offset < length) {
800         x = _parser_next(p, buf, &offset, offset+1, &out_begin, &out_end);
801         if (x == Found_Error)
802             break;
803     }
804     if (x != Found_Error)
805         goto fail;
806     _parser_err(p, &out_line_number, &out_char_number);
807 
808     if (in_line_number != out_line_number || in_char_number != out_char_number)
809         goto fail;
810 
811     _parser_destroy(p);
812     return 0;
813 fail:
814     _parser_destroy(p);
815     fprintf(stderr, "[-] rangefile test fail, line=%u\n", which_test);
816     return 1;
817 }
818 
819 /***************************************************************************
820  ***************************************************************************/
821 int
massip_parse_file(struct MassIP * massip,const char * filename)822 massip_parse_file(struct MassIP *massip, const char *filename)
823 {
824     struct RangeList *targets_ipv4 = &massip->ipv4;
825     struct Range6List *targets_ipv6 = &massip->ipv6;
826     struct massip_parser p[1];
827     char buf[65536];
828     FILE *fp = NULL;
829     int err;
830     bool is_error = false;
831     unsigned addr_count = 0;
832     unsigned long long line_number, char_number;
833 
834     /*
835      * Open the file containing IP addresses, which can potentially be
836      * many megabytes in size
837      */
838     err = fopen_s(&fp, filename, "rb");
839     if (err || fp == NULL) {
840         perror(filename);
841         exit(1);
842     }
843 
844     /*
845      * Create a parser for reading in the IP addresses using a state
846      * machine parser
847      */
848     _parser_init(p);
849 
850     /*
851      * Read in the data a block at a time, parsing according to the state
852      * machine.
853      */
854     while (!is_error) {
855         size_t count;
856         size_t offset;
857 
858         count = fread(buf, 1, sizeof(buf), fp);
859         if (count <= 0)
860             break;
861 
862         offset = 0;
863         while (offset < count) {
864             unsigned begin, end;
865 
866             err = _parser_next(p, buf, &offset, count, &begin, &end);
867             switch (err) {
868             case Still_Working:
869                 if (offset < count) {
870                     /* We reached this somehow in the middle of the buffer, but
871                      * this return is only possible at the end of the buffer */
872                     fprintf(stderr, "[-] rangeparse_next(): unknown coding failure\n");
873                 }
874                 break;
875             case Found_Error:
876             default:
877                 _parser_err(p, &line_number, &char_number);
878                 fprintf(stderr, "[-] %s:%llu:%llu: invalid IP address on line #%llu\n", filename, line_number, char_number, line_number);
879                 is_error = true;
880                 count = offset;
881                 break;
882             case Found_IPv4:
883                 rangelist_add_range(targets_ipv4, begin, end);
884                 addr_count++;
885                 break;
886             case Found_IPv6:
887                 {
888                     ipv6address found_begin, found_end;
889                     _parser_get_ipv6(p, &found_begin, &found_end);
890                     range6list_add_range(targets_ipv6, found_begin, found_end);
891                     addr_count++;
892                 }
893                 break;
894             }
895         }
896     }
897     fclose(fp);
898 
899     /* In case the file doesn't end with a newline '\n', then artificially
900      * add one to the end. This is just a repeat of the code above */
901     if (!is_error) {
902         size_t offset = 0;
903         unsigned begin, end;
904         err = _parser_next(p, "\n", &offset, 1, &begin, &end);
905         switch (err) {
906         case Still_Working:
907                 break;
908         case Found_Error:
909         default:
910             _parser_err(p, &line_number, &char_number);
911             fprintf(stderr, "[-] %s:%llu:%llu: invalid IP address on line #%llu\n", filename, line_number, char_number, line_number);
912             is_error = true;
913             break;
914         case Found_IPv4:
915             rangelist_add_range(targets_ipv4, begin, end);
916             addr_count++;
917             break;
918         case Found_IPv6:
919             {
920                 ipv6address found_begin, found_end;
921                 _parser_get_ipv6(p, &found_begin, &found_end);
922                 range6list_add_range(targets_ipv6, found_begin, found_end);
923                 addr_count++;
924             }
925             break;
926         }
927     }
928 
929     LOG(1, "[+] %s: %u addresses read\n", filename, addr_count);
930 
931     /* Target list must be sorted every time it's been changed,
932      * before it can be used */
933     rangelist_sort(targets_ipv4);
934 
935     if (is_error)
936         return -1;  /* fail */
937     else
938         return 0; /* success*/
939 }
940 
941 
942 ipv6address
massip_parse_ipv6(const char * line)943 massip_parse_ipv6(const char *line)
944 {
945     struct massip_parser p[1];
946     size_t count = strlen(line);
947     size_t offset = 0;
948     int err;
949     unsigned begin, end;
950     ipv6address result;
951     ipv6address range;
952 
953     _parser_init(p);
954     err = _parser_next(p, line, &offset, count, &begin, &end);
955 again:
956     switch (err) {
957         case Still_Working:
958             if (offset < count) {
959                 /* We reached this somehow in the middle of the buffer, but
960                  * this return is only possible at the end of the buffer */
961                 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
962                 goto fail;
963             } else {
964                 err = _parser_next(p, "\n", 0, 1, &begin, &end);
965                 if (err == Still_Working) {
966                     fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
967                     goto fail;
968                 } else {
969                     goto again;
970                 }
971             }
972             break;
973         case Found_Error:
974         default:
975             goto fail;
976         case Found_IPv4:
977             goto fail;
978         case Found_IPv6:
979             _parser_get_ipv6(p, &result, &range);
980             if (!ipv6address_is_equal(result, range))
981                 goto fail;
982             return result;
983     }
984 fail:
985     result.hi = ~0ULL;
986     result.lo = ~0ULL;
987     return result;
988 }
989 
990 unsigned
massip_parse_ipv4(const char * line)991 massip_parse_ipv4(const char *line)
992 {
993     struct massip_parser p[1];
994     size_t count = strlen(line);
995     size_t offset = 0;
996     int err;
997     unsigned begin, end;
998 
999 
1000     _parser_init(p);
1001     err = _parser_next(p, line, &offset, count, &begin, &end);
1002 again:
1003     switch (err) {
1004         case Still_Working:
1005             if (offset < count) {
1006                 /* We reached this somehow in the middle of the buffer, but
1007                  * this return is only possible at the end of the buffer */
1008                 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1009                 goto fail;
1010             } else {
1011                 err = _parser_next(p, "\n", 0, 1, &begin, &end);
1012                 if (err == Still_Working) {
1013                     fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1014                     goto fail;
1015                 } else {
1016                     goto again;
1017                 }
1018             }
1019             break;
1020         case Found_Error:
1021         default:
1022             goto fail;
1023         case Found_IPv6:
1024             goto fail;
1025         case Found_IPv4:
1026             if (begin != end)
1027                 goto fail;
1028             return begin;
1029     }
1030 fail:
1031     return 0xFFFFFFFF;
1032 }
1033 
1034 enum RangeParseResult
massip_parse_range(const char * line,size_t * offset,size_t count,struct Range * ipv4,struct Range6 * ipv6)1035 massip_parse_range(const char *line, size_t *offset, size_t count, struct Range *ipv4, struct Range6 *ipv6)
1036 {
1037     struct massip_parser p[1];
1038     int err;
1039     unsigned begin, end;
1040     size_t tmp_offset = 0;
1041 
1042     /* The 'count' (length of the string) is an optional parameter. If
1043      * zero, and also the offset is NULL, then set it to the string length */
1044     if (count == 0 && offset == NULL)
1045         count = strlen(line);
1046 
1047     /* The offset is an optional parameter. If NULL, then we set
1048      * it to point to a value on the stack instead */
1049     if (offset == NULL)
1050         offset = &tmp_offset;
1051 
1052     /* Creat e parser object */
1053     _parser_init(p);
1054 
1055     /* Parse the next range from the input */
1056     err = _parser_next(p, line, offset, count, &begin, &end);
1057 again:
1058     switch (err) {
1059         case Still_Working:
1060             if (*offset < count) {
1061                 /* We reached this somehow in the middle of the buffer, but
1062                  * this return is only possible at the end of the buffer */
1063                 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1064                 return Bad_Address;
1065             } else {
1066                 err = _parser_next(p, "\n", 0, 1, &begin, &end);
1067                 if (err == Still_Working) {
1068                     fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1069                     return Bad_Address;
1070                 } else {
1071                     goto again;
1072                 }
1073             }
1074             break;
1075         case Found_Error:
1076         default:
1077             return Bad_Address;
1078         case Found_IPv4:
1079             ipv4->begin = begin;
1080             ipv4->end = end;
1081             return Ipv4_Address;
1082         case Found_IPv6:
1083             _parser_get_ipv6(p, &ipv6->begin, &ipv6->end);
1084             return Ipv6_Address;
1085     }
1086 }
1087 
1088 /**
1089  * This tests  parsing when addresses/ranges are specified on the command-line
1090  * or configuration files, rather than the other test-cases which test parsing
1091  * when the IP addresses are specified in a file. The thing we are looking for
1092  * here is specifically when users separate addresses with things like
1093  * commas and spaces.
1094  */
1095 static int
selftest_massip_parse_range(void)1096 selftest_massip_parse_range(void)
1097 {
1098     struct testcases {
1099         const char *line;
1100         union {
1101             struct Range ipv4;
1102             struct Range6 ipv6;
1103         } list[4];
1104     } cases[] = {
1105         {"0.0.1.0/24,0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
1106         {"0.0.1.0-0.0.1.255,0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
1107         {"0.0.1.0/24 0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
1108         {0}
1109     };
1110     size_t i;
1111 
1112     for (i=0; cases[i].line; i++) {
1113         size_t length = strlen(cases[i].line);
1114         size_t offset = 0;
1115         size_t j = 0;
1116         struct Range6 range6;
1117         struct Range range4;
1118 
1119         while (offset < length) {
1120             int x;
1121             x = massip_parse_range(cases[i].line, &offset, length, &range4, &range6);
1122             switch (x) {
1123                 default:
1124                 case Bad_Address:
1125                     fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
1126                     return 1;
1127                 case Ipv4_Address:
1128                     if (cases[i].list[j].ipv4.begin != range4.begin
1129                         || cases[i].list[j].ipv4.end != range4.end) {
1130                         fprintf(stdout, "[-] %u.%u.%u.%u - %u.%u.%u.%u\n",
1131                                 (unsigned char)(range4.begin>>24),
1132                                 (unsigned char)(range4.begin>>16),
1133                                 (unsigned char)(range4.begin>> 8),
1134                                 (unsigned char)(range4.begin>> 0),
1135                                 (unsigned char)(range4.end>>24),
1136                                 (unsigned char)(range4.end>>16),
1137                                 (unsigned char)(range4.end>> 8),
1138                                 (unsigned char)(range4.end>> 0)
1139                                 );
1140                         fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
1141                         return 1;
1142                     }
1143                     break;
1144             }
1145             j++;
1146         }
1147 
1148         /* Make sure we have found all the expected cases */
1149         if (cases[i].list[j].ipv4.begin != 0) {
1150             fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
1151             return 1;
1152         }
1153     }
1154     return 0;
1155 }
1156 
1157 
1158 /***************************************************************************
1159  ***************************************************************************/
1160 static int
rangefile6_test_buffer(struct massip_parser * parser,const char * buf,ipv6address expected_begin,ipv6address expected_end)1161 rangefile6_test_buffer(struct massip_parser *parser,
1162                        const char *buf,
1163                        ipv6address expected_begin,
1164                        ipv6address expected_end)
1165 {
1166     size_t length = strlen(buf);
1167     size_t offset = 0;
1168     ipv6address found_begin = {1,2};
1169     ipv6address found_end = {1,2};
1170     unsigned tmp1, tmp2;
1171     int err;
1172 
1173     /* test the entire buffer */
1174     err = _parser_next(parser, buf, &offset, length, &tmp1, &tmp2);
1175     if (err == Still_Working)
1176         err = _parser_next(parser, "\n", 0, 1, &tmp1, &tmp2);
1177     switch (err) {
1178     case Found_IPv6:
1179         /* Extract the resulting IPv6 address from the state structure */
1180         _parser_get_ipv6(parser, &found_begin, &found_end);
1181 
1182         /* Test to see if the parsed address equals the expected address */
1183         if (!ipv6address_is_equal(found_begin, expected_begin)) {
1184             ipaddress_formatted_t fmt1 = ipv6address_fmt(found_begin);
1185             ipaddress_formatted_t fmt2 = ipv6address_fmt(expected_begin);
1186             fprintf(stderr, "[-] begin mismatch: found=[%s], expected=[%s]\n", fmt1.string, fmt2.string);
1187             goto fail;
1188         }
1189         if (!ipv6address_is_equal(found_end, expected_end)) {
1190             ipaddress_formatted_t fmt1 = ipv6address_fmt(found_end);
1191             ipaddress_formatted_t fmt2 = ipv6address_fmt(expected_end);
1192             fprintf(stderr, "[-] end mismatch: found=[%s], expected=[%s]\n", fmt1.string, fmt2.string);
1193             goto fail;
1194         }
1195         break;
1196     case Found_IPv4:
1197         if (expected_begin.hi != 0 || expected_end.hi != 0)
1198             goto fail;
1199         if (tmp1 != expected_begin.lo || tmp2 != expected_end.lo)
1200             goto fail;
1201         break;
1202     case Still_Working:
1203         /* Found a partial address, which is a normal result in the
1204          * real world at buffer boundaries, but which is an error
1205          * here */
1206         goto fail;
1207     case Found_Error:
1208     default:
1209         goto fail;
1210     }
1211 
1212     return 0; /* success */
1213 fail:
1214     return 1; /* failure */
1215 }
1216 
1217 /***************************************************************************
1218  * List of test cases. Each test case contains three parts:
1219  * - the string representation of an address, as read from a file, meaning
1220  *   that it can contain additional things like comment strings
1221  * - the first address of a range, which in the case of IPv6 addresses
1222  *   will be two 64-bit numbers, but an IPv4 address have a high-order
1223  *   number set to zero and the low-order number set to the IPv4 address
1224  * - the second address of a range, which in the case of individual
1225  *   addresses, will be equal to the first number
1226  ***************************************************************************/
1227 struct {
1228     const char *string;
1229     ipv6address begin;
1230     ipv6address end;
1231 } test_cases[] = {
1232     {"[1::1]/126", {0x0001000000000000ULL, 0ULL}, {0x0001000000000000ULL, 3ULL}},
1233     {"1::1/126", {0x0001000000000000ULL, 0ULL}, {0x0001000000000000ULL, 3ULL}},
1234     {"[1::1]-[2::3]", {0x0001000000000000ULL, 1ULL}, {0x0002000000000000ULL, 3ULL}},
1235     {"1::1-2::3", {0x0001000000000000ULL, 1ULL}, {0x0002000000000000ULL, 3ULL}},
1236     {"[1234:5678:9abc:def0:0fed:cba9:8765:4321]", {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}, {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}},
1237     {"22ab::1", {0x22ab000000000000ULL, 1ULL}, {0x22ab000000000000ULL, 1ULL}},
1238     {"240e:33c:2:c080:d08:d0e:b53:e74e", {0x240e033c0002c080ULL, 0x0d080d0e0b53e74eULL}, {0x240e033c0002c080ULL, 0x0d080d0e0b53e74eULL}},
1239     {"2a03:90c0:105::9", {0x2a0390c001050000ULL, 9ULL}, {0x2a0390c001050000ULL, 9ULL}},
1240     {"2a03:9060:0:400::2", {0x2a03906000000400ULL, 2ULL}, {0x2a03906000000400ULL, 2ULL}},
1241     {"2c0f:ff00:0:a:face:b00c:0:a7", {0x2c0fff000000000aULL, 0xfaceb00c000000a7ULL}, {0x2c0fff000000000aULL, 0xfaceb00c000000a7ULL}},
1242     {"2a01:5b40:0:4a01:0:e21d:789f:59b1", {0x2a015b4000004a01ULL, 0x0000e21d789f59b1ULL}, {0x2a015b4000004a01ULL, 0x0000e21d789f59b1ULL}},
1243     {"2001:1200:10::1", {0x2001120000100000ULL, 1ULL}, {0x2001120000100000ULL, 1ULL}},
1244     {"fec0:0:0:ffff::1", {0xfec000000000ffffULL, 1ULL}, {0xfec000000000ffffULL, 1ULL}},
1245     {"1234:5678:9abc:def0:0fed:cba9:8765:4321", {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}, {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}},
1246     {"[1111:2222:3333:4444:5555:6666:7777:8888]", {0x1111222233334444ULL, 0x5555666677778888ULL}, {0x1111222233334444ULL, 0x5555666677778888ULL}},
1247     {"1::1", {0x0001000000000000ULL, 1ULL}, {0x0001000000000000ULL, 1ULL}},
1248     {"1.2.3.4", {0, 0x01020304}, {0, 0x01020304}},
1249     {"#test\n  97.86.162.161" "\x96" "97.86.162.175\n", {0, 0x6156a2a1}, {0, 0x6156a2af}},
1250     {"1.2.3.4/24\n", {0, 0x01020300}, {0, 0x010203ff}},
1251     {" 1.2.3.4-1.2.3.5\n", {0, 0x01020304}, {0, 0x01020305}},
1252     {0,{0,0},{0,0}}
1253 };
1254 
1255 /***************************************************************************
1256  * Called during "make test" to run a regression test over this module.
1257  ***************************************************************************/
1258 int
massip_parse_selftest(void)1259 massip_parse_selftest(void)
1260 {
1261     int x = 0;
1262     size_t i;
1263     struct massip_parser parser[1];
1264 
1265 
1266     /* Run through the test cases, stopping at the first failure */
1267     _parser_init(parser);
1268     for (i=0; test_cases[i].string; i++) {
1269         x += rangefile6_test_buffer(parser,
1270                                     test_cases[i].string,
1271                                     test_cases[i].begin,
1272                                     test_cases[i].end);
1273         if (x) {
1274             fprintf(stderr, "[-] failed: %u: %s\n", (unsigned)i, test_cases[i].string);
1275             break;
1276         }
1277     }
1278     _parser_destroy(parser);
1279 
1280 
1281     /* First, do the single line test */
1282     x += selftest_massip_parse_range();
1283     if (x)
1284         return x;
1285 
1286 
1287     x += rangefile_test_error("#bad ipv4\n 257.1.1.1\n", 2, 5, __LINE__);
1288     x += rangefile_test_error("#bad ipv4\n 1.257.1.1.1\n", 2, 6, __LINE__);
1289     x += rangefile_test_error("#bad ipv4\n 1.10.257.1.1.1\n", 2, 9, __LINE__);
1290     x += rangefile_test_error("#bad ipv4\n 1.10.255.256.1.1.1\n", 2, 13, __LINE__);
1291     x += rangefile_test_error("#bad ipv4\n 1.1.1.1.1\n", 2, 9, __LINE__);
1292 
1293     if (x)
1294        LOG(0, "[-] rangefile_selftest: fail\n");
1295     return x;
1296 }
1297 
1298