1 /*
2 massip-parse
3
4 This module parses IPv4 and IPv6 addresses.
5
6 It's not a typical parser. It's optimized around parsing large
7 files containing millions of addresses and ranges using a
8 "state-machine parser".
9 */
10 #include "massip.h"
11 #include "massip-parse.h"
12 #include "massip-rangesv4.h"
13 #include "massip-rangesv6.h"
14 #include "logger.h"
15 #include "util-bool.h"
16 #include "util-malloc.h"
17 #include "string_s.h"
18 #include "unusedparm.h"
19
20 #include <string.h>
21
22 struct massip_parser
23 {
24 unsigned long long line_number;
25 unsigned long long char_number;
26 unsigned state;
27 unsigned tmp;
28 unsigned char digit_count;
29 unsigned addr;
30 unsigned begin;
31 unsigned end;
32 struct {
33 ipv6address _begin;
34 ipv6address _end;
35 unsigned short tmp[8];
36 unsigned char index;
37 unsigned char ellision_index;
38 unsigned is_bracket:1;
39 unsigned is_second:1;
40 } ipv6;
41 };
42
43 /***************************************************************************
44 ***************************************************************************/
45 static struct massip_parser *
_parser_init(struct massip_parser * p)46 _parser_init(struct massip_parser *p)
47 {
48 memset(p, 0, sizeof(*p));
49 p->line_number = 1;
50 p->ipv6.ellision_index = 8;
51 return p;
52 }
53
/***************************************************************************
 * Counterpart of _parser_init(). The parser owns no resources, so there
 * is nothing to release; the function exists to keep init/destroy calls
 * paired at the call sites.
 ***************************************************************************/
static void
_parser_destroy(struct massip_parser *p)
{
    /* nothing to free; only silence the unused-parameter warning */
    (void)p;
}
61
62 /***************************************************************************
63 ***************************************************************************/
64 static void
_parser_err(struct massip_parser * p,unsigned long long * line_number,unsigned long long * charindex)65 _parser_err(struct massip_parser *p, unsigned long long *line_number, unsigned long long *charindex)
66 {
67 *line_number = p->line_number;
68 *charindex = p->char_number;
69 }
70
71 /**
72 * Called before parsing the first address in a pair, and also
73 * after the first address, to prepare for parsing the next
74 * address
75 */
76 static void
_init_next_address(struct massip_parser * p,int is_second)77 _init_next_address(struct massip_parser *p, int is_second)
78 {
79 p->tmp = 0;
80 p->ipv6.ellision_index = 8;
81 p->ipv6.index = 0;
82 p->ipv6.is_bracket = 0;
83 p->digit_count = 0;
84 p->ipv6.is_second = is_second;
85 }
86
87
88
89 static unsigned
_parser_finish_ipv6(struct massip_parser * p)90 _parser_finish_ipv6(struct massip_parser *p)
91 {
92 unsigned index = p->ipv6.index;
93 unsigned ellision = p->ipv6.ellision_index;
94
95
96 /* We must have seen 8 numbers, or an ellision */
97 if (index < 8 && ellision >= 8)
98 return 1;
99
100 /* Handle ellision */
101 memmove(
102 &p->ipv6.tmp[8-(index-ellision)],
103 &p->ipv6.tmp[ellision],
104 sizeof(p->ipv6.tmp[0]) * (index-ellision)
105 );
106 memset(
107 &p->ipv6.tmp[ellision],
108 0,
109 sizeof(p->ipv6.tmp[0]) * (8 - index)
110 );
111
112 /* Copy over to begin/end. We parse the address as a series of 16-bit
113 * integers, but return the result as two 64-bit integers */
114 {
115 ipv6address a;
116 a.hi = (uint64_t)p->ipv6.tmp[0] << 48ULL
117 | (uint64_t)p->ipv6.tmp[1] << 32ULL
118 | (uint64_t)p->ipv6.tmp[2] << 16ULL
119 | (uint64_t)p->ipv6.tmp[3] << 0ULL;
120 a.lo = (uint64_t)p->ipv6.tmp[4] << 48ULL
121 | (uint64_t)p->ipv6.tmp[5] << 32ULL
122 | (uint64_t)p->ipv6.tmp[6] << 16ULL
123 | (uint64_t)p->ipv6.tmp[7] << 0ULL;
124 if (p->ipv6.is_second)
125 p->ipv6._end = a;
126 else {
127 p->ipv6._begin = a;
128
129 /* Set this here in case there is no 'end' address */
130 p->ipv6._end = a;
131 }
132 }
133
134 /* Reset the parser to start parsing the next address */
135 _init_next_address(p, 1);
136
137 return 0;
138 }
139
140 /***************************************************************************
141 * We store the IPv6 addresses that we are building inside the 'state'
142 * of the state-machine. This function copies them out of the opaque
143 * state into discrete values.
144 ***************************************************************************/
145 static void
_parser_get_ipv6(struct massip_parser * state,ipv6address * begin,ipv6address * end)146 _parser_get_ipv6(struct massip_parser *state, ipv6address *begin, ipv6address *end)
147 {
148 *begin = state->ipv6._begin;
149 *end = state->ipv6._end;
150 }
151
/**
 * States of the address parser. The order matters: the parser advances
 * through NUMBER0..NUMBER3 and SECOND0..SECOND3 by incrementing the state,
 * and LINE_START must be zero because a zeroed parser starts there.
 */
enum parser_state_t {
    /* Waiting for the first character of an address */
    LINE_START,
    ADDR_START,

    /* Skipping everything up to the end of the line */
    COMMENT,

    /* The four octets of the first IPv4 address */
    NUMBER0,
    NUMBER1,
    NUMBER2,
    NUMBER3,
    NUMBER_ERR,

    /* The four octets of the second IPv4 address of a range */
    SECOND0,
    SECOND1,
    SECOND2,
    SECOND3,
    SECOND_ERR,

    /* The prefix length after the '/' of an IPv4 CIDR range */
    IPV4_CIDR_NUM,

    /* Second and third byte of a UTF-8 encoded dash (0xE2 already seen) */
    UNIDASH1,
    UNIDASH2,

    /* Inside an IPv6 group, just after a ':', just after a '/', and
     * inside the prefix length of an IPv6 CIDR range */
    IPV6_BEGIN,
    IPV6_COLON,
    IPV6_CIDR,
    IPV6_CIDR_NUM,

    /* Start of the second IPv6 address of a range, after the '-' */
    IPV6_NEXT,

    /* The text of an IPv6 address is complete; look at what follows */
    IPV6_END,

    /* Invalid input was seen */
    ERROR
};
164
165 /***************************************************************************
166 * When we start parsing an address, we don't know whether it's going to
167 * be IPv4 or IPv6. We assume IPv4, but when we hit a condition indicating
168 * that it's IPv6 instead, we need change the temporary number we
169 * are working on from decimal to hex, then move from the middle of
170 * parsing an IPv4 address to the middle of parsing an IPv6 address.
171 ***************************************************************************/
172 static int
_switch_to_ipv6(struct massip_parser * p,int old_state)173 _switch_to_ipv6(struct massip_parser *p, int old_state)
174 {
175 unsigned num = p->tmp;
176
177 num = ((num/1000)%10) * 16 * 16 * 16
178 + ((num/100)%10) * 16 * 16
179 + ((num/10)%10) * 16
180 + (num % 10);
181
182 //printf("%u -> 0x%x\n", p->tmp, num);
183 p->tmp = num;
184 return old_state;
185 }
186
187
/*
 * Names for the individual character positions of a dotted-decimal IPv4
 * address ("nnn.nnn.nnn.nnn"), first for the starting address and then,
 * with an 'e' in the name, presumably for the ending address of a range.
 * NOTE(review): none of these are referenced in the visible part of this
 * file -- confirm whether they are still needed.
 */
enum {
    IPV4_n,
    IPV4_nn,
    IPV4_nnn,
    IPV4_nnn_,
    IPV4_nnn_n,
    IPV4_nnn_nn,
    IPV4_nnn_nnn,
    IPV4_nnn_nnn_,
    IPV4_nnn_nnn_n,
    IPV4_nnn_nnn_nn,
    IPV4_nnn_nnn_nnn,
    IPV4_nnn_nnn_nnn_,
    IPV4_nnn_nnn_nnn_n,
    IPV4_nnn_nnn_nnn_nn,
    IPV4_nnn_nnn_nnn_nnn,
    IPV4_nnn_nnn_nnn_nnn_,

    IPV4e_n,
    IPV4e_nn,
    IPV4e_nnn,
    IPV4e_nnn_,
    IPV4e_nnn_n,
    IPV4e_nnn_nn,
    IPV4e_nnn_nnn,
    IPV4e_nnn_nnn_,
    IPV4e_nnn_nnn_n,
    IPV4e_nnn_nnn_nn,
    IPV4e_nnn_nnn_nnn,
    IPV4e_nnn_nnn_nnn_,
    IPV4e_nnn_nnn_nnn_n,
    IPV4e_nnn_nnn_nnn_nn,
    IPV4e_nnn_nnn_nnn_nnn,
    IPV4e_nnn_nnn_nnn_nnn_,
};
200
201
/**
 * Applies a CIDR prefix length to an IPv4 address to create the begin/end
 * addresses of the range.
 * @param begin
 *      An in/out parameter. On return the host bits have been cleared.
 * @param end
 *      An out parameter. Set to 'begin' with all host bits set, so that
 *      192.168.1.0/24 has an ending address of 192.168.1.255.
 * @param bitcount
 *      The prefix length. The parser only passes values in [0..32].
 */
static void
_ipv4_apply_cidr(unsigned *begin, unsigned *end, unsigned bitcount)
{
    /* Shifting a 64-bit pattern keeps /0 and /32 well defined; the low
     * 32 bits of the shifted value are the network mask. */
    const unsigned netmask = (unsigned)(0xFFFFFFFF00000000ULL >> bitcount);
    const unsigned first = *begin & netmask;

    *begin = first;
    *end = first | ~netmask;
}
217
218 /**
219 * Given an address 'being' and a 'prefix', return the 'begin' and 'end' address of the range.
220 * @param begin
221 * An in/out parameter. This may have some extra bits somewhere in the range.
222 * These will be masked off and set to zero when the function returns.
223 * @param end
224 * An out prameter. This will be set to the last address of the range, meaning
225 * that all the trailing bits will be set to '1'.
226 * @parame prefix
227 * The number of bits of the prefix, from [0..128]. If the value is 0,
228 * then the 'begin' address will be set to all zeroes and the 'end'
229 * address will be set to all ones. If the value is 128,
230 * the 'begin' address is unchanged and hte 'end' address
231 * is set to the same as 'begin'.
232 */
233 static void
_ipv6_apply_cidr(ipv6address * begin,ipv6address * end,unsigned prefix)234 _ipv6_apply_cidr(ipv6address *begin, ipv6address *end, unsigned prefix)
235 {
236 ipv6address mask;
237
238 /* For bad prefixes, make sure we return an invalid address */
239 if (prefix > 128) {
240 static const ipv6address invalid = {~0ULL, ~0ULL};
241 *begin = invalid;
242 *end = invalid;
243 return;
244 };
245
246 /* Create the mask from the prefix */
247 if (prefix > 64)
248 mask.hi = ~0ULL;
249 else if (prefix == 0)
250 mask.hi = 0;
251 else
252 mask.hi = ~0ULL << (64 - prefix);
253
254 if (prefix > 64)
255 mask.lo = ~0ULL << (128 - prefix);
256 else
257 mask.lo = 0;
258
259 /* Mask off any non-zero bits from the start
260 * TODO print warning */
261 begin->hi &= mask.hi;
262 begin->lo &= mask.lo;
263
264 /* Set all suffix bits to 1, so that 192.168.1.0/24 has
265 * an ending address of 192.168.1.255. */
266 end->hi = begin->hi | ~mask.hi;
267 end->lo = begin->lo | ~mask.lo;
268 }
269
270 /***************************************************************************
271 * Parse the next IPv4/IPv6 address from a text stream, using a
272 * 'state-machine parser'.
273 ***************************************************************************/
274 static enum {Still_Working, Found_Error, Found_IPv4, Found_IPv6}
_parser_next(struct massip_parser * p,const char * buf,size_t * r_offset,size_t length,unsigned * r_begin,unsigned * r_end)275 _parser_next(struct massip_parser *p, const char *buf, size_t *r_offset, size_t length,
276 unsigned *r_begin, unsigned *r_end)
277 {
278 size_t i;
279 enum parser_state_t state = p->state;
280 int result = Still_Working;
281
282 /* The 'offset' parameter is optional. If NULL, then set it to zero */
283 if (r_offset)
284 i = *r_offset;
285 else
286 i = 0;
287
288 /* For all bytes in this chunk. This loop will exit early once
289 * we've found a complete IP address. */
290 while (i < length) {
291 unsigned char c = buf[i++];
292
293 p->char_number++;
294 switch (state) {
295 case LINE_START:
296 case ADDR_START:
297 _init_next_address(p, 0);
298 switch (c) {
299 case ' ': case '\t': case '\r':
300 /* ignore leading whitespace */
301 continue;
302 case '\n':
303 p->line_number++;
304 p->char_number = 0;
305 continue;
306 case '#': case ';': case '/': case '-':
307 state = COMMENT;
308 continue;
309
310 case '0': case '1': case '2': case '3': case '4':
311 case '5': case '6': case '7': case '8': case '9':
312 p->tmp = (c - '0');
313 p->digit_count = 1;
314 state = NUMBER0;
315 break;
316 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
317 p->tmp = (c - 'a' + 10);
318 p->digit_count = 1;
319 state = IPV6_BEGIN;
320 break;
321 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
322 p->tmp = (c - 'A' + 10);
323 p->digit_count = 1;
324 state = IPV6_BEGIN;
325 break;
326 case ':':
327 p->ipv6.tmp[p->ipv6.index++] = 0;
328 state = IPV6_COLON;
329 break;
330 case '[':
331 p->ipv6.is_bracket = 1;
332 state = IPV6_BEGIN;
333 break;
334 default:
335 state = ERROR;
336 length = i; /* break out of loop */
337 break;
338 }
339 break;
340 case IPV6_CIDR:
341 p->digit_count = 0;
342 p->tmp = 0;
343 switch (c) {
344 case '0': case '1': case '2': case '3': case '4':
345 case '5': case '6': case '7': case '8': case '9':
346 p->tmp = (c - '0');
347 p->digit_count = 1;
348 state = IPV6_CIDR_NUM;
349 break;
350 default:
351 state = ERROR;
352 length = i; /* break out of loop */
353 break;
354 }
355 break;
356
357 case IPV6_COLON:
358 p->digit_count = 0;
359 p->tmp = 0;
360 if (c == ':') {
361 if (p->ipv6.ellision_index < 8) {
362 state = ERROR;
363 length = i;
364 } else {
365 p->ipv6.ellision_index = p->ipv6.index;
366 state = IPV6_COLON;
367 }
368 break;
369 }
370 state = IPV6_BEGIN;
371
372 /* drop down */
373 case IPV6_BEGIN:
374 case IPV6_NEXT:
375 switch (c) {
376 case '0': case '1': case '2': case '3': case '4':
377 case '5': case '6': case '7': case '8': case '9':
378 if (p->digit_count >= 4) {
379 state = ERROR;
380 length = i;
381 } else {
382 p->tmp = p->tmp * 16 + (c - '0');
383 p->digit_count++;
384 }
385 break;
386 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
387 if (p->digit_count >= 4) {
388 state = ERROR;
389 length = i;
390 } else {
391 p->tmp = p->tmp * 16 + (c - 'a' + 10);
392 p->digit_count++;
393 }
394 break;
395 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
396 if (p->digit_count >= 4) {
397 state = ERROR;
398 length = i;
399 } else {
400 p->tmp = p->tmp * 16 + (c - 'A' + 10);
401 p->digit_count++;
402 }
403 break;
404 case ':':
405 if (p->ipv6.index >= 8) {
406 state = ERROR;
407 length = i;
408 } else {
409 p->ipv6.tmp[p->ipv6.index++] = (unsigned short)p->tmp;
410 state = IPV6_COLON;
411 }
412 break;
413 case ']':
414 if (!p->ipv6.is_bracket) {
415 state = ERROR;
416 length = i;
417 } else {
418 state = IPV6_END;
419 }
420 break;
421 case '[':
422 if (p->ipv6.is_bracket) {
423 state = ERROR;
424 length = i;
425 } else {
426 p->ipv6.is_bracket = 1;
427 }
428 break;
429 case '/':
430 case ' ':
431 case '\t':
432 case '\r':
433 case '\n':
434 case ',':
435 case '-':
436 i--; /* push back */
437 state = IPV6_END;
438 continue;
439 default:
440 state = ERROR;
441 length = i;
442 break;
443 }
444 break;
445
446 case IPV6_END:
447 /* Finish off the trailing number */
448 p->ipv6.tmp[p->ipv6.index++] = (unsigned short)p->tmp;
449
450 /* Do the final processing of this IPv6 address and
451 * and prepair for the next one */
452 if (_parser_finish_ipv6(p) != 0) {
453 state = ERROR;
454 length = i;
455 continue;
456 }
457
458 /* Now decide the next state, whether this is a single
459 * address, an address range, or a CIDR address */
460 switch (c) {
461 case '/':
462 result = Still_Working;
463 state = IPV6_CIDR;
464 break;
465 case '-':
466 result = Still_Working;
467 state = IPV6_NEXT;
468 break;
469 case '\n':
470 p->line_number++;
471 p->char_number = 0;
472 /* drop down */
473 case ' ':
474 case '\t':
475 case '\r':
476 case ',':
477 result = Found_IPv6;
478 state = 0;
479 length = i; /* shortend the end to break out of loop */
480 break;
481 default:
482 state = ERROR;
483 length = i;
484 break;
485 }
486 break;
487 case COMMENT:
488 if (c == '\n') {
489 state = LINE_START;
490 p->line_number++;
491 p->char_number = 0;
492 } else
493 state = COMMENT;
494 break;
495 case IPV6_CIDR_NUM:
496 switch (c) {
497 case '0': case '1': case '2': case '3': case '4':
498 case '5': case '6': case '7': case '8': case '9':
499 if (p->digit_count == 4) {
500 state = ERROR;
501 length = i; /* break out of loop */
502 } else {
503 p->digit_count++;
504 p->tmp = p->tmp * 10 + (c - '0');
505 if (p->tmp > 128) {
506 state = ERROR;
507 length = i;
508 }
509 continue;
510 }
511 break;
512 case ':':
513 case ',':
514 case ' ':
515 case '\t':
516 case '\r':
517 case '\n':
518 {
519 _ipv6_apply_cidr(&p->ipv6._begin, &p->ipv6._end, p->tmp);
520
521 state = ADDR_START;
522 length = i; /* break out of loop */
523 if (c == '\n') {
524 p->line_number++;
525 p->char_number = 0;
526 }
527 *r_begin = p->begin;
528 *r_end = p->end;
529 result = Found_IPv6;
530 }
531 break;
532 default:
533 state = ERROR;
534 length = i; /* break out of loop */
535 break;
536 }
537 break;
538 case IPV4_CIDR_NUM:
539 switch (c) {
540 case '0': case '1': case '2': case '3': case '4':
541 case '5': case '6': case '7': case '8': case '9':
542 if (p->digit_count == 3) {
543 state = ERROR;
544 length = i; /* break out of loop */
545 } else {
546 p->digit_count++;
547 p->tmp = p->tmp * 10 + (c - '0');
548 if (p->tmp > 32) {
549 state = ERROR;
550 length = i;
551 }
552 continue;
553 }
554 break;
555 case ':':
556 case ',':
557 case ' ':
558 case '\t':
559 case '\r':
560 case '\n':
561 {
562 _ipv4_apply_cidr(&p->begin, &p->end, p->tmp);
563 state = ADDR_START;
564 length = i; /* break out of loop */
565 if (c == '\n') {
566 p->line_number++;
567 p->char_number = 0;
568 }
569 *r_begin = p->begin;
570 *r_end = p->end;
571 result = Found_IPv4;
572 }
573 break;
574 default:
575 state = ERROR;
576 length = i; /* break out of loop */
577 break;
578 }
579 break;
580
581 case UNIDASH1:
582 if (c == 0x80)
583 state = UNIDASH2;
584 else {
585 state = ERROR;
586 length = i; /* break out of loop */
587 }
588 break;
589 case UNIDASH2:
590 /* This covers:
591 * U+2010 HYPHEN
592 * U+2011 NON-BREAKING HYPHEN
593 * U+2012 FIGURE DASH
594 * U+2013 EN DASH
595 * U+2014 EM DASH
596 * U+2015 HORIZONTAL BAR
597 */
598 if (c < 0x90 || 0x95 < c) {
599 state = ERROR;
600 length = i; /* break out of loop */
601 } else {
602 c = '-';
603 state = NUMBER3;
604 /* drop down */
605 }
606
607
608 case NUMBER0:
609 case NUMBER1:
610 case NUMBER2:
611 case NUMBER3:
612 case SECOND0:
613 case SECOND1:
614 case SECOND2:
615 case SECOND3:
616 switch (c) {
617 case '.':
618 p->addr = (p->addr << 8) | p->tmp;
619 p->tmp = 0;
620 p->digit_count = 0;
621 if (state == NUMBER3 || state == SECOND3) {
622 length = i;
623 state = ERROR;
624 } else
625 state++;
626 break;
627 case '0': case '1': case '2': case '3': case '4':
628 case '5': case '6': case '7': case '8': case '9':
629 p->digit_count++;
630 p->tmp = p->tmp * 10 + (c - '0');
631 if (p->tmp > 255 || p->digit_count > 3) {
632 if (state == NUMBER0) {
633 /* Assume that we've actually got an
634 * IPv6 number */
635 _switch_to_ipv6(p, state);
636 state = IPV6_BEGIN;
637 } else {
638 state = ERROR;
639 length = i;
640 }
641 }
642 continue;
643 break;
644 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
645 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
646 if (state == NUMBER0 || state == SECOND0) {
647 /* Assume that we've actually got an
648 * IPv6 number */
649 _switch_to_ipv6(p, state);
650 state = IPV6_BEGIN;
651 i--; /* go back one character */
652 } else {
653 state = ERROR;
654 length = i; /* break out of loop */
655 }
656 break;
657 case 0xe2:
658 if (state == NUMBER3) {
659 state = UNIDASH1;
660 } else {
661 state = ERROR;
662 length = i; /* break out of loop */
663 }
664 break;
665 case '-':
666 case 0x96: /* long dash, comes from copy/pasting into exclude files */
667 if (state == NUMBER3) {
668 p->begin = (p->addr << 8) | p->tmp;
669 p->tmp = 0;
670 p->digit_count = 0;
671 p->addr = 0;
672 state = SECOND0;
673 } else {
674 state = NUMBER_ERR;
675 length = i;
676 }
677 break;
678 case '/':
679 if (state == NUMBER3) {
680 p->begin = (p->addr << 8) | p->tmp;
681 p->tmp = 0;
682 p->digit_count = 0;
683 p->addr = 0;
684 state = IPV4_CIDR_NUM;
685 } else {
686 state = NUMBER_ERR;
687 length = i; /* break out of loop */
688 }
689 break;
690 case ':':
691 if (state == NUMBER0) {
692 /* Assume this is an IPv6 address instead of an IPv4 address */
693 _switch_to_ipv6(p, state);
694 state = IPV6_BEGIN;
695 i--;
696 break;
697 }
698 case ',':
699 case ' ':
700 case '\t':
701 case '\r':
702 case '\n':
703 if (state == NUMBER3) {
704 p->begin = (p->addr << 8) | p->tmp;
705 p->end = p->begin;
706 p->tmp = 0;
707 p->digit_count = 0;
708 p->addr = 0;
709 state = ADDR_START;
710 length = i; /* break out of loop */
711 if (c == '\n') {
712 p->line_number++;
713 p->char_number = 0;
714 }
715 *r_begin = p->begin;
716 *r_end = p->end;
717 result = Found_IPv4;
718 } else if (state == SECOND3) {
719 p->end = (p->addr << 8) | p->tmp;
720 p->tmp = 0;
721 p->digit_count = 0;
722 p->addr = 0;
723 state = ADDR_START;
724 length = i; /* break out of loop */
725 if (c == '\n') {
726 p->line_number++;
727 p->char_number = 0;
728 }
729 *r_begin = p->begin;
730 *r_end = p->end;
731 result = Found_IPv4;
732 } else {
733 state = NUMBER_ERR;
734 length = i;
735 }
736 break;
737 default:
738 state = ERROR;
739 length = i; /* break out of loop */
740 break;
741 }
742 break;
743
744 default:
745 case ERROR:
746 case NUMBER_ERR:
747 case SECOND_ERR:
748 state = ERROR;
749 length = i; /* break */
750 break;
751 }
752 }
753
754 /* The 'offset' parameter is optional. If NULL, then
755 * we don't return a value */
756 if (r_offset)
757 *r_offset = i;
758
759 p->state = state;
760 if (state == ERROR || state == NUMBER_ERR || state == SECOND_ERR)
761 result = Found_Error;
762 return result;
763 }
764
765
766 /***************************************************************************
767 * Test errors. We should get exactly which line-number and which character
768 * in the line caused the error
769 ***************************************************************************/
770 static int
rangefile_test_error(const char * buf,unsigned long long in_line_number,unsigned long long in_char_number,unsigned which_test)771 rangefile_test_error(const char *buf, unsigned long long in_line_number, unsigned long long in_char_number, unsigned which_test)
772 {
773 size_t length = strlen(buf);
774 size_t offset = 0;
775 struct massip_parser p[1];
776 unsigned out_begin = 0xa3a3a3a3;
777 unsigned out_end = 0xa3a3a3a3;
778 unsigned long long out_line_number;
779 unsigned long long out_char_number;
780 int x;
781
782 /* test the entire buffer */
783 _parser_init(p);
784 x = _parser_next(p, buf, &offset, length, &out_begin, &out_end);
785 if (x != Found_Error)
786 goto fail;
787 _parser_err(p, &out_line_number, &out_char_number);
788 if (in_line_number != out_line_number || in_char_number != out_char_number)
789 goto fail;
790
791 /* test one byte at a time */
792 _parser_destroy(p);
793 _parser_init(p);
794 offset = 0;
795 out_begin = 0xa3a3a3a3;
796 out_end = 0xa3a3a3a3;
797
798 x = 0;
799 while (offset < length) {
800 x = _parser_next(p, buf, &offset, offset+1, &out_begin, &out_end);
801 if (x == Found_Error)
802 break;
803 }
804 if (x != Found_Error)
805 goto fail;
806 _parser_err(p, &out_line_number, &out_char_number);
807
808 if (in_line_number != out_line_number || in_char_number != out_char_number)
809 goto fail;
810
811 _parser_destroy(p);
812 return 0;
813 fail:
814 _parser_destroy(p);
815 fprintf(stderr, "[-] rangefile test fail, line=%u\n", which_test);
816 return 1;
817 }
818
819 /***************************************************************************
820 ***************************************************************************/
821 int
massip_parse_file(struct MassIP * massip,const char * filename)822 massip_parse_file(struct MassIP *massip, const char *filename)
823 {
824 struct RangeList *targets_ipv4 = &massip->ipv4;
825 struct Range6List *targets_ipv6 = &massip->ipv6;
826 struct massip_parser p[1];
827 char buf[65536];
828 FILE *fp = NULL;
829 int err;
830 bool is_error = false;
831 unsigned addr_count = 0;
832 unsigned long long line_number, char_number;
833
834 /*
835 * Open the file containing IP addresses, which can potentially be
836 * many megabytes in size
837 */
838 err = fopen_s(&fp, filename, "rb");
839 if (err || fp == NULL) {
840 perror(filename);
841 exit(1);
842 }
843
844 /*
845 * Create a parser for reading in the IP addresses using a state
846 * machine parser
847 */
848 _parser_init(p);
849
850 /*
851 * Read in the data a block at a time, parsing according to the state
852 * machine.
853 */
854 while (!is_error) {
855 size_t count;
856 size_t offset;
857
858 count = fread(buf, 1, sizeof(buf), fp);
859 if (count <= 0)
860 break;
861
862 offset = 0;
863 while (offset < count) {
864 unsigned begin, end;
865
866 err = _parser_next(p, buf, &offset, count, &begin, &end);
867 switch (err) {
868 case Still_Working:
869 if (offset < count) {
870 /* We reached this somehow in the middle of the buffer, but
871 * this return is only possible at the end of the buffer */
872 fprintf(stderr, "[-] rangeparse_next(): unknown coding failure\n");
873 }
874 break;
875 case Found_Error:
876 default:
877 _parser_err(p, &line_number, &char_number);
878 fprintf(stderr, "[-] %s:%llu:%llu: invalid IP address on line #%llu\n", filename, line_number, char_number, line_number);
879 is_error = true;
880 count = offset;
881 break;
882 case Found_IPv4:
883 rangelist_add_range(targets_ipv4, begin, end);
884 addr_count++;
885 break;
886 case Found_IPv6:
887 {
888 ipv6address found_begin, found_end;
889 _parser_get_ipv6(p, &found_begin, &found_end);
890 range6list_add_range(targets_ipv6, found_begin, found_end);
891 addr_count++;
892 }
893 break;
894 }
895 }
896 }
897 fclose(fp);
898
899 /* In case the file doesn't end with a newline '\n', then artificially
900 * add one to the end. This is just a repeat of the code above */
901 if (!is_error) {
902 size_t offset = 0;
903 unsigned begin, end;
904 err = _parser_next(p, "\n", &offset, 1, &begin, &end);
905 switch (err) {
906 case Still_Working:
907 break;
908 case Found_Error:
909 default:
910 _parser_err(p, &line_number, &char_number);
911 fprintf(stderr, "[-] %s:%llu:%llu: invalid IP address on line #%llu\n", filename, line_number, char_number, line_number);
912 is_error = true;
913 break;
914 case Found_IPv4:
915 rangelist_add_range(targets_ipv4, begin, end);
916 addr_count++;
917 break;
918 case Found_IPv6:
919 {
920 ipv6address found_begin, found_end;
921 _parser_get_ipv6(p, &found_begin, &found_end);
922 range6list_add_range(targets_ipv6, found_begin, found_end);
923 addr_count++;
924 }
925 break;
926 }
927 }
928
929 LOG(1, "[+] %s: %u addresses read\n", filename, addr_count);
930
931 /* Target list must be sorted every time it's been changed,
932 * before it can be used */
933 rangelist_sort(targets_ipv4);
934
935 if (is_error)
936 return -1; /* fail */
937 else
938 return 0; /* success*/
939 }
940
941
942 ipv6address
massip_parse_ipv6(const char * line)943 massip_parse_ipv6(const char *line)
944 {
945 struct massip_parser p[1];
946 size_t count = strlen(line);
947 size_t offset = 0;
948 int err;
949 unsigned begin, end;
950 ipv6address result;
951 ipv6address range;
952
953 _parser_init(p);
954 err = _parser_next(p, line, &offset, count, &begin, &end);
955 again:
956 switch (err) {
957 case Still_Working:
958 if (offset < count) {
959 /* We reached this somehow in the middle of the buffer, but
960 * this return is only possible at the end of the buffer */
961 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
962 goto fail;
963 } else {
964 err = _parser_next(p, "\n", 0, 1, &begin, &end);
965 if (err == Still_Working) {
966 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
967 goto fail;
968 } else {
969 goto again;
970 }
971 }
972 break;
973 case Found_Error:
974 default:
975 goto fail;
976 case Found_IPv4:
977 goto fail;
978 case Found_IPv6:
979 _parser_get_ipv6(p, &result, &range);
980 if (!ipv6address_is_equal(result, range))
981 goto fail;
982 return result;
983 }
984 fail:
985 result.hi = ~0ULL;
986 result.lo = ~0ULL;
987 return result;
988 }
989
990 unsigned
massip_parse_ipv4(const char * line)991 massip_parse_ipv4(const char *line)
992 {
993 struct massip_parser p[1];
994 size_t count = strlen(line);
995 size_t offset = 0;
996 int err;
997 unsigned begin, end;
998
999
1000 _parser_init(p);
1001 err = _parser_next(p, line, &offset, count, &begin, &end);
1002 again:
1003 switch (err) {
1004 case Still_Working:
1005 if (offset < count) {
1006 /* We reached this somehow in the middle of the buffer, but
1007 * this return is only possible at the end of the buffer */
1008 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1009 goto fail;
1010 } else {
1011 err = _parser_next(p, "\n", 0, 1, &begin, &end);
1012 if (err == Still_Working) {
1013 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1014 goto fail;
1015 } else {
1016 goto again;
1017 }
1018 }
1019 break;
1020 case Found_Error:
1021 default:
1022 goto fail;
1023 case Found_IPv6:
1024 goto fail;
1025 case Found_IPv4:
1026 if (begin != end)
1027 goto fail;
1028 return begin;
1029 }
1030 fail:
1031 return 0xFFFFFFFF;
1032 }
1033
1034 enum RangeParseResult
massip_parse_range(const char * line,size_t * offset,size_t count,struct Range * ipv4,struct Range6 * ipv6)1035 massip_parse_range(const char *line, size_t *offset, size_t count, struct Range *ipv4, struct Range6 *ipv6)
1036 {
1037 struct massip_parser p[1];
1038 int err;
1039 unsigned begin, end;
1040 size_t tmp_offset = 0;
1041
1042 /* The 'count' (length of the string) is an optional parameter. If
1043 * zero, and also the offset is NULL, then set it to the string length */
1044 if (count == 0 && offset == NULL)
1045 count = strlen(line);
1046
1047 /* The offset is an optional parameter. If NULL, then we set
1048 * it to point to a value on the stack instead */
1049 if (offset == NULL)
1050 offset = &tmp_offset;
1051
1052 /* Creat e parser object */
1053 _parser_init(p);
1054
1055 /* Parse the next range from the input */
1056 err = _parser_next(p, line, offset, count, &begin, &end);
1057 again:
1058 switch (err) {
1059 case Still_Working:
1060 if (*offset < count) {
1061 /* We reached this somehow in the middle of the buffer, but
1062 * this return is only possible at the end of the buffer */
1063 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1064 return Bad_Address;
1065 } else {
1066 err = _parser_next(p, "\n", 0, 1, &begin, &end);
1067 if (err == Still_Working) {
1068 fprintf(stderr, "[-] _parser_next(): unknown coding failure\n");
1069 return Bad_Address;
1070 } else {
1071 goto again;
1072 }
1073 }
1074 break;
1075 case Found_Error:
1076 default:
1077 return Bad_Address;
1078 case Found_IPv4:
1079 ipv4->begin = begin;
1080 ipv4->end = end;
1081 return Ipv4_Address;
1082 case Found_IPv6:
1083 _parser_get_ipv6(p, &ipv6->begin, &ipv6->end);
1084 return Ipv6_Address;
1085 }
1086 }
1087
1088 /**
1089 * This tests parsing when addresses/ranges are specified on the command-line
1090 * or configuration files, rather than the other test-cases which test parsing
1091 * when the IP addresses are specified in a file. The thing we are looking for
1092 * here is specifically when users separate addresses with things like
1093 * commas and spaces.
1094 */
1095 static int
selftest_massip_parse_range(void)1096 selftest_massip_parse_range(void)
1097 {
1098 struct testcases {
1099 const char *line;
1100 union {
1101 struct Range ipv4;
1102 struct Range6 ipv6;
1103 } list[4];
1104 } cases[] = {
1105 {"0.0.1.0/24,0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
1106 {"0.0.1.0-0.0.1.255,0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
1107 {"0.0.1.0/24 0.0.3.0-0.0.4.0", {{{0x100,0x1ff}}, {{0x300,0x400}}}},
1108 {0}
1109 };
1110 size_t i;
1111
1112 for (i=0; cases[i].line; i++) {
1113 size_t length = strlen(cases[i].line);
1114 size_t offset = 0;
1115 size_t j = 0;
1116 struct Range6 range6;
1117 struct Range range4;
1118
1119 while (offset < length) {
1120 int x;
1121 x = massip_parse_range(cases[i].line, &offset, length, &range4, &range6);
1122 switch (x) {
1123 default:
1124 case Bad_Address:
1125 fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
1126 return 1;
1127 case Ipv4_Address:
1128 if (cases[i].list[j].ipv4.begin != range4.begin
1129 || cases[i].list[j].ipv4.end != range4.end) {
1130 fprintf(stdout, "[-] %u.%u.%u.%u - %u.%u.%u.%u\n",
1131 (unsigned char)(range4.begin>>24),
1132 (unsigned char)(range4.begin>>16),
1133 (unsigned char)(range4.begin>> 8),
1134 (unsigned char)(range4.begin>> 0),
1135 (unsigned char)(range4.end>>24),
1136 (unsigned char)(range4.end>>16),
1137 (unsigned char)(range4.end>> 8),
1138 (unsigned char)(range4.end>> 0)
1139 );
1140 fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
1141 return 1;
1142 }
1143 break;
1144 }
1145 j++;
1146 }
1147
1148 /* Make sure we have found all the expected cases */
1149 if (cases[i].list[j].ipv4.begin != 0) {
1150 fprintf(stdout, "[-] selftest_massip_parse_range[%u] fail\n", (unsigned)i);
1151 return 1;
1152 }
1153 }
1154 return 0;
1155 }
1156
1157
1158 /***************************************************************************
1159 ***************************************************************************/
1160 static int
rangefile6_test_buffer(struct massip_parser * parser,const char * buf,ipv6address expected_begin,ipv6address expected_end)1161 rangefile6_test_buffer(struct massip_parser *parser,
1162 const char *buf,
1163 ipv6address expected_begin,
1164 ipv6address expected_end)
1165 {
1166 size_t length = strlen(buf);
1167 size_t offset = 0;
1168 ipv6address found_begin = {1,2};
1169 ipv6address found_end = {1,2};
1170 unsigned tmp1, tmp2;
1171 int err;
1172
1173 /* test the entire buffer */
1174 err = _parser_next(parser, buf, &offset, length, &tmp1, &tmp2);
1175 if (err == Still_Working)
1176 err = _parser_next(parser, "\n", 0, 1, &tmp1, &tmp2);
1177 switch (err) {
1178 case Found_IPv6:
1179 /* Extract the resulting IPv6 address from the state structure */
1180 _parser_get_ipv6(parser, &found_begin, &found_end);
1181
1182 /* Test to see if the parsed address equals the expected address */
1183 if (!ipv6address_is_equal(found_begin, expected_begin)) {
1184 ipaddress_formatted_t fmt1 = ipv6address_fmt(found_begin);
1185 ipaddress_formatted_t fmt2 = ipv6address_fmt(expected_begin);
1186 fprintf(stderr, "[-] begin mismatch: found=[%s], expected=[%s]\n", fmt1.string, fmt2.string);
1187 goto fail;
1188 }
1189 if (!ipv6address_is_equal(found_end, expected_end)) {
1190 ipaddress_formatted_t fmt1 = ipv6address_fmt(found_end);
1191 ipaddress_formatted_t fmt2 = ipv6address_fmt(expected_end);
1192 fprintf(stderr, "[-] end mismatch: found=[%s], expected=[%s]\n", fmt1.string, fmt2.string);
1193 goto fail;
1194 }
1195 break;
1196 case Found_IPv4:
1197 if (expected_begin.hi != 0 || expected_end.hi != 0)
1198 goto fail;
1199 if (tmp1 != expected_begin.lo || tmp2 != expected_end.lo)
1200 goto fail;
1201 break;
1202 case Still_Working:
1203 /* Found a partial address, which is a normal result in the
1204 * real world at buffer boundaries, but which is an error
1205 * here */
1206 goto fail;
1207 case Found_Error:
1208 default:
1209 goto fail;
1210 }
1211
1212 return 0; /* success */
1213 fail:
1214 return 1; /* failure */
1215 }
1216
1217 /***************************************************************************
1218 * List of test cases. Each test case contains three parts:
1219 * - the string representation of an address, as read from a file, meaning
1220 * that it can contain additional things like comment strings
1221 * - the first address of a range, which in the case of IPv6 addresses
1222 * will be two 64-bit numbers, but an IPv4 address have a high-order
1223 * number set to zero and the low-order number set to the IPv4 address
1224 * - the second address of a range, which in the case of individual
1225 * addresses, will be equal to the first number
1226 ***************************************************************************/
1227 struct {
1228 const char *string;
1229 ipv6address begin;
1230 ipv6address end;
1231 } test_cases[] = {
1232 {"[1::1]/126", {0x0001000000000000ULL, 0ULL}, {0x0001000000000000ULL, 3ULL}},
1233 {"1::1/126", {0x0001000000000000ULL, 0ULL}, {0x0001000000000000ULL, 3ULL}},
1234 {"[1::1]-[2::3]", {0x0001000000000000ULL, 1ULL}, {0x0002000000000000ULL, 3ULL}},
1235 {"1::1-2::3", {0x0001000000000000ULL, 1ULL}, {0x0002000000000000ULL, 3ULL}},
1236 {"[1234:5678:9abc:def0:0fed:cba9:8765:4321]", {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}, {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}},
1237 {"22ab::1", {0x22ab000000000000ULL, 1ULL}, {0x22ab000000000000ULL, 1ULL}},
1238 {"240e:33c:2:c080:d08:d0e:b53:e74e", {0x240e033c0002c080ULL, 0x0d080d0e0b53e74eULL}, {0x240e033c0002c080ULL, 0x0d080d0e0b53e74eULL}},
1239 {"2a03:90c0:105::9", {0x2a0390c001050000ULL, 9ULL}, {0x2a0390c001050000ULL, 9ULL}},
1240 {"2a03:9060:0:400::2", {0x2a03906000000400ULL, 2ULL}, {0x2a03906000000400ULL, 2ULL}},
1241 {"2c0f:ff00:0:a:face:b00c:0:a7", {0x2c0fff000000000aULL, 0xfaceb00c000000a7ULL}, {0x2c0fff000000000aULL, 0xfaceb00c000000a7ULL}},
1242 {"2a01:5b40:0:4a01:0:e21d:789f:59b1", {0x2a015b4000004a01ULL, 0x0000e21d789f59b1ULL}, {0x2a015b4000004a01ULL, 0x0000e21d789f59b1ULL}},
1243 {"2001:1200:10::1", {0x2001120000100000ULL, 1ULL}, {0x2001120000100000ULL, 1ULL}},
1244 {"fec0:0:0:ffff::1", {0xfec000000000ffffULL, 1ULL}, {0xfec000000000ffffULL, 1ULL}},
1245 {"1234:5678:9abc:def0:0fed:cba9:8765:4321", {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}, {0x123456789abcdef0ULL, 0x0fedcba987654321ULL}},
1246 {"[1111:2222:3333:4444:5555:6666:7777:8888]", {0x1111222233334444ULL, 0x5555666677778888ULL}, {0x1111222233334444ULL, 0x5555666677778888ULL}},
1247 {"1::1", {0x0001000000000000ULL, 1ULL}, {0x0001000000000000ULL, 1ULL}},
1248 {"1.2.3.4", {0, 0x01020304}, {0, 0x01020304}},
1249 {"#test\n 97.86.162.161" "\x96" "97.86.162.175\n", {0, 0x6156a2a1}, {0, 0x6156a2af}},
1250 {"1.2.3.4/24\n", {0, 0x01020300}, {0, 0x010203ff}},
1251 {" 1.2.3.4-1.2.3.5\n", {0, 0x01020304}, {0, 0x01020305}},
1252 {0,{0,0},{0,0}}
1253 };
1254
1255 /***************************************************************************
1256 * Called during "make test" to run a regression test over this module.
1257 ***************************************************************************/
1258 int
massip_parse_selftest(void)1259 massip_parse_selftest(void)
1260 {
1261 int x = 0;
1262 size_t i;
1263 struct massip_parser parser[1];
1264
1265
1266 /* Run through the test cases, stopping at the first failure */
1267 _parser_init(parser);
1268 for (i=0; test_cases[i].string; i++) {
1269 x += rangefile6_test_buffer(parser,
1270 test_cases[i].string,
1271 test_cases[i].begin,
1272 test_cases[i].end);
1273 if (x) {
1274 fprintf(stderr, "[-] failed: %u: %s\n", (unsigned)i, test_cases[i].string);
1275 break;
1276 }
1277 }
1278 _parser_destroy(parser);
1279
1280
1281 /* First, do the single line test */
1282 x += selftest_massip_parse_range();
1283 if (x)
1284 return x;
1285
1286
1287 x += rangefile_test_error("#bad ipv4\n 257.1.1.1\n", 2, 5, __LINE__);
1288 x += rangefile_test_error("#bad ipv4\n 1.257.1.1.1\n", 2, 6, __LINE__);
1289 x += rangefile_test_error("#bad ipv4\n 1.10.257.1.1.1\n", 2, 9, __LINE__);
1290 x += rangefile_test_error("#bad ipv4\n 1.10.255.256.1.1.1\n", 2, 13, __LINE__);
1291 x += rangefile_test_error("#bad ipv4\n 1.1.1.1.1\n", 2, 9, __LINE__);
1292
1293 if (x)
1294 LOG(0, "[-] rangefile_selftest: fail\n");
1295 return x;
1296 }
1297
1298