1/**
2 * Copyright (c) 2015, Timothy Stack
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 * * Neither the name of Timothy Stack nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include "config.h"
31
32#include <arpa/inet.h>
33#include <netinet/in.h>
34#include <sys/socket.h>
35
36#include "data_scanner.hh"
37
38bool data_scanner::tokenize2(pcre_context &pc, data_token_t &token_out)
39{
40#   define YYCTYPE unsigned char
41#   define CAPTURE(tok) { \
42        if (YYCURSOR.val == EMPTY) { \
43            pi.pi_next_offset = pi.pi_length; \
44        } else { \
45            pi.pi_next_offset = YYCURSOR.val - (const unsigned char *) pi.get_string(); \
46        } \
47        cap[0].c_end = pi.pi_next_offset; \
48        cap[1].c_end = pi.pi_next_offset; \
49        token_out = tok; \
50    }
51#   define RET(tok) { \
52        CAPTURE(tok); \
53        return true; \
54    }
55    static const unsigned char *EMPTY = (const unsigned char *) "";
56    pcre_input &pi = this->ds_pcre_input;
57    struct _YYCURSOR {
58        YYCTYPE operator*() const {
59            if (this->val < this->lim) {
60                return *val;
61            }
62            return '\0';
63        }
64
65        operator const YYCTYPE *() const {
66            if (this->val < this->lim) {
67                return this->val;
68            }
69            return EMPTY;
70        }
71
72        const YYCTYPE *operator=(const YYCTYPE *rhs) {
73            this->val = rhs;
74            return rhs;
75        }
76
77        const YYCTYPE *operator+(int rhs) {
78            return this->val + rhs;
79        }
80
81        const _YYCURSOR *operator-=(int rhs) {
82            this->val -= rhs;
83            return this;
84        }
85
86        _YYCURSOR& operator++() {
87            this->val += 1;
88            return *this;
89        }
90
91        const YYCTYPE *val{nullptr};
92        const YYCTYPE *lim{nullptr};
93    } YYCURSOR;
94    YYCURSOR = (const unsigned char *) pi.get_string() + pi.pi_next_offset;
95    _YYCURSOR yyt1;
96    _YYCURSOR yyt2;
97    _YYCURSOR yyt3;
98    _YYCURSOR yyt4;
99    const YYCTYPE *YYLIMIT = (const unsigned char *) pi.get_string() + pi.pi_length;
100    const YYCTYPE *YYMARKER = YYCURSOR;
101    pcre_context::capture_t *cap = pc.all();
102
103    YYCURSOR.lim = YYLIMIT;
104
105    pc.set_count(2);
106    cap[0].c_begin = pi.pi_next_offset;
107    cap[0].c_end = pi.pi_next_offset;
108    cap[1].c_begin = pi.pi_next_offset;
109    cap[1].c_end = pi.pi_next_offset;
110
111    /*!re2c
112       re2c:yyfill:enable = 0;
113       re2c:flags:tags = 1;
114
115       SPACE = [ \t\r];
116       ALPHA = [a-zA-Z];
117       NUM = [0-9];
118       ALPHANUM = [a-zA-Z0-9_];
119       EOF = "\x00";
120       IPV4SEG  = ("25"[0-5]|("2"[0-4]|"1"{0,1}[0-9]){0,1}[0-9]);
121       IPV4ADDR = (IPV4SEG"."){3,3}IPV4SEG;
122       IPV6SEG  = [0-9a-fA-F]{1,4};
123       IPV6ADDR = (
124                  (IPV6SEG":"){7,7}IPV6SEG|
125                  (IPV6SEG":"){1,7}":"|
126                  (IPV6SEG":"){1,6}":"IPV6SEG|
127                  (IPV6SEG":"){1,5}(":"IPV6SEG){1,2}|
128                  (IPV6SEG":"){1,4}(":"IPV6SEG){1,3}|
129                  (IPV6SEG":"){1,3}(":"IPV6SEG){1,4}|
130                  (IPV6SEG":"){1,2}(":"IPV6SEG){1,5}|
131                  IPV6SEG":"((":"IPV6SEG){1,6})|
132                  ":"((":"IPV6SEG){1,7}|":")|
133                  [a-fA-F0-9]{4}":"(":"IPV6SEG){0,4}"%"[0-9a-zA-Z]{1,}|
134                  "::"('ffff'(":0"{1,4}){0,1}":"){0,1}IPV4ADDR|
135                  (IPV6SEG":"){1,4}":"IPV4ADDR
136                  );
137
138       EOF { return false; }
139
140       ("u"|"r")?'"'('\\'.|[^\x00\"\\]|'""')*'"' {
141           CAPTURE(DT_QUOTED_STRING);
142           switch (pi.get_string()[cap[1].c_begin]) {
143           case 'u':
144           case 'r':
145               cap[1].c_begin += 1;
146               break;
147           }
148           cap[1].c_begin += 1;
149           cap[1].c_end -= 1;
150           return true;
151       }
152       [a-qstv-zA-QSTV-Z]"'" {
153           CAPTURE(DT_WORD);
154       }
155       ("u"|"r")?"'"('\\'.|"''"|[^\x00\'\\])*"'"/[^sS] {
156           CAPTURE(DT_QUOTED_STRING);
157           switch (pi.get_string()[cap[1].c_begin]) {
158           case 'u':
159           case 'r':
160               cap[1].c_begin += 1;
161               break;
162           }
163           cap[1].c_begin += 1;
164           cap[1].c_end -= 1;
165           return true;
166       }
167       [a-zA-Z0-9]+"://"[^\x00\r\n\t '"\[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); }
168       ("/"|"./"|"../")[a-zA-Z0-9_\.\-\~/!@#$%^&*()]* { RET(DT_PATH); }
169       (SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); }
170       (SPACE|NUM)NUM?":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); }
171       [0-9a-fA-F][0-9a-fA-F](":"[0-9a-fA-F][0-9a-fA-F])+ {
172           if ((YYCURSOR - (const unsigned char *) pi.get_string()) == 17) {
173               RET(DT_MAC_ADDRESS);
174           } else {
175               RET(DT_HEX_DUMP);
176           }
177       }
178       (NUM{4}"/"NUM{1,2}"/"NUM{1,2}|NUM{4}"-"NUM{1,2}"-"NUM{1,2}|NUM{2}"/"ALPHA{3}"/"NUM{4})"T"? {
179           RET(DT_DATE);
180       }
181       IPV6ADDR/[^:a-zA-Z0-9] { RET(DT_IPV6_ADDRESS); }
182
183       "<""?"?[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*'='SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+)))*SPACE*("/"|"?")">" {
184           RET(DT_XML_EMPTY_TAG);
185       }
186
187       "<"[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*"="SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+)))*SPACE*">" {
188           RET(DT_XML_OPEN_TAG);
189       }
190
191       "</"[a-zA-Z0-9:\-]+SPACE*">" {
192           RET(DT_XML_CLOSE_TAG);
193       }
194
195       ":" { RET(DT_COLON); }
196       "=" { RET(DT_EQUALS); }
197       "," { RET(DT_COMMA); }
198       ";" { RET(DT_SEMI); }
199       "()" | "{}" | "[]" { RET(DT_EMPTY_CONTAINER); }
200       "{" { RET(DT_LCURLY); }
201       "}" { RET(DT_RCURLY); }
202       "[" { RET(DT_LSQUARE); }
203       "]" { RET(DT_RSQUARE); }
204       "(" { RET(DT_LPAREN); }
205       ")" { RET(DT_RPAREN); }
206       "<" { RET(DT_LANGLE); }
207       ">" { RET(DT_RANGLE); }
208
209       IPV4ADDR/[^0-9] {
210           RET(DT_IPV4_ADDRESS);
211       }
212
213       [0-9a-fA-F]{8}("-"[0-9a-fA-F]{4}){3}"-"[0-9a-fA-F]{12} { RET(DT_UUID); }
214
215       [0-9]"."[0-9]+'e'[\-\+][0-9]+ { RET(DT_NUMBER); }
216
217       [0-9]+("."[0-9]+[a-zA-Z0-9_]*){2,}("-"[a-zA-Z0-9_]+)?|[0-9]+("."[0-9]+[a-zA-Z0-9_]*)+"-"[a-zA-Z0-9_]+ {
218           RET(DT_VERSION_NUMBER);
219       }
220
221       "-"?"0"[0-7]+ { RET(DT_OCTAL_NUMBER); }
222       "-"?[0-9]+("."[0-9]+)?[ ]*"%" { RET(DT_PERCENTAGE); }
223       "-"?[0-9]+("."[0-9]+)?([eE][\-+][0-9]+)? { RET(DT_NUMBER); }
224       "-"?("0x"|[0-9])[0-9a-fA-F]+ { RET(DT_HEX_NUMBER); }
225
226       [a-zA-Z0-9\._%+-]+"@"[a-zA-Z0-9\.-]+"."[a-zA-Z]+ { RET(DT_EMAIL); }
227
228       "true"|"True"|"TRUE"|"false"|"False"|"FALSE"|"None"|"null"|"NULL"/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_CONSTANT); }
229
230       ("re-")?[a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); }
231
232       [^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+("::"[^\x00"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* {
233           RET(DT_SYMBOL);
234       }
235
236       ("\r"?"\n"|"\\n") { RET(DT_LINE); }
237       SPACE+ { RET(DT_WHITE); }
238       "." { RET(DT_DOT); }
239       . { RET(DT_GARBAGE); }
240
241     */
242}
243