1/** 2 * Copyright (c) 2015, Timothy Stack 3 * 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * * Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * * Neither the name of Timothy Stack nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY 19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY 22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30#include "config.h" 31 32#include <arpa/inet.h> 33#include <netinet/in.h> 34#include <sys/socket.h> 35 36#include "data_scanner.hh" 37 38bool data_scanner::tokenize2(pcre_context &pc, data_token_t &token_out) 39{ 40# define YYCTYPE unsigned char 41# define CAPTURE(tok) { \ 42 if (YYCURSOR.val == EMPTY) { \ 43 pi.pi_next_offset = pi.pi_length; \ 44 } else { \ 45 pi.pi_next_offset = YYCURSOR.val - (const unsigned char *) pi.get_string(); \ 46 } \ 47 cap[0].c_end = pi.pi_next_offset; \ 48 cap[1].c_end = pi.pi_next_offset; \ 49 token_out = tok; \ 50 } 51# define RET(tok) { \ 52 CAPTURE(tok); \ 53 return true; \ 54 } 55 static const unsigned char *EMPTY = (const unsigned char *) ""; 56 pcre_input &pi = this->ds_pcre_input; 57 struct _YYCURSOR { 58 YYCTYPE operator*() const { 59 if (this->val < this->lim) { 60 return *val; 61 } 62 return '\0'; 63 } 64 65 operator const YYCTYPE *() const { 66 if (this->val < this->lim) { 67 return this->val; 68 } 69 return EMPTY; 70 } 71 72 const YYCTYPE *operator=(const YYCTYPE *rhs) { 73 this->val = rhs; 74 return rhs; 75 } 76 77 const YYCTYPE *operator+(int rhs) { 78 return this->val + rhs; 79 } 80 81 const _YYCURSOR *operator-=(int rhs) { 82 this->val -= rhs; 83 return this; 84 } 85 86 _YYCURSOR& operator++() { 87 this->val += 1; 88 return *this; 89 } 90 91 const YYCTYPE *val{nullptr}; 92 const YYCTYPE *lim{nullptr}; 93 } YYCURSOR; 94 YYCURSOR = (const unsigned char *) pi.get_string() + pi.pi_next_offset; 95 _YYCURSOR yyt1; 96 _YYCURSOR yyt2; 97 _YYCURSOR yyt3; 98 _YYCURSOR yyt4; 99 const YYCTYPE *YYLIMIT = (const unsigned char *) pi.get_string() + pi.pi_length; 100 const YYCTYPE *YYMARKER = YYCURSOR; 101 pcre_context::capture_t *cap = pc.all(); 102 103 YYCURSOR.lim = YYLIMIT; 104 105 pc.set_count(2); 106 cap[0].c_begin = pi.pi_next_offset; 107 cap[0].c_end = pi.pi_next_offset; 108 cap[1].c_begin = pi.pi_next_offset; 109 cap[1].c_end = pi.pi_next_offset; 110 111 /*!re2c 112 re2c:yyfill:enable = 0; 113 re2c:flags:tags = 1; 114 115 SPACE = [ \t\r]; 116 ALPHA = [a-zA-Z]; 117 NUM = [0-9]; 118 ALPHANUM = [a-zA-Z0-9_]; 119 EOF = "\x00"; 120 IPV4SEG = ("25"[0-5]|("2"[0-4]|"1"{0,1}[0-9]){0,1}[0-9]); 121 IPV4ADDR = (IPV4SEG"."){3,3}IPV4SEG; 122 IPV6SEG = [0-9a-fA-F]{1,4}; 123 IPV6ADDR = ( 124 (IPV6SEG":"){7,7}IPV6SEG| 125 (IPV6SEG":"){1,7}":"| 126 (IPV6SEG":"){1,6}":"IPV6SEG| 127 (IPV6SEG":"){1,5}(":"IPV6SEG){1,2}| 128 (IPV6SEG":"){1,4}(":"IPV6SEG){1,3}| 129 (IPV6SEG":"){1,3}(":"IPV6SEG){1,4}| 130 (IPV6SEG":"){1,2}(":"IPV6SEG){1,5}| 131 IPV6SEG":"((":"IPV6SEG){1,6})| 132 ":"((":"IPV6SEG){1,7}|":")| 133 [a-fA-F0-9]{4}":"(":"IPV6SEG){0,4}"%"[0-9a-zA-Z]{1,}| 134 "::"('ffff'(":0"{1,4}){0,1}":"){0,1}IPV4ADDR| 135 (IPV6SEG":"){1,4}":"IPV4ADDR 136 ); 137 138 EOF { return false; } 139 140 ("u"|"r")?'"'('\\'.|[^\x00\"\\]|'""')*'"' { 141 CAPTURE(DT_QUOTED_STRING); 142 switch (pi.get_string()[cap[1].c_begin]) { 143 case 'u': 144 case 'r': 145 cap[1].c_begin += 1; 146 break; 147 } 148 cap[1].c_begin += 1; 149 cap[1].c_end -= 1; 150 return true; 151 } 152 [a-qstv-zA-QSTV-Z]"'" { 153 CAPTURE(DT_WORD); 154 } 155 ("u"|"r")?"'"('\\'.|"''"|[^\x00\'\\])*"'"/[^sS] { 156 CAPTURE(DT_QUOTED_STRING); 157 switch (pi.get_string()[cap[1].c_begin]) { 158 case 'u': 159 case 'r': 160 cap[1].c_begin += 1; 161 break; 162 } 163 cap[1].c_begin += 1; 164 cap[1].c_end -= 1; 165 return true; 166 } 167 [a-zA-Z0-9]+"://"[^\x00\r\n\t '"\[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); } 168 ("/"|"./"|"../")[a-zA-Z0-9_\.\-\~/!@#$%^&*()]* { RET(DT_PATH); } 169 (SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); } 170 (SPACE|NUM)NUM?":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); } 171 [0-9a-fA-F][0-9a-fA-F](":"[0-9a-fA-F][0-9a-fA-F])+ { 172 if ((YYCURSOR - (const unsigned char *) pi.get_string()) == 17) { 173 RET(DT_MAC_ADDRESS); 174 } else { 175 RET(DT_HEX_DUMP); 176 } 177 } 178 (NUM{4}"/"NUM{1,2}"/"NUM{1,2}|NUM{4}"-"NUM{1,2}"-"NUM{1,2}|NUM{2}"/"ALPHA{3}"/"NUM{4})"T"? { 179 RET(DT_DATE); 180 } 181 IPV6ADDR/[^:a-zA-Z0-9] { RET(DT_IPV6_ADDRESS); } 182 183 "<""?"?[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*'='SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+)))*SPACE*("/"|"?")">" { 184 RET(DT_XML_EMPTY_TAG); 185 } 186 187 "<"[a-zA-Z0-9_:\-]+SPACE*([a-zA-Z0-9_:\-]+(SPACE*"="SPACE*('"'(('\\'.|[^\x00"\\])+)'"'|"'"(('\\'.|[^\x00'\\])+)"'"|[^\x00>]+)))*SPACE*">" { 188 RET(DT_XML_OPEN_TAG); 189 } 190 191 "</"[a-zA-Z0-9:\-]+SPACE*">" { 192 RET(DT_XML_CLOSE_TAG); 193 } 194 195 ":" { RET(DT_COLON); } 196 "=" { RET(DT_EQUALS); } 197 "," { RET(DT_COMMA); } 198 ";" { RET(DT_SEMI); } 199 "()" | "{}" | "[]" { RET(DT_EMPTY_CONTAINER); } 200 "{" { RET(DT_LCURLY); } 201 "}" { RET(DT_RCURLY); } 202 "[" { RET(DT_LSQUARE); } 203 "]" { RET(DT_RSQUARE); } 204 "(" { RET(DT_LPAREN); } 205 ")" { RET(DT_RPAREN); } 206 "<" { RET(DT_LANGLE); } 207 ">" { RET(DT_RANGLE); } 208 209 IPV4ADDR/[^0-9] { 210 RET(DT_IPV4_ADDRESS); 211 } 212 213 [0-9a-fA-F]{8}("-"[0-9a-fA-F]{4}){3}"-"[0-9a-fA-F]{12} { RET(DT_UUID); } 214 215 [0-9]"."[0-9]+'e'[\-\+][0-9]+ { RET(DT_NUMBER); } 216 217 [0-9]+("."[0-9]+[a-zA-Z0-9_]*){2,}("-"[a-zA-Z0-9_]+)?|[0-9]+("."[0-9]+[a-zA-Z0-9_]*)+"-"[a-zA-Z0-9_]+ { 218 RET(DT_VERSION_NUMBER); 219 } 220 221 "-"?"0"[0-7]+ { RET(DT_OCTAL_NUMBER); } 222 "-"?[0-9]+("."[0-9]+)?[ ]*"%" { RET(DT_PERCENTAGE); } 223 "-"?[0-9]+("."[0-9]+)?([eE][\-+][0-9]+)? { RET(DT_NUMBER); } 224 "-"?("0x"|[0-9])[0-9a-fA-F]+ { RET(DT_HEX_NUMBER); } 225 226 [a-zA-Z0-9\._%+-]+"@"[a-zA-Z0-9\.-]+"."[a-zA-Z]+ { RET(DT_EMAIL); } 227 228 "true"|"True"|"TRUE"|"false"|"False"|"FALSE"|"None"|"null"|"NULL"/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_CONSTANT); } 229 230 ("re-")?[a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); } 231 232 [^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+("::"[^\x00"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* { 233 RET(DT_SYMBOL); 234 } 235 236 ("\r"?"\n"|"\\n") { RET(DT_LINE); } 237 SPACE+ { RET(DT_WHITE); } 238 "." { RET(DT_DOT); } 239 . { RET(DT_GARBAGE); } 240 241 */ 242} 243