1 /* 2 tre-internal.h - TRE internal definitions 3 4 This software is released under a BSD-style license. 5 See the file LICENSE for details and copyright. 6 7 */ 8 9 #ifndef TRE_INTERNAL_H 10 #define TRE_INTERNAL_H 1 11 12 #ifdef HAVE_WCHAR_H 13 #include <wchar.h> 14 #endif /* HAVE_WCHAR_H */ 15 16 #ifdef HAVE_WCTYPE_H 17 #include <wctype.h> 18 #endif /* !HAVE_WCTYPE_H */ 19 20 #include <ctype.h> 21 #include <stdint.h> 22 #include "tre.h" 23 24 #ifdef TRE_DEBUG 25 #include <stdio.h> 26 #define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/(void)0,0) 27 #else /* !TRE_DEBUG */ 28 #define DPRINT(msg) do { } while(/*CONSTCOND*/(void)0,0) 29 #endif /* !TRE_DEBUG */ 30 31 #define elementsof(x) ( sizeof(x) / sizeof(x[0]) ) 32 33 #ifdef HAVE_MBRTOWC 34 #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) 35 #else /* !HAVE_MBRTOWC */ 36 #ifdef HAVE_MBTOWC 37 #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n))) 38 #endif /* HAVE_MBTOWC */ 39 #endif /* !HAVE_MBRTOWC */ 40 41 #ifdef TRE_MULTIBYTE 42 #ifdef HAVE_MBSTATE_T 43 #define TRE_MBSTATE 44 #endif /* TRE_MULTIBYTE */ 45 #endif /* HAVE_MBSTATE_T */ 46 47 /* Define the character types and functions. */ 48 #ifdef TRE_WCHAR 49 50 /* Wide characters. */ 51 typedef wint_t tre_cint_t; 52 /* Workaround problem seen on AIX, (2010 & 2015), e.g., 53 https://stat.ethz.ch/pipermail/r-devel/2015-October/071902.html 54 WCHAR_MAX = UINT32_MAX on AIX and that is "not possible to work" 55 Solaris-sparcv9 WCHAR_MAX = INT32_MAX 56 Linux amd64 WCHAR_MAX = INT32_MAX 57 */ 58 /* 59 [U]INT32_MAX need to be declared: this is a C99 header which we assume 60 */ 61 #include <stdint.h> 62 #if WCHAR_MAX == UINT32_MAX 63 # define TRE_CHAR_MAX INT32_MAX 64 #else 65 # define TRE_CHAR_MAX WCHAR_MAX 66 #endif 67 68 #ifdef TRE_MULTIBYTE 69 #define TRE_MB_CUR_MAX MB_CUR_MAX 70 #else /* !TRE_MULTIBYTE */ 71 #define TRE_MB_CUR_MAX 1 72 #endif /* !TRE_MULTIBYTE */ 73 74 #include "rlocale.h" 75 76 #define tre_isalnum iswalnum 77 #define tre_isalpha iswalpha 78 #define tre_isblank iswblank 79 #define tre_iscntrl iswcntrl 80 #define tre_isdigit iswdigit 81 #define tre_isgraph iswgraph 82 #define tre_islower iswlower 83 #define tre_isprint iswprint 84 #define tre_ispunct iswpunct 85 #define tre_isspace iswspace 86 #define tre_isupper iswupper 87 #define tre_isxdigit iswxdigit 88 89 #define tre_tolower towlower 90 #define tre_toupper towupper 91 #define tre_strlen wcslen 92 93 #else /* !TRE_WCHAR */ 94 95 /* 8 bit characters. */ 96 typedef short tre_cint_t; 97 #define TRE_CHAR_MAX 255 98 #define TRE_MB_CUR_MAX 1 99 100 #define tre_isalnum isalnum 101 #define tre_isalpha isalpha 102 #ifdef HAVE_ISASCII 103 #define tre_isascii isascii 104 #endif /* HAVE_ISASCII */ 105 #ifdef HAVE_ISBLANK 106 #define tre_isblank isblank 107 #endif /* HAVE_ISBLANK */ 108 #define tre_iscntrl iscntrl 109 #define tre_isdigit isdigit 110 #define tre_isgraph isgraph 111 #define tre_islower islower 112 #define tre_isprint isprint 113 #define tre_ispunct ispunct 114 #define tre_isspace isspace 115 #define tre_isupper isupper 116 #define tre_isxdigit isxdigit 117 118 #define tre_tolower(c) (tre_cint_t)(tolower(c)) 119 #define tre_toupper(c) (tre_cint_t)(toupper(c)) 120 #define tre_strlen(s) (strlen((const char*)s)) 121 122 #endif /* !TRE_WCHAR */ 123 124 /* _WIN32 opt-out is R addition - iswctype was missing "blank" 125 R requires iswctype and wctype */ 126 #if !defined(_WIN32) && defined(TRE_WCHAR) && defined(HAVE_ISWCTYPE) && defined(HAVE_WCTYPE) 127 #define TRE_USE_SYSTEM_WCTYPE 1 128 #endif 129 130 #ifdef TRE_USE_SYSTEM_WCTYPE 131 /* Use system provided iswctype() and wctype(). */ 132 typedef wctype_t tre_ctype_t; 133 #define tre_isctype(c, type) iswctype(c, type) 134 #define tre_ctype(s) wctype(s) 135 #else /* !TRE_USE_SYSTEM_WCTYPE */ 136 /* Define our own versions of iswctype() and wctype(). */ 137 typedef int (*tre_ctype_t)(tre_cint_t); 138 #define tre_isctype(c, type) ( (type)(c) ) 139 tre_ctype_t tre_ctype(const char *name); 140 #endif /* !TRE_USE_SYSTEM_WCTYPE */ 141 142 typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; 143 144 /* Returns number of bytes to add to (char *)ptr to make it 145 properly aligned for the type. */ 146 /* R change: was (long) but that is shorter than pointer on Win64 */ 147 #define ALIGN(ptr, type) \ 148 ((((size_t)ptr) % sizeof(type)) \ 149 ? (sizeof(type) - (((size_t)ptr) % sizeof(type))) \ 150 : 0) 151 152 #undef MAX 153 #undef MIN 154 #define MAX(a, b) (((a) >= (b)) ? (a) : (b)) 155 #define MIN(a, b) (((a) <= (b)) ? (a) : (b)) 156 157 /* Define STRF to the correct printf formatter for strings. */ 158 #ifdef TRE_WCHAR 159 #define STRF "ls" 160 #else /* !TRE_WCHAR */ 161 #define STRF "s" 162 #endif /* !TRE_WCHAR */ 163 164 /* TNFA transition type. A TNFA state is an array of transitions, 165 the terminator is a transition with NULL `state'. */ 166 typedef struct tnfa_transition tre_tnfa_transition_t; 167 168 struct tnfa_transition { 169 /* Range of accepted characters. */ 170 tre_cint_t code_min; 171 tre_cint_t code_max; 172 /* Pointer to the destination state. */ 173 tre_tnfa_transition_t *state; 174 /* ID number of the destination state. */ 175 int state_id; 176 /* -1 terminated array of tags (or NULL). */ 177 int *tags; 178 /* Matching parameters settings (or NULL). */ 179 int *params; 180 /* Assertion bitmap. */ 181 int assertions; 182 /* Assertion parameters. */ 183 union { 184 /* Character class assertion. */ 185 tre_ctype_t class; 186 /* Back reference assertion. */ 187 int backref; 188 } u; 189 /* Negative character class assertions. */ 190 tre_ctype_t *neg_classes; 191 }; 192 193 194 /* Assertions. */ 195 #define ASSERT_AT_BOL 1 /* Beginning of line. */ 196 #define ASSERT_AT_EOL 2 /* End of line. */ 197 #define ASSERT_CHAR_CLASS 4 /* Character class in `class'. */ 198 #define ASSERT_CHAR_CLASS_NEG 8 /* Character classes in `neg_classes'. */ 199 #define ASSERT_AT_BOW 16 /* Beginning of word. */ 200 #define ASSERT_AT_EOW 32 /* End of word. */ 201 #define ASSERT_AT_WB 64 /* Word boundary. */ 202 #define ASSERT_AT_WB_NEG 128 /* Not a word boundary. */ 203 #define ASSERT_BACKREF 256 /* A back reference in `backref'. */ 204 #define ASSERT_LAST 256 205 206 /* define R_assert() which can replace assert() */ 207 208 /* fake definition (important: jsut const char* str is not enough!) */ 209 extern void Rf_error(const char *str, ...); 210 211 #ifdef NDEBUG 212 #define R_assert(e) ((void) 0) 213 #else 214 /* The line below requires an ANSI C preprocessor (stringify operator) */ 215 #define R_assert(e) ((e) ? (void) 0 : Rf_error("assertion '%s' failed in executing regexp: file '%s', line %d\n", #e, __FILE__, __LINE__)) 216 #endif /* NDEBUG */ 217 218 /* Tag directions. */ 219 typedef enum { 220 TRE_TAG_MINIMIZE = 0, 221 TRE_TAG_MAXIMIZE = 1 222 } tre_tag_direction_t; 223 224 /* Parameters that can be changed dynamically while matching. */ 225 typedef enum { 226 TRE_PARAM_COST_INS = 0, 227 TRE_PARAM_COST_DEL = 1, 228 TRE_PARAM_COST_SUBST = 2, 229 TRE_PARAM_COST_MAX = 3, 230 TRE_PARAM_MAX_INS = 4, 231 TRE_PARAM_MAX_DEL = 5, 232 TRE_PARAM_MAX_SUBST = 6, 233 TRE_PARAM_MAX_ERR = 7, 234 TRE_PARAM_DEPTH = 8, 235 TRE_PARAM_LAST = 9 236 } tre_param_t; 237 238 /* Unset matching parameter */ 239 #define TRE_PARAM_UNSET -1 240 241 /* Signifies the default matching parameter value. */ 242 #define TRE_PARAM_DEFAULT -2 243 244 /* Instructions to compute submatch register values from tag values 245 after a successful match. */ 246 struct tre_submatch_data { 247 /* Tag that gives the value for rm_so (submatch start offset). */ 248 int so_tag; 249 /* Tag that gives the value for rm_eo (submatch end offset). */ 250 int eo_tag; 251 /* List of submatches this submatch is contained in. */ 252 int *parents; 253 }; 254 255 typedef struct tre_submatch_data tre_submatch_data_t; 256 257 258 /* TNFA definition. */ 259 typedef struct tnfa tre_tnfa_t; 260 261 struct tnfa { 262 tre_tnfa_transition_t *transitions; 263 unsigned int num_transitions; 264 tre_tnfa_transition_t *initial; 265 tre_tnfa_transition_t *final; 266 tre_submatch_data_t *submatch_data; 267 char *firstpos_chars; 268 int first_char; 269 unsigned int num_submatches; 270 tre_tag_direction_t *tag_directions; 271 int *minimal_tags; 272 int num_tags; 273 int num_minimals; 274 int end_tag; 275 int num_states; 276 int cflags; 277 int have_backrefs; 278 int have_approx; 279 int params_depth; 280 }; 281 282 int 283 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags); 284 285 void 286 tre_free(regex_t *preg); 287 288 void 289 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, 290 const tre_tnfa_t *tnfa, int *tags, int match_eo); 291 292 reg_errcode_t 293 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 294 tre_str_type_t type, int *match_tags, int eflags, 295 int *match_end_ofs); 296 297 reg_errcode_t 298 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len, 299 tre_str_type_t type, int *match_tags, int eflags, 300 int *match_end_ofs); 301 302 reg_errcode_t 303 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, 304 int len, tre_str_type_t type, int *match_tags, 305 int eflags, int *match_end_ofs); 306 307 #ifdef TRE_APPROX 308 reg_errcode_t 309 tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len, 310 tre_str_type_t type, int *match_tags, 311 regamatch_t *match, regaparams_t params, 312 int eflags, int *match_end_ofs); 313 #endif /* TRE_APPROX */ 314 315 #endif /* TRE_INTERNAL_H */ 316 317 /* EOF */ 318