1 /* regex.h -*- mode:c; coding:utf-8; -*- 2 * 3 * Copyright (c) 2010-2021 Takashi Kato <ktakashi@ymail.com> 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 * 28 * $Id: $ 29 */ 30 #ifndef SAGITTARIUS_PRIVATE_REGEX_H_ 31 #define SAGITTARIUS_PRIVATE_REGEX_H_ 32 33 #include <sagittarius/private.h> 34 35 enum PatternFlags { 36 /* on Sagittarius Scheme, internal line seperator is lf. */ 37 /* SG_UNIX_LINE = (1L << 0), */ 38 SG_CASE_INSENSITIVE = (1L << 1), 39 SG_DOTALL = (1L << 2), 40 SG_UNICODE_CASE = (1L << 3), 41 SG_COMMENTS = (1L << 4), 42 SG_MULTILINE = (1L << 5), 43 SG_LITERAL = (1L << 6), 44 /* we do not support canonical equivalence */ 45 /* SG_CANON_EQ = (1L << 7), */ 46 }; 47 48 enum Anchor { 49 UNANCHORED, /* No anchoring */ 50 ANCHOR_START, /* Anchor at start only */ 51 ANCHOR_BOTH, /* Anchor at start and end */ 52 }; 53 54 typedef struct inst_rec_t inst_t; 55 56 typedef union { /* arguments for opcode */ 57 SgChar c; /* RX_CHAR: target character */ 58 unsigned long n; /* RX_SAVE: submatch start or end position */ 59 struct { /* RX_SPLIT */ 60 inst_t *x; /* primary position to jump */ 61 inst_t *y; /* secondary position to jump */ 62 } pos; 63 SgObject set; /* RX_SET: charset */ 64 unsigned int flags; /* RX_FLAGS */ 65 unsigned long index; /* RX_BREF: reference index */ 66 struct { /* RX_BRANCH or RX_BRANCHA */ 67 inst_t *x; /* yes-pattern */ 68 inst_t *y; /* no-pattern */ 69 long n; /* submatch */ 70 } cond; 71 } inst_arg_t; 72 73 struct inst_rec_t 74 { 75 unsigned char opcode; /* opcode: max 255 */ 76 unsigned char flags; 77 inst_arg_t arg; 78 }; 79 80 #define INST_OPCODE(i) ((i)->opcode) 81 #define INST_FLAG(i) ((i)->flags) 82 #define INST_FLAG_SET(i, v) ((i)->flags|=(v)) 83 84 typedef struct 85 { 86 inst_t *root; /* root match code */ 87 int rootLength; 88 } prog_t; 89 90 typedef struct SgPatternRec 91 { 92 SG_HEADER; 93 SgObject pattern; /* regex pattern: string or ast */ 94 SgObject ast; /* parsed ast */ 95 int flags; /* flags, details are above. 96 this flags are initial condition, if regex 97 has (?imx:...) in its body, matcher 98 overwrites the flags in runtime.*/ 99 int groupCount; /* captured group count */ 100 int extendedp; /* if the compiled code is used possesive match, 101 backreference, lookahead or lookbehind or 102 not*/ 103 SgObject groupNames; /* alist of captured register name and groups, 104 could be '() */ 105 prog_t *prog; /* compiled regex */ 106 } SgPattern; 107 108 SG_CLASS_DECL(Sg_PatternClass); 109 #define SG_CLASS_PATTERN (&Sg_PatternClass) 110 #define SG_PATTERN(obj) ((SgPattern *)obj) 111 #define SG_PATTERNP(obj) SG_XTYPEP(obj, SG_CLASS_PATTERN) 112 113 typedef struct text_match_ctx_rec_t text_match_ctx_t; 114 typedef struct binary_match_ctx_rec_t binary_match_ctx_t; 115 116 typedef struct SgMatcherRec 117 { 118 SG_HEADER; 119 SgPattern *pattern; 120 /* privates */ 121 long from; 122 long to; 123 long first; 124 long last; 125 long lastAppendPosition; 126 } SgMatcher; 127 128 typedef struct SgTextMatcherRec 129 { 130 SgMatcher common; 131 SgString *text; 132 text_match_ctx_t *match_ctx; 133 SgString *submatch[1]; 134 } SgTextMatcher; 135 136 typedef struct SgBinaryMatcherRec 137 { 138 SgMatcher common; 139 SgByteVector *text; 140 binary_match_ctx_t *match_ctx; 141 SgByteVector *submatch[1]; 142 } SgBinaryMatcher; 143 144 145 SG_CLASS_DECL(Sg_MatcherClass); 146 #define SG_CLASS_MATCHER (&Sg_MatcherClass) 147 #define SG_MATCHER(obj) ((SgMatcher *)obj) 148 #define SG_MATCHERP(obj) SG_ISA(obj, SG_CLASS_MATCHER) 149 150 SG_CLASS_DECL(Sg_TextMatcherClass); 151 #define SG_CLASS_TEXT_MATCHER (&Sg_TextMatcherClass) 152 #define SG_TEXT_MATCHER(obj) ((SgTextMatcher *)obj) 153 #define SG_TEXT_MATCHERP(obj) SG_XTYPEP(obj, SG_CLASS_TEXT_MATCHER) 154 155 SG_CLASS_DECL(Sg_BinaryMatcherClass); 156 #define SG_CLASS_BINARY_MATCHER (&Sg_BinaryMatcherClass) 157 #define SG_BINARY_MATCHER(obj) ((SgBinaryMatcher *)obj) 158 #define SG_BINARY_MATCHERP(obj) SG_XTYPEP(obj, SG_CLASS_BINARY_MATCHER) 159 160 /* common accessor */ 161 #define SG_MATCHER_PATTERN(r) SG_MATCHER(r)->pattern 162 #define SG_MATCHER_FROM(r) SG_MATCHER(r)->from 163 #define SG_MATCHER_TO(r) SG_MATCHER(r)->to 164 #define SG_MATCHER_FIRST(r) SG_MATCHER(r)->first 165 #define SG_MATCHER_LAST(r) SG_MATCHER(r)->last 166 #define SG_MATCHER_LAST_APPEND_POSITION(r) SG_MATCHER(r)->lastAppendPosition 167 168 SG_CDECL_BEGIN 169 SG_EXTERN SgObject Sg_CompileRegex(SgString *pattern, int flags, 170 int parseOnly); 171 SG_EXTERN SgObject Sg_CompileRegexAST(SgObject ast, int flags); 172 173 /* for debug */ 174 SG_EXTERN void Sg_DumpRegex(SgPattern *pattern, SgObject port); 175 176 /* misc */ 177 SG_EXTERN SgObject Sg_ParseCharSetString(SgString *s, int asciiP, 178 long start, long end); 179 SG_EXTERN SgObject Sg_CharSetToRegexString(SgObject cset, int invertP); 180 181 /* text matcher */ 182 SG_EXTERN SgMatcher* Sg_RegexTextMatcher(SgPattern *pattern, SgString *text, 183 long start, long end); 184 SG_EXTERN SgObject Sg_RegexTextAfter(SgTextMatcher *matcher); 185 SG_EXTERN SgObject Sg_RegexTextBefore(SgTextMatcher *matcher); 186 187 SG_EXTERN int Sg_RegexTextMatches(SgTextMatcher *m); 188 SG_EXTERN int Sg_RegexTextLookingAt(SgTextMatcher *m); 189 SG_EXTERN int Sg_RegexTextFind(SgTextMatcher *m, long start); 190 191 SG_EXTERN SgObject Sg_RegexTextGroup(SgTextMatcher *m, SgObject groupOrName); 192 SG_EXTERN int Sg_RegexTextGroupPosition(SgTextMatcher *m, 193 SgObject groupOrName, 194 int startP); 195 196 SG_EXTERN SgString* Sg_RegexTextReplaceAll(SgTextMatcher *m, 197 SgObject replacement); 198 SG_EXTERN SgString* Sg_RegexTextReplaceFirst(SgTextMatcher *m, 199 SgObject replacement); 200 SG_EXTERN SgString* Sg_RegexTextReplace(SgTextMatcher *m, SgObject replacement, 201 long count); 202 SG_EXTERN int Sg_RegexTextCaptureCount(SgTextMatcher *m); 203 204 /* binary matcher */ 205 SG_EXTERN SgMatcher* Sg_RegexBinaryMatcher(SgPattern *pattern, 206 SgByteVector *text, 207 long start, long end); 208 SG_EXTERN SgObject Sg_RegexBinaryAfter(SgBinaryMatcher *matcher); 209 SG_EXTERN SgObject Sg_RegexBinaryBefore(SgBinaryMatcher *matcher); 210 SG_EXTERN int Sg_RegexBinaryMatches(SgBinaryMatcher *m); 211 SG_EXTERN int Sg_RegexBinaryLookingAt(SgBinaryMatcher *m); 212 SG_EXTERN int Sg_RegexBinaryFind(SgBinaryMatcher *m, long start); 213 214 SG_EXTERN SgObject Sg_RegexBinaryGroup(SgBinaryMatcher *m, 215 SgObject groupOrName); 216 SG_EXTERN int Sg_RegexBinaryGroupPosition(SgBinaryMatcher *m, 217 SgObject groupOrName, 218 int startP); 219 220 SG_EXTERN SgByteVector* Sg_RegexBinaryReplaceAll(SgBinaryMatcher *m, 221 SgObject replacement); 222 SG_EXTERN SgByteVector* Sg_RegexBinaryReplaceFirst(SgBinaryMatcher *m, 223 SgObject replacement); 224 SG_EXTERN SgByteVector* Sg_RegexBinaryReplace(SgBinaryMatcher *m, 225 SgObject replacement, 226 long count); 227 SG_EXTERN int Sg_RegexBinaryCaptureCount(SgBinaryMatcher *m); 228 229 230 231 /* Old interfaces */ 232 SG_EXTERN SgMatcher* Sg_RegexMatcher(SgPattern *pattern, SgObject text, 233 long start, long end); 234 SG_EXTERN int Sg_RegexMatches(SgMatcher *m); 235 SG_EXTERN int Sg_RegexLookingAt(SgMatcher *m); 236 SG_EXTERN int Sg_RegexFind(SgMatcher *m, long start); 237 238 SG_EXTERN SgObject Sg_RegexGroup(SgMatcher *m, SgObject groupOrName); 239 SG_EXTERN int Sg_RegexGroupPosition(SgMatcher *m, 240 SgObject groupOrName, 241 int startP); 242 243 SG_EXTERN SgObject Sg_RegexReplaceAll(SgMatcher *m, 244 SgObject replacement); 245 SG_EXTERN SgObject Sg_RegexReplaceFirst(SgMatcher *m, 246 SgObject replacement); 247 SG_EXTERN SgObject Sg_RegexReplace(SgMatcher *m, SgObject replacement, 248 long count); 249 SG_EXTERN int Sg_RegexCaptureCount(SgMatcher *m); 250 SG_EXTERN SgObject Sg_RegexAfter(SgMatcher *matcher); 251 SG_EXTERN SgObject Sg_RegexBefore(SgMatcher *matcher); 252 253 SG_CDECL_END 254 255 #endif /* SAGITTARIUS_REGEX_H_ */ 256