1 /* regex.h                                         -*- mode:c; coding:utf-8; -*-
2  *
3  *   Copyright (c) 2010-2021  Takashi Kato <ktakashi@ymail.com>
4  *
5  *   Redistribution and use in source and binary forms, with or without
6  *   modification, are permitted provided that the following conditions
7  *   are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright
10  *      notice, this list of conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  *  $Id: $
29  */
30 #ifndef SAGITTARIUS_PRIVATE_REGEX_H_
31 #define SAGITTARIUS_PRIVATE_REGEX_H_
32 
33 #include <sagittarius/private.h>
34 
/*
 * Option bits for a regex pattern.  Every enumerator occupies its own bit,
 * so several of them can be combined with bitwise OR.
 *
 * Two bit positions are intentionally left unassigned:
 *   bit 0 (SG_UNIX_LINE) - not needed; on Sagittarius Scheme the internal
 *                          line separator is LF.
 *   bit 7 (SG_CANON_EQ)  - canonical equivalence is not supported.
 */
enum PatternFlags {
  SG_CASE_INSENSITIVE = 1L << 1,
  SG_DOTALL           = 1L << 2,
  SG_UNICODE_CASE     = 1L << 3,
  SG_COMMENTS         = 1L << 4,
  SG_MULTILINE        = 1L << 5,
  SG_LITERAL          = 1L << 6
};
47 
/* Anchoring modes: which ends of the input a match is tied to. */
enum Anchor {
  UNANCHORED   = 0,   /* no anchoring */
  ANCHOR_START = 1,   /* anchored at the start only */
  ANCHOR_BOTH  = 2    /* anchored at both start and end */
};
53 
54 typedef struct inst_rec_t inst_t;
55 
56 typedef union {		 /* arguments for opcode */
57   SgChar c;		 /* RX_CHAR: target character */
58   unsigned long n;	 /* RX_SAVE: submatch start or end position */
59   struct {		 /* RX_SPLIT */
60     inst_t *x;		 /*   primary position to jump */
61     inst_t *y;		 /*   secondary position to jump */
62   } pos;
63   SgObject set;		 /* RX_SET: charset */
64   unsigned int flags;	 /* RX_FLAGS */
65   unsigned long index;	 /* RX_BREF: reference index */
66   struct {		 /* RX_BRANCH or RX_BRANCHA */
67     inst_t *x;			/* yes-pattern */
68     inst_t *y;			/* no-pattern */
69     long    n;			/* submatch */
70   } cond;
71 } inst_arg_t;
72 
73 struct inst_rec_t
74 {
75   unsigned char  opcode;	/* opcode: max 255 */
76   unsigned char  flags;
77   inst_arg_t     arg;
78 };
79 
/*
 * Field accessors for an instruction pointer `i`.
 * INST_OPCODE and INST_FLAG expand to lvalues; INST_FLAG_SET ORs the bits
 * of `v` into the flags and evaluates to the updated flag value.
 */
#define INST_OPCODE(i) ((i)->opcode)
#define INST_FLAG(i) ((i)->flags)
#define INST_FLAG_SET(i, v) ((i)->flags |= (v))
83 
84 typedef struct
85 {
86   inst_t *root;			/* root match code */
87   int     rootLength;
88 } prog_t;
89 
90 typedef struct SgPatternRec
91 {
92   SG_HEADER;
93   SgObject pattern;		/* regex pattern: string or ast */
94   SgObject ast;			/* parsed ast */
95   int      flags;		/* flags, details are above.
96 				   this flags are initial condition, if regex
97 				   has (?imx:...) in its body, matcher
98 				   overwrites the flags in runtime.*/
99   int      groupCount;		/* captured group count */
100   int      extendedp;		/* if the compiled code is used possesive match,
101 				   backreference, lookahead or lookbehind or
102 				   not*/
103   SgObject groupNames;		/* alist of captured register name and groups,
104 				   could be '() */
105   prog_t  *prog;		/* compiled regex */
106 } SgPattern;
107 
108 SG_CLASS_DECL(Sg_PatternClass);
/*
 * Pattern class pointer, cast and type predicate.
 * SG_PATTERN parenthesizes its argument so that an expression argument
 * (e.g. `base + offset`) is evaluated before the cast is applied.
 */
#define SG_CLASS_PATTERN   (&Sg_PatternClass)
#define SG_PATTERN(obj)    ((SgPattern *)(obj))
#define SG_PATTERNP(obj)   SG_XTYPEP(obj, SG_CLASS_PATTERN)
112 
113 typedef struct text_match_ctx_rec_t text_match_ctx_t;
114 typedef struct binary_match_ctx_rec_t binary_match_ctx_t;
115 
116 typedef struct SgMatcherRec
117 {
118   SG_HEADER;
119   SgPattern *pattern;
120   /* privates */
121   long       from;
122   long       to;
123   long       first;
124   long       last;
125   long       lastAppendPosition;
126 } SgMatcher;
127 
128 typedef struct SgTextMatcherRec
129 {
130   SgMatcher  common;
131   SgString  *text;
132   text_match_ctx_t *match_ctx;
133   SgString   *submatch[1];
134 } SgTextMatcher;
135 
136 typedef struct SgBinaryMatcherRec
137 {
138   SgMatcher  common;
139   SgByteVector *text;
140   binary_match_ctx_t   *match_ctx;
141   SgByteVector  *submatch[1];
142 } SgBinaryMatcher;
143 
144 
/*
 * Class objects, casts and type predicates for the matcher types.
 *
 * The cast macros parenthesize their argument so that an expression
 * argument (e.g. `base + offset`) is evaluated before the cast is applied.
 * SG_MATCHERP tests with SG_ISA, whereas the text and binary predicates
 * test with SG_XTYPEP.
 */
SG_CLASS_DECL(Sg_MatcherClass);
#define SG_CLASS_MATCHER         (&Sg_MatcherClass)
#define SG_MATCHER(obj)          ((SgMatcher *)(obj))
#define SG_MATCHERP(obj)         SG_ISA(obj, SG_CLASS_MATCHER)

SG_CLASS_DECL(Sg_TextMatcherClass);
#define SG_CLASS_TEXT_MATCHER    (&Sg_TextMatcherClass)
#define SG_TEXT_MATCHER(obj)     ((SgTextMatcher *)(obj))
#define SG_TEXT_MATCHERP(obj)    SG_XTYPEP(obj, SG_CLASS_TEXT_MATCHER)

SG_CLASS_DECL(Sg_BinaryMatcherClass);
#define SG_CLASS_BINARY_MATCHER  (&Sg_BinaryMatcherClass)
#define SG_BINARY_MATCHER(obj)   ((SgBinaryMatcher *)(obj))
#define SG_BINARY_MATCHERP(obj)  SG_XTYPEP(obj, SG_CLASS_BINARY_MATCHER)
159 
/*
 * Common accessors for the fields shared by every matcher.
 *
 * `r` is any matcher pointer; it is viewed as an SgMatcher through
 * SG_MATCHER.  The argument is parenthesized here, so an expression such
 * as `base + offset` is evaluated before SG_MATCHER casts it, and each
 * expansion is wrapped in parentheses so it behaves as a single operand.
 * Every accessor still expands to an lvalue and can be assigned to.
 */
#define SG_MATCHER_PATTERN(r)  (SG_MATCHER((r))->pattern)
#define SG_MATCHER_FROM(r)     (SG_MATCHER((r))->from)
#define SG_MATCHER_TO(r)       (SG_MATCHER((r))->to)
#define SG_MATCHER_FIRST(r)    (SG_MATCHER((r))->first)
#define SG_MATCHER_LAST(r)     (SG_MATCHER((r))->last)
#define SG_MATCHER_LAST_APPEND_POSITION(r) \
  (SG_MATCHER((r))->lastAppendPosition)
167 
168 SG_CDECL_BEGIN
169 SG_EXTERN SgObject   Sg_CompileRegex(SgString *pattern, int flags,
170 				     int parseOnly);
171 SG_EXTERN SgObject   Sg_CompileRegexAST(SgObject ast, int flags);
172 
173 /* for debug */
174 SG_EXTERN void       Sg_DumpRegex(SgPattern *pattern, SgObject port);
175 
176 /* misc */
177 SG_EXTERN SgObject   Sg_ParseCharSetString(SgString *s, int asciiP,
178 					   long start, long end);
179 SG_EXTERN SgObject   Sg_CharSetToRegexString(SgObject cset, int invertP);
180 
181 /* text matcher */
182 SG_EXTERN SgMatcher* Sg_RegexTextMatcher(SgPattern *pattern, SgString *text,
183 					 long start, long end);
184 SG_EXTERN SgObject   Sg_RegexTextAfter(SgTextMatcher *matcher);
185 SG_EXTERN SgObject   Sg_RegexTextBefore(SgTextMatcher *matcher);
186 
187 SG_EXTERN int        Sg_RegexTextMatches(SgTextMatcher *m);
188 SG_EXTERN int        Sg_RegexTextLookingAt(SgTextMatcher *m);
189 SG_EXTERN int        Sg_RegexTextFind(SgTextMatcher *m, long start);
190 
191 SG_EXTERN SgObject   Sg_RegexTextGroup(SgTextMatcher *m, SgObject groupOrName);
192 SG_EXTERN int        Sg_RegexTextGroupPosition(SgTextMatcher *m,
193 					       SgObject groupOrName,
194 					       int startP);
195 
196 SG_EXTERN SgString*  Sg_RegexTextReplaceAll(SgTextMatcher *m,
197 					    SgObject replacement);
198 SG_EXTERN SgString*  Sg_RegexTextReplaceFirst(SgTextMatcher *m,
199 					      SgObject replacement);
200 SG_EXTERN SgString*  Sg_RegexTextReplace(SgTextMatcher *m, SgObject replacement,
201 					 long count);
202 SG_EXTERN int        Sg_RegexTextCaptureCount(SgTextMatcher *m);
203 
204 /* binary matcher */
205 SG_EXTERN SgMatcher* Sg_RegexBinaryMatcher(SgPattern *pattern,
206 					   SgByteVector *text,
207 					   long start, long end);
208 SG_EXTERN SgObject   Sg_RegexBinaryAfter(SgBinaryMatcher *matcher);
209 SG_EXTERN SgObject   Sg_RegexBinaryBefore(SgBinaryMatcher *matcher);
210 SG_EXTERN int        Sg_RegexBinaryMatches(SgBinaryMatcher *m);
211 SG_EXTERN int        Sg_RegexBinaryLookingAt(SgBinaryMatcher *m);
212 SG_EXTERN int        Sg_RegexBinaryFind(SgBinaryMatcher *m, long start);
213 
214 SG_EXTERN SgObject   Sg_RegexBinaryGroup(SgBinaryMatcher *m,
215 					 SgObject groupOrName);
216 SG_EXTERN int        Sg_RegexBinaryGroupPosition(SgBinaryMatcher *m,
217 					       SgObject groupOrName,
218 					       int startP);
219 
220 SG_EXTERN SgByteVector* Sg_RegexBinaryReplaceAll(SgBinaryMatcher *m,
221 						 SgObject replacement);
222 SG_EXTERN SgByteVector* Sg_RegexBinaryReplaceFirst(SgBinaryMatcher *m,
223 						   SgObject replacement);
224 SG_EXTERN SgByteVector* Sg_RegexBinaryReplace(SgBinaryMatcher *m,
225 					      SgObject replacement,
226 					      long count);
227 SG_EXTERN int        Sg_RegexBinaryCaptureCount(SgBinaryMatcher *m);
228 
229 
230 
231 /* Old interfaces */
232 SG_EXTERN SgMatcher* Sg_RegexMatcher(SgPattern *pattern, SgObject text,
233 				     long start, long end);
234 SG_EXTERN int        Sg_RegexMatches(SgMatcher *m);
235 SG_EXTERN int        Sg_RegexLookingAt(SgMatcher *m);
236 SG_EXTERN int        Sg_RegexFind(SgMatcher *m, long start);
237 
238 SG_EXTERN SgObject   Sg_RegexGroup(SgMatcher *m, SgObject groupOrName);
239 SG_EXTERN int        Sg_RegexGroupPosition(SgMatcher *m,
240 					   SgObject groupOrName,
241 					   int startP);
242 
243 SG_EXTERN SgObject   Sg_RegexReplaceAll(SgMatcher *m,
244 				       SgObject replacement);
245 SG_EXTERN SgObject   Sg_RegexReplaceFirst(SgMatcher *m,
246 					  SgObject replacement);
247 SG_EXTERN SgObject   Sg_RegexReplace(SgMatcher *m, SgObject replacement,
248 				     long count);
249 SG_EXTERN int        Sg_RegexCaptureCount(SgMatcher *m);
250 SG_EXTERN SgObject   Sg_RegexAfter(SgMatcher *matcher);
251 SG_EXTERN SgObject   Sg_RegexBefore(SgMatcher *matcher);
252 
253 SG_CDECL_END
254 
#endif /* SAGITTARIUS_PRIVATE_REGEX_H_ */
256