1 /*
2 * Regexp.cpp - Regexp.
3 *
4 * Copyright (c) 2008 Higepon(Taro Minowa) <higepon@users.sourceforge.jp>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * $Id$
30 */
31
32 #include "Object.h"
33 #include "Object-inl.h"
34 #include "Pair.h"
35 #include "Pair-inl.h"
36 #include "Regexp.h"
37 #include "SString.h"
38 #include "EqHashTable.h"
39 #include "VM.h"
40
41 using namespace scheme;
42
43 extern VM* theVM;
44
45 #if WORDS_BIGENDIAN
46 #define ONIG_ENCODING ONIG_ENCODING_UTF32_BE
47 #else
48 #define ONIG_ENCODING ONIG_ENCODING_UTF32_LE
49 #endif
50
51
Regexp(const ucs4string & pattern,bool caseFold,bool isSingleLine)52 Regexp::Regexp(const ucs4string& pattern, bool caseFold, bool isSingleLine) :
53 pattern_(pattern),
54 isErrorOccured_(false),
55 errorMessage_(Object::Nil),
56 irritants_(Object::Nil)
57 {
58 const ucs4char* p = pattern_.data();
59 int r = onig_new(®exp_,
60 (const uint8_t*)p,
61 (const uint8_t*)(p + pattern_.size()),
62 (ONIG_OPTION_DEFAULT) | (caseFold ? ONIG_OPTION_IGNORECASE : 0) | (isSingleLine? ONIG_OPTION_SINGLELINE : 0),
63 ONIG_ENCODING,
64 ONIG_SYNTAX_RUBY,
65 &einfo_);
66 if (r != ONIG_NORMAL)
67 {
68 char errorMessageBuffer[ONIG_MAX_ERROR_MESSAGE_LEN];
69 onig_error_code_to_str((uint8_t*)errorMessageBuffer, r, &einfo_);
70 isErrorOccured_ = true;
71 errorMessage_ = errorMessageBuffer;
72 irritants_ = L1(Object::makeString(pattern.data()));
73 }
74 }
75
isErrorOccured() const76 bool Regexp::isErrorOccured() const
77 {
78 return isErrorOccured_;
79 }
80
errorMessage() const81 Object Regexp::errorMessage() const
82 {
83 return errorMessage_;
84 }
85
irritants() const86 Object Regexp::irritants() const
87 {
88 return irritants_;
89 }
90
match(const ucs4string & text)91 Object Regexp::match(const ucs4string& text)
92 {
93 OnigRegion* region= matchInternal(text);
94 if (NULL == region) {
95 return Object::False;
96 } else {
97 return Object::makeRegMatch(region, text);
98 }
99 }
100
matchInternal(const ucs4string & text)101 OnigRegion* Regexp::matchInternal(const ucs4string& text)
102 {
103 OnigRegion* region = onig_region_new();
104 const uint8_t* start = (const uint8_t*)text.data();
105 const uint8_t* end = start + text.size() * sizeof(ucs4char);
106 const uint8_t* range = end;
107 const int r = onig_search(regexp_, start, end, start, range, region, ONIG_OPTION_NONE);
108 if (r >= 0) {
109 return region;
110 } else if (r == ONIG_MISMATCH) {
111 return NULL;
112 } else {
113 char errorMessageBuffer[ONIG_MAX_ERROR_MESSAGE_LEN];
114 onig_error_code_to_str((uint8_t*)errorMessageBuffer, r);
115 isErrorOccured_ = true;
116 errorMessage_ = errorMessageBuffer;
117 irritants_ = L2(Object::makeString(text.data()),
118 Object::makeString(pattern_.data()));
119 return NULL;
120 }
121 }
122
replace(ucs4string & text,ucs4string & subst,bool & matched)123 ucs4string Regexp::replace(ucs4string& text, ucs4string& subst, bool& matched)
124 {
125 OnigRegion* const region = matchInternal(text);
126 if (NULL == region) {
127 matched = false;
128 return text;
129 }
130 const ucs4string beg = text.substr(0, region->beg[0] / sizeof(ucs4char));
131 const ucs4string end = text.substr(region->end[0] / sizeof(ucs4char), text.size() - region->end[0] / sizeof(ucs4char));
132 matched = true;
133 return (beg + subst + end).data();
134 }
135
replace(Object t,Object subst)136 Object Regexp::replace(Object t, Object subst)
137 {
138 bool matched;
139 const ucs4string ret = replace(t.toString()->data(), subst.toString()->data(), matched);
140 if (matched) {
141 return Object::makeString(ret);
142 } else {
143 return t;
144 }
145 }
146
replaceAll(Object t,Object subst)147 Object Regexp::replaceAll(Object t, Object subst)
148 {
149 ucs4string ret;
150 ucs4string targetString = t.toString()->data();
151 ucs4string substitute = subst.toString()->data();
152
153 for (;;) {
154 OnigRegion* const region = matchInternal(targetString);
155 if (NULL == region) {
156 ret += targetString;
157 break;
158 }
159
160 const ucs4string preString = targetString.substr(0, region->beg[0] / sizeof(ucs4char));
161 const ucs4string postString = targetString.substr(region->end[0] / sizeof(ucs4char), targetString.size() - region->end[0] / sizeof(ucs4char));
162 ret += preString + substitute;
163 targetString = postString;
164 }
165 return Object::makeString(ret);
166 }
167
RegMatch(OnigRegion * region,const ucs4string & text)168 RegMatch::RegMatch(OnigRegion* region, const ucs4string& text) : region_(region),
169 text_(text),
170 isErrorOccured_(false),
171 errorMessage_(Object::Nil),
172 irritants_(Object::Nil)
173 {
174 }
175
errorMessage() const176 Object RegMatch::errorMessage() const
177 {
178 return errorMessage_;
179 }
180
irritants() const181 Object RegMatch::irritants() const
182 {
183 return irritants_;
184 }
185
isErrorOccured() const186 bool RegMatch::isErrorOccured() const
187 {
188 return isErrorOccured_;
189 }
190
matchStart(int index)191 int RegMatch::matchStart(int index)
192 {
193 if (index < 0 || index >= region_->num_regs) {
194 isErrorOccured_ = true;
195 errorMessage_ = "submatch index out of range";
196 irritants_ = L1(Object::makeFixnum(index));
197 return -1;
198 }
199 return region_->beg[index] / sizeof(ucs4char);
200 }
201
matchEnd(int index)202 int RegMatch::matchEnd(int index)
203 {
204 if (index < 0 || index >= region_->num_regs) {
205 isErrorOccured_ = true;
206 errorMessage_ = "submatch index out of range";
207 irritants_ = L1(Object::makeFixnum(index));
208 return -1;
209 }
210 return region_->end[index] / sizeof(ucs4char);
211 }
212
matchAfter(int index)213 Object RegMatch::matchAfter(int index)
214 {
215 if (index < 0 || index >= region_->num_regs) {
216 isErrorOccured_ = true;
217 errorMessage_ = "submatch index out of range";
218 irritants_ = L1(Object::makeFixnum(index));
219 return Object::Undef;
220 }
221 return Object::makeString(text_.substr(region_->end[index] / sizeof(ucs4char),
222 text_.size() - region_->end[index] / sizeof(ucs4char)).data());
223 }
224
matchBefore(int index)225 Object RegMatch::matchBefore(int index)
226 {
227 if (index < 0 || index >= region_->num_regs)
228 {
229 isErrorOccured_ = true;
230 errorMessage_ = "submatch index out of range";
231 irritants_ = L1(Object::makeFixnum(index));
232 return Object::Undef;
233 }
234 return Object::makeString(text_.substr(0, region_->beg[index] / sizeof(ucs4char)).data());
235 }
236
matchSubString(int index)237 Object RegMatch::matchSubString(int index)
238 {
239 if (index< 0 || index >= region_->num_regs)
240 {
241 isErrorOccured_ = true;
242 errorMessage_ = "submatch index out of range";
243 irritants_ = L1(Object::makeFixnum(index));
244 return Object::Undef;
245 }
246
247 // really?
248 if (region_->beg[index] == region_->end[index]) {
249 return Object::False;
250 }
251 return Object::makeString(text_.substr(region_->beg[index] / sizeof(ucs4char),
252 region_->end[index] / sizeof(ucs4char) - region_->beg[index] / sizeof(ucs4char)).data());
253
254 }
255