1 /*
2  * Regexp.cpp - Regexp.
3  *
4  *   Copyright (c) 2008  Higepon(Taro Minowa)  <higepon@users.sourceforge.jp>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  *  $Id$
30  */
31 
32 #include "Object.h"
33 #include "Object-inl.h"
34 #include "Pair.h"
35 #include "Pair-inl.h"
36 #include "Regexp.h"
37 #include "SString.h"
38 #include "EqHashTable.h"
39 #include "VM.h"
40 
41 using namespace scheme;
42 
43 extern VM* theVM;
44 
45 #if WORDS_BIGENDIAN
46 #define ONIG_ENCODING ONIG_ENCODING_UTF32_BE
47 #else
48 #define ONIG_ENCODING ONIG_ENCODING_UTF32_LE
49 #endif
50 
51 
Regexp(const ucs4string & pattern,bool caseFold,bool isSingleLine)52 Regexp::Regexp(const ucs4string& pattern, bool caseFold, bool isSingleLine) :
53     pattern_(pattern),
54     isErrorOccured_(false),
55     errorMessage_(Object::Nil),
56     irritants_(Object::Nil)
57 {
58     const ucs4char* p = pattern_.data();
59     int r = onig_new(&regexp_,
60                      (const uint8_t*)p,
61                      (const uint8_t*)(p + pattern_.size()),
62                      (ONIG_OPTION_DEFAULT) | (caseFold ? ONIG_OPTION_IGNORECASE : 0) | (isSingleLine? ONIG_OPTION_SINGLELINE : 0),
63                      ONIG_ENCODING,
64                      ONIG_SYNTAX_RUBY,
65                      &einfo_);
66     if (r != ONIG_NORMAL)
67     {
68         char errorMessageBuffer[ONIG_MAX_ERROR_MESSAGE_LEN];
69         onig_error_code_to_str((uint8_t*)errorMessageBuffer, r, &einfo_);
70         isErrorOccured_ = true;
71         errorMessage_ = errorMessageBuffer;
72         irritants_ = L1(Object::makeString(pattern.data()));
73     }
74 }
75 
isErrorOccured() const76 bool Regexp::isErrorOccured() const
77 {
78     return isErrorOccured_;
79 }
80 
errorMessage() const81 Object Regexp::errorMessage() const
82 {
83     return errorMessage_;
84 }
85 
irritants() const86 Object Regexp::irritants() const
87 {
88     return irritants_;
89 }
90 
match(const ucs4string & text)91 Object Regexp::match(const ucs4string& text)
92 {
93     OnigRegion* region= matchInternal(text);
94     if (NULL == region) {
95         return Object::False;
96     } else {
97         return Object::makeRegMatch(region, text);
98     }
99 }
100 
matchInternal(const ucs4string & text)101 OnigRegion* Regexp::matchInternal(const ucs4string& text)
102 {
103     OnigRegion* region = onig_region_new();
104     const uint8_t* start = (const uint8_t*)text.data();
105     const uint8_t* end   = start + text.size() * sizeof(ucs4char);
106     const uint8_t* range = end;
107     const int r = onig_search(regexp_, start, end, start, range, region, ONIG_OPTION_NONE);
108     if (r >= 0) {
109         return region;
110     } else if (r == ONIG_MISMATCH) {
111         return NULL;
112     } else {
113         char errorMessageBuffer[ONIG_MAX_ERROR_MESSAGE_LEN];
114         onig_error_code_to_str((uint8_t*)errorMessageBuffer, r);
115         isErrorOccured_ = true;
116         errorMessage_ = errorMessageBuffer;
117         irritants_ = L2(Object::makeString(text.data()),
118                         Object::makeString(pattern_.data()));
119         return NULL;
120     }
121 }
122 
replace(ucs4string & text,ucs4string & subst,bool & matched)123 ucs4string Regexp::replace(ucs4string& text, ucs4string& subst, bool& matched)
124 {
125     OnigRegion* const region = matchInternal(text);
126     if (NULL == region) {
127         matched = false;
128         return text;
129     }
130     const ucs4string beg = text.substr(0, region->beg[0] / sizeof(ucs4char));
131     const ucs4string end = text.substr(region->end[0] / sizeof(ucs4char), text.size() - region->end[0] / sizeof(ucs4char));
132     matched = true;
133     return (beg + subst + end).data();
134 }
135 
replace(Object t,Object subst)136 Object Regexp::replace(Object t, Object subst)
137 {
138     bool matched;
139     const ucs4string ret = replace(t.toString()->data(), subst.toString()->data(), matched);
140     if (matched) {
141         return Object::makeString(ret);
142     } else {
143         return t;
144     }
145 }
146 
replaceAll(Object t,Object subst)147 Object Regexp::replaceAll(Object t, Object subst)
148 {
149     ucs4string ret;
150     ucs4string targetString = t.toString()->data();
151     ucs4string substitute = subst.toString()->data();
152 
153     for (;;) {
154         OnigRegion* const region = matchInternal(targetString);
155         if (NULL == region) {
156             ret += targetString;
157             break;
158         }
159 
160         const ucs4string preString  = targetString.substr(0, region->beg[0] / sizeof(ucs4char));
161         const ucs4string postString = targetString.substr(region->end[0] / sizeof(ucs4char), targetString.size() - region->end[0] / sizeof(ucs4char));
162         ret += preString + substitute;
163         targetString = postString;
164     }
165     return Object::makeString(ret);
166 }
167 
RegMatch(OnigRegion * region,const ucs4string & text)168 RegMatch::RegMatch(OnigRegion* region, const ucs4string& text) : region_(region),
169                                                                  text_(text),
170                                                                  isErrorOccured_(false),
171                                                                  errorMessage_(Object::Nil),
172                                                                  irritants_(Object::Nil)
173 {
174 }
175 
errorMessage() const176 Object RegMatch::errorMessage() const
177 {
178     return errorMessage_;
179 }
180 
irritants() const181 Object RegMatch::irritants() const
182 {
183     return irritants_;
184 }
185 
isErrorOccured() const186 bool RegMatch::isErrorOccured() const
187 {
188     return isErrorOccured_;
189 }
190 
matchStart(int index)191 int RegMatch::matchStart(int index)
192 {
193     if (index < 0 || index >= region_->num_regs) {
194         isErrorOccured_ = true;
195         errorMessage_ = "submatch index out of range";
196         irritants_ = L1(Object::makeFixnum(index));
197         return -1;
198     }
199     return region_->beg[index] / sizeof(ucs4char);
200 }
201 
matchEnd(int index)202 int RegMatch::matchEnd(int index)
203 {
204     if (index < 0 || index >= region_->num_regs) {
205         isErrorOccured_ = true;
206         errorMessage_ = "submatch index out of range";
207         irritants_ = L1(Object::makeFixnum(index));
208         return -1;
209     }
210     return region_->end[index] / sizeof(ucs4char);
211 }
212 
matchAfter(int index)213 Object RegMatch::matchAfter(int index)
214 {
215     if (index < 0 || index >= region_->num_regs) {
216         isErrorOccured_ = true;
217         errorMessage_ = "submatch index out of range";
218         irritants_ = L1(Object::makeFixnum(index));
219         return Object::Undef;
220     }
221     return Object::makeString(text_.substr(region_->end[index] / sizeof(ucs4char),
222                                           text_.size() - region_->end[index] / sizeof(ucs4char)).data());
223 }
224 
matchBefore(int index)225 Object RegMatch::matchBefore(int index)
226 {
227     if (index < 0 || index >= region_->num_regs)
228     {
229         isErrorOccured_ = true;
230         errorMessage_ = "submatch index out of range";
231         irritants_ = L1(Object::makeFixnum(index));
232         return Object::Undef;
233     }
234     return Object::makeString(text_.substr(0, region_->beg[index] / sizeof(ucs4char)).data());
235 }
236 
matchSubString(int index)237 Object RegMatch::matchSubString(int index)
238 {
239     if (index< 0 || index >= region_->num_regs)
240     {
241         isErrorOccured_ = true;
242         errorMessage_ = "submatch index out of range";
243         irritants_ = L1(Object::makeFixnum(index));
244         return Object::Undef;
245     }
246 
247     // really?
248     if (region_->beg[index] == region_->end[index]) {
249         return Object::False;
250     }
251     return Object::makeString(text_.substr(region_->beg[index] / sizeof(ucs4char),
252                                            region_->end[index] / sizeof(ucs4char) - region_->beg[index] / sizeof(ucs4char)).data());
253 
254 }
255