1 /*
2 * Copyright 2005-2007 Gerald Schmidt.
3 *
4 * This file is part of Xml Copy Editor.
5 *
6 * Xml Copy Editor is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * Xml Copy Editor is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Xml Copy Editor; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <iostream>
22 #include <string>
23 #include <ctype.h>
24 #include <stdexcept>
25 #include <cstring>
26 #include "wrapregex.h"
27 #include "contexthandler.h"
28
29 using namespace std;
30
WrapRegex(const string & pattern,bool matchCase,const string & replaceParameter,const int arrayLengthParameter)31 WrapRegex::WrapRegex (
32 const string& pattern,
33 bool matchCase,
34 const string& replaceParameter,
35 const int arrayLengthParameter ) :
36 replace ( replaceParameter ),
37 arrayLength ( arrayLengthParameter ),
38 returnValue ( 0 )
39 {
40 if ( pattern.empty() || pattern == ".*" )
41 {
42 disabled = true;
43 matchArray = NULL;
44 patternStructure = NULL;
45 patternExtraStructure = NULL;
46 return;
47 }
48 disabled = false;
49
50 matchArray = new int[arrayLength];
51
52 // compile
53 int optionsFlag = ( matchCase ) ? PCRE_UTF8 : PCRE_CASELESS | PCRE_UTF8;
54 const char *errorPointer;
55 int errorOffset;
56
57 if ( ( patternStructure = pcre_compile (
58 pattern.c_str(),
59 optionsFlag,
60 &errorPointer,
61 &errorOffset,
62 NULL ) ) == NULL )
63 {
64 throw runtime_error ( errorPointer );
65 }
66
67 patternExtraStructure = pcre_study ( patternStructure, 0, &errorPointer );
68 }
69
~WrapRegex()70 WrapRegex::~WrapRegex()
71 {
72 if ( disabled )
73 return;
74
75 pcre_free ( patternStructure );
76 pcre_free ( patternExtraStructure );
77 delete[] matchArray;
78 }
79
matchPatternGlobal(string & buffer,vector<ContextMatch> & matchVector,unsigned elementCount,int context)80 int WrapRegex::matchPatternGlobal (
81 string &buffer,
82 vector<ContextMatch> &matchVector,
83 unsigned elementCount,
84 int context )
85 {
86 if ( disabled )
87 return 0;
88
89 return matchPatternGlobal_ (
90 buffer.c_str(),
91 buffer.size(),
92 matchVector,
93 elementCount,
94 context );
95 }
96
/*
 * Returns a copy of `buffer` in which every match of the compiled pattern
 * is replaced by the interpolated `replace` template.
 *
 * buffer      text to search
 * matchCount  out-parameter; receives the number of replacements made
 *
 * A disabled instance returns `buffer` unchanged with *matchCount == 0.
 *
 * The whole buffer is handed to PCRE on every call and only the start
 * offset moves, so anchors, lookbehind and word boundaries are evaluated
 * against the real surrounding text rather than against a truncated tail.
 * A pattern that matches the empty string is replaced once per position
 * and the scan then steps over one UTF-8 character, which guarantees
 * termination.
 */
string WrapRegex::replaceGlobal (
    const string& buffer,
    int *matchCount )
{
    *matchCount = 0;

    if ( disabled )
        return buffer;

    const char *subject = buffer.c_str();
    const int subjectLength = static_cast<int> ( buffer.size() );
    int offset = 0; // where the next search starts
    int copied = 0; // end of the input already transferred to output

    string output;
    output.reserve ( buffer.size() );

    while ( offset <= subjectLength &&
            ( returnValue = pcre_exec (
                                patternStructure,
                                patternExtraStructure,
                                subject,
                                subjectLength,
                                offset,
                                0,
                                matchArray,
                                arrayLength ) ) >= 0 )
    {
        ++ ( *matchCount );

        const int matchStart = matchArray[0];
        const int matchEnd = matchArray[1];

        // unmatched text between the previous match and this one
        if ( matchStart > copied )
            output.append ( subject + copied, matchStart - copied );

        // offsets in matchArray are relative to `subject`
        output.append ( getInterpolatedString_ ( subject, replace.c_str() ) );
        copied = matchEnd;

        if ( matchEnd > matchStart )
        {
            offset = matchEnd;
            continue;
        }

        // Empty match: searching again from the same offset would find it
        // again forever, so step over one character instead.
        if ( matchEnd >= subjectLength )
            break;
        offset = matchEnd + 1;
        // PCRE_UTF8 requires the start offset to sit on a character
        // boundary, so skip UTF-8 continuation bytes (10xxxxxx).
        while ( offset < subjectLength &&
                ( static_cast<unsigned char> ( subject[offset] ) & 0xC0 ) == 0x80 )
            ++offset;
    }

    // trailing text after the last match
    if ( copied < subjectLength )
        output.append ( subject + copied, subjectLength - copied );
    return output;
}
133
matchPatternGlobal_(const char * buffer,size_t buflen,vector<ContextMatch> & matchVector,unsigned elementCount,int context)134 int WrapRegex::matchPatternGlobal_ (
135 const char *buffer,
136 size_t buflen,
137 vector<ContextMatch> &matchVector,
138 unsigned elementCount,
139 int context )
140 {
141 if ( disabled )
142 return 0;
143
144 const char *s, *origin;
145 int matchcount;
146 size_t offset;
147 ContextMatch match;
148
149 s = origin = buffer;
150 matchcount = 0;
151 offset = 0;
152
153 while ( ( returnValue = pcre_exec (
154 patternStructure,
155 patternExtraStructure,
156 s,
157 buflen,
158 offset,
159 0,
160 matchArray,
161 arrayLength ) ) >= 0 )
162 {
163 ++matchcount;
164
165 if ( context )
166 {
167 match = ContextHandler::getContext (
168 s + matchArray[0],
169 matchArray[1] - matchArray[0],
170 origin,
171 context );
172 }
173 else
174 {
175 match.prelog = match.postlog = "";
176 match.match.assign ( s + matchArray[0], matchArray[1] - matchArray[0] );
177 }
178
179 // record element and offset information
180 match.elementCount = elementCount;
181 match.offset = matchArray[0];
182
183 if ( replace != "" )
184 match.replace = getInterpolatedString_ ( s, ( char * ) replace.c_str() );
185
186 matchVector.push_back ( match );
187
188 if ( ( offset = matchArray[1] ) >= buflen )
189 break;
190 }
191 return matchcount;
192 }
193
getInterpolatedString_(const char * buffer,const char * source)194 string WrapRegex::getInterpolatedString_ ( const char *buffer, const char *source )
195 {
196 if ( disabled )
197 return "";
198
199 const char *s = source;
200
201 string interpol_string;
202
203 int escapeState = false;
204 for ( ; *s; ++s )
205 {
206 if ( *s == '\\' )
207 {
208 escapeState = ( escapeState ) ? false : true;
209 if ( escapeState )
210 {
211 if ( isdigit ( * ( s + 1 ) ) )
212 {
213 const char *number, *it;
214 number = s + 1;
215 for ( it = number; *it && isdigit ( * ( it + 1 ) ); ++it )
216 ;
217 size_t len = it - s;
218 char *tmp = new char[len + 1];
219 memcpy ( tmp, number, sizeof ( char ) * len );
220 * ( tmp + len ) = '\0';
221 int i = atoi ( tmp );
222 delete[] tmp;
223
224 interpol_string += getSubpattern_ ( buffer, i );
225
226 s += len;
227 escapeState = false;
228 }
229 else if ( * ( s + 1 ) == 't' )
230 {
231 interpol_string += '\t';
232 ++s;
233 escapeState = false;
234 }
235 else if ( * ( s + 1 ) == 'n' )
236 {
237 interpol_string += '\n';
238 ++s;
239 escapeState = false;
240 }
241 else
242 interpol_string += *s;
243 }
244 else
245 interpol_string += *s;
246 }
247 else
248 interpol_string += *s;
249 }
250 return interpol_string;
251 }
252
getSubpattern_(const char * s,unsigned subpattern)253 string WrapRegex::getSubpattern_ ( const char *s, unsigned subpattern )
254 {
255 if ( disabled )
256 return "";
257
258 const char *sub;
259 int ret = pcre_get_substring ( s, matchArray, returnValue, subpattern, &sub );
260 if ( ret == PCRE_ERROR_NOSUBSTRING || ret == PCRE_ERROR_NOMEMORY )
261 return "";
262 string subString ( sub );
263 pcre_free_substring ( sub );
264 return subString;
265 }
266