1 /*
2  * Copyright 2005-2007 Gerald Schmidt.
3  *
4  * This file is part of Xml Copy Editor.
5  *
6  * Xml Copy Editor is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * Xml Copy Editor is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Xml Copy Editor; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20 
21 #include <iostream>
22 #include <string>
23 #include <ctype.h>
24 #include <stdexcept>
25 #include <cstring>
26 #include "wrapregex.h"
27 #include "contexthandler.h"
28 
29 using namespace std;
30 
WrapRegex(const string & pattern,bool matchCase,const string & replaceParameter,const int arrayLengthParameter)31 WrapRegex::WrapRegex (
32     const string& pattern,
33     bool matchCase,
34     const string& replaceParameter,
35     const int arrayLengthParameter ) :
36 		replace ( replaceParameter ),
37 		arrayLength ( arrayLengthParameter ),
38 		returnValue ( 0 )
39 {
40 	if ( pattern.empty() || pattern == ".*" )
41 	{
42 		disabled = true;
43 		matchArray = NULL;
44 		patternStructure = NULL;
45 		patternExtraStructure = NULL;
46 		return;
47 	}
48 	disabled = false;
49 
50 	matchArray = new int[arrayLength];
51 
52 	// compile
53 	int optionsFlag = ( matchCase ) ? PCRE_UTF8 : PCRE_CASELESS | PCRE_UTF8;
54 	const char *errorPointer;
55 	int errorOffset;
56 
57 	if ( ( patternStructure = pcre_compile (
58 	                              pattern.c_str(),
59 	                              optionsFlag,
60 	                              &errorPointer,
61 	                              &errorOffset,
62 	                              NULL ) ) == NULL )
63 	{
64 		throw runtime_error ( errorPointer );
65 	}
66 
67 	patternExtraStructure = pcre_study ( patternStructure, 0, &errorPointer );
68 }
69 
~WrapRegex()70 WrapRegex::~WrapRegex()
71 {
72 	if ( disabled )
73 		return;
74 
75 	pcre_free ( patternStructure );
76 	pcre_free ( patternExtraStructure );
77 	delete[] matchArray;
78 }
79 
matchPatternGlobal(string & buffer,vector<ContextMatch> & matchVector,unsigned elementCount,int context)80 int WrapRegex::matchPatternGlobal (
81     string &buffer,
82     vector<ContextMatch> &matchVector,
83     unsigned elementCount,
84     int context )
85 {
86 	if ( disabled )
87 		return 0;
88 
89 	return matchPatternGlobal_ (
90 	           buffer.c_str(),
91 	           buffer.size(),
92 	           matchVector,
93 	           elementCount,
94 	           context );
95 }
96 
// Returns a copy of `buffer` in which every match of the compiled pattern
// is replaced by the interpolated replacement template.
//
// buffer      text to search
// matchCount  out-parameter (must not be NULL); receives the number of
//             replacements performed
//
// A disabled regex returns `buffer` unchanged with *matchCount == 0.
//
// After each match the search restarts with the remaining text as a fresh
// subject, so offsets in matchArray are relative to `s`.
string WrapRegex::replaceGlobal (
    const string& buffer,
    int *matchCount )
{
	*matchCount = 0;

	if ( disabled )
		return buffer;

	const char *s = buffer.c_str();
	// Track the end explicitly instead of calling strlen on every iteration;
	// this also keeps text after an embedded NUL byte.
	const char *const end = s + buffer.size();

	string output;
	output.reserve ( buffer.size() );

	while ( ( returnValue = pcre_exec (
	                            patternStructure,
	                            patternExtraStructure,
	                            s,
	                            static_cast<int> ( end - s ),
	                            0,
	                            0,
	                            matchArray,
	                            arrayLength ) ) >= 0 )
	{
		++ ( *matchCount );

		// Text preceding the match is copied verbatim, then the replacement.
		output.append ( s, matchArray[0] );
		output.append ( getInterpolatedString_ ( s, replace.c_str() ) );

		const bool emptyMatch = ( matchArray[1] == matchArray[0] );
		s += matchArray[1];

		if ( emptyMatch )
		{
			// A zero-length match consumes nothing; without a manual step the
			// same position would match again forever.
			if ( s >= end )
				break;

			// Step over one whole UTF-8 character (lead byte plus any
			// 10xxxxxx continuation bytes) and keep it in the output.
			const char *next = s + 1;
			while ( next < end &&
			        ( static_cast<unsigned char> ( *next ) & 0xC0 ) == 0x80 )
				++next;
			output.append ( s, next - s );
			s = next;
		}
	}
	// Whatever follows the last match is copied unchanged.
	output.append ( s, end - s );
	return output;
}
133 
matchPatternGlobal_(const char * buffer,size_t buflen,vector<ContextMatch> & matchVector,unsigned elementCount,int context)134 int WrapRegex::matchPatternGlobal_ (
135     const char *buffer,
136     size_t buflen,
137     vector<ContextMatch> &matchVector,
138     unsigned elementCount,
139     int context )
140 {
141 	if ( disabled )
142 		return 0;
143 
144 	const char *s, *origin;
145 	int matchcount;
146 	size_t offset;
147 	ContextMatch match;
148 
149 	s = origin = buffer;
150 	matchcount = 0;
151 	offset = 0;
152 
153 	while ( ( returnValue = pcre_exec (
154 	                            patternStructure,
155 	                            patternExtraStructure,
156 	                            s,
157 	                            buflen,
158 	                            offset,
159 	                            0,
160 	                            matchArray,
161 	                            arrayLength ) ) >= 0 )
162 	{
163 		++matchcount;
164 
165 		if ( context )
166 		{
167 			match = ContextHandler::getContext (
168 			            s + matchArray[0],
169 			            matchArray[1] - matchArray[0],
170 			            origin,
171 			            context );
172 		}
173 		else
174 		{
175 			match.prelog = match.postlog = "";
176 			match.match.assign ( s + matchArray[0], matchArray[1] - matchArray[0] );
177 		}
178 
179 		// record element and offset information
180 		match.elementCount = elementCount;
181 		match.offset = matchArray[0];
182 
183 		if ( replace != "" )
184 			match.replace = getInterpolatedString_ ( s, ( char * ) replace.c_str() );
185 
186 		matchVector.push_back ( match );
187 
188 		if ( ( offset = matchArray[1] ) >= buflen )
189 			break;
190 	}
191 	return matchcount;
192 }
193 
getInterpolatedString_(const char * buffer,const char * source)194 string WrapRegex::getInterpolatedString_ ( const char *buffer, const char *source )
195 {
196 	if ( disabled )
197 		return "";
198 
199 	const char *s = source;
200 
201 	string interpol_string;
202 
203 	int escapeState = false;
204 	for ( ; *s; ++s )
205 	{
206 		if ( *s == '\\' )
207 		{
208 			escapeState = ( escapeState ) ? false : true;
209 			if ( escapeState )
210 			{
211 				if ( isdigit ( * ( s + 1 ) ) )
212 				{
213 					const char *number, *it;
214 					number = s + 1;
215 					for ( it = number; *it && isdigit ( * ( it + 1 ) ); ++it )
216 						;
217 					size_t len = it - s;
218 					char *tmp = new char[len + 1];
219 					memcpy ( tmp, number, sizeof ( char ) * len );
220 					* ( tmp + len ) = '\0';
221 					int i = atoi ( tmp );
222 					delete[] tmp;
223 
224 					interpol_string += getSubpattern_ ( buffer, i );
225 
226 					s += len;
227 					escapeState = false;
228 				}
229 				else if ( * ( s + 1 ) == 't' )
230 				{
231 					interpol_string += '\t';
232 					++s;
233 					escapeState = false;
234 				}
235 				else if ( * ( s + 1 ) == 'n' )
236 				{
237 					interpol_string += '\n';
238 					++s;
239 					escapeState = false;
240 				}
241 				else
242 					interpol_string += *s;
243 			}
244 			else
245 				interpol_string += *s;
246 		}
247 		else
248 			interpol_string += *s;
249 	}
250 	return interpol_string;
251 }
252 
getSubpattern_(const char * s,unsigned subpattern)253 string WrapRegex::getSubpattern_ ( const char *s, unsigned subpattern )
254 {
255 	if ( disabled )
256 		return "";
257 
258 	const char *sub;
259 	int ret = pcre_get_substring ( s, matchArray, returnValue, subpattern, &sub );
260 	if ( ret == PCRE_ERROR_NOSUBSTRING || ret == PCRE_ERROR_NOMEMORY )
261 		return "";
262 	string subString ( sub );
263 	pcre_free_substring ( sub );
264 	return subString;
265 }
266