1 /*
2 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * MA 02110-1301, USA.
18 */
19
20 #include <tools/regexTools.h>
21 #include <tools/lock.h>
22
23 #include <unicode/regex.h>
24 #include <unicode/ucnv.h>
25
26 #include <memory>
27 #include <map>
28 #include <stdexcept>
29 #include <pthread.h>
30
// Cache of compiled ICU patterns, keyed by the regular expression text.
// buildMatcher() fills it the first time a given expression is requested.
// Declared without `static`, so the symbol has external linkage.
31 std::map<std::string, std::shared_ptr<icu::RegexPattern>> regexCache;
// Mutex that buildMatcher() holds while it adds an entry to regexCache.
32 static pthread_mutex_t regexLock = PTHREAD_MUTEX_INITIALIZER;
33
buildMatcher(const std::string & regex,icu::UnicodeString & content)34 std::unique_ptr<icu::RegexMatcher> buildMatcher(const std::string& regex, icu::UnicodeString& content)
35 {
36 std::shared_ptr<icu::RegexPattern> pattern;
37 /* Regex is in cache */
38 try {
39 pattern = regexCache.at(regex);
40 } catch (std::out_of_range&) {
41 // Redo the search with a lock to avoid race condition.
42 kiwix::Lock l(®exLock);
43 try {
44 pattern = regexCache.at(regex);
45 } catch (std::out_of_range&) {
46 UErrorCode status = U_ZERO_ERROR;
47 UParseError pe;
48 icu::UnicodeString uregex(regex.c_str());
49 pattern.reset(icu::RegexPattern::compile(uregex, UREGEX_CASE_INSENSITIVE, pe, status));
50 regexCache[regex] = pattern;
51 }
52 }
53 UErrorCode status = U_ZERO_ERROR;
54 return std::unique_ptr<icu::RegexMatcher>(pattern->matcher(content, status));
55 }
56
matchRegex(const std::string & content,const std::string & regex)57 bool matchRegex(const std::string& content, const std::string& regex)
58 {
59 ucnv_setDefaultName("UTF-8");
60 icu::UnicodeString ucontent(content.c_str());
61 auto matcher = buildMatcher(regex, ucontent);
62 return matcher->find();
63 }
64
replaceRegex(const std::string & content,const std::string & replacement,const std::string & regex)65 std::string replaceRegex(const std::string& content,
66 const std::string& replacement,
67 const std::string& regex)
68 {
69 ucnv_setDefaultName("UTF-8");
70 icu::UnicodeString ureplacement(replacement.c_str());
71 icu::UnicodeString ucontent(content.c_str());
72 auto matcher = buildMatcher(regex, ucontent);
73 UErrorCode status = U_ZERO_ERROR;
74 auto uresult = matcher->replaceAll(ureplacement, status);
75 std::string tmp;
76 uresult.toUTF8String(tmp);
77 return tmp;
78 }
79
appendToFirstOccurence(const std::string & content,const std::string & regex,const std::string & replacement)80 std::string appendToFirstOccurence(const std::string& content,
81 const std::string& regex,
82 const std::string& replacement)
83 {
84 ucnv_setDefaultName("UTF-8");
85 icu::UnicodeString ucontent(content.c_str());
86 icu::UnicodeString ureplacement(replacement.c_str());
87 auto matcher = buildMatcher(regex, ucontent);
88 if (matcher->find()) {
89 UErrorCode status = U_ZERO_ERROR;
90 ucontent.insert(matcher->end(status), ureplacement);
91 std::string tmp;
92 ucontent.toUTF8String(tmp);
93 return tmp;
94 }
95
96 return content;
97 }
98