1 /*
2  * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU  General Public License as published by
6  * the Free Software Foundation; either version 3 of the License, or
7  * any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17  * MA 02110-1301, USA.
18  */
19 
20 #include <tools/regexTools.h>
21 #include <tools/lock.h>
22 
23 #include <unicode/regex.h>
24 #include <unicode/ucnv.h>
25 
26 #include <memory>
27 #include <map>
28 #include <stdexcept>
29 #include <pthread.h>
30 
// Cache of compiled ICU patterns, keyed by the regex source string handed to
// buildMatcher(). Declared without `static`, so it has external linkage.
std::map<std::string, std::shared_ptr<icu::RegexPattern>> regexCache;
// Mutex taken by buildMatcher() while it inserts a newly compiled pattern
// into regexCache.
static pthread_mutex_t regexLock = PTHREAD_MUTEX_INITIALIZER;
33 
buildMatcher(const std::string & regex,icu::UnicodeString & content)34 std::unique_ptr<icu::RegexMatcher> buildMatcher(const std::string& regex, icu::UnicodeString& content)
35 {
36   std::shared_ptr<icu::RegexPattern> pattern;
37   /* Regex is in cache */
38   try {
39     pattern = regexCache.at(regex);
40   } catch (std::out_of_range&) {
41     // Redo the search with a lock to avoid race condition.
42     kiwix::Lock l(&regexLock);
43     try {
44       pattern = regexCache.at(regex);
45     } catch (std::out_of_range&) {
46       UErrorCode status = U_ZERO_ERROR;
47       UParseError pe;
48       icu::UnicodeString uregex(regex.c_str());
49       pattern.reset(icu::RegexPattern::compile(uregex, UREGEX_CASE_INSENSITIVE, pe, status));
50       regexCache[regex] = pattern;
51     }
52   }
53   UErrorCode status = U_ZERO_ERROR;
54   return std::unique_ptr<icu::RegexMatcher>(pattern->matcher(content, status));
55 }
56 
matchRegex(const std::string & content,const std::string & regex)57 bool matchRegex(const std::string& content, const std::string& regex)
58 {
59   ucnv_setDefaultName("UTF-8");
60   icu::UnicodeString ucontent(content.c_str());
61   auto matcher = buildMatcher(regex, ucontent);
62   return matcher->find();
63 }
64 
replaceRegex(const std::string & content,const std::string & replacement,const std::string & regex)65 std::string replaceRegex(const std::string& content,
66                          const std::string& replacement,
67                          const std::string& regex)
68 {
69   ucnv_setDefaultName("UTF-8");
70   icu::UnicodeString ureplacement(replacement.c_str());
71   icu::UnicodeString ucontent(content.c_str());
72   auto matcher = buildMatcher(regex, ucontent);
73   UErrorCode status = U_ZERO_ERROR;
74   auto uresult = matcher->replaceAll(ureplacement, status);
75   std::string tmp;
76   uresult.toUTF8String(tmp);
77   return tmp;
78 }
79 
appendToFirstOccurence(const std::string & content,const std::string & regex,const std::string & replacement)80 std::string appendToFirstOccurence(const std::string& content,
81                                    const std::string& regex,
82                                    const std::string& replacement)
83 {
84   ucnv_setDefaultName("UTF-8");
85   icu::UnicodeString ucontent(content.c_str());
86   icu::UnicodeString ureplacement(replacement.c_str());
87   auto matcher = buildMatcher(regex, ucontent);
88   if (matcher->find()) {
89     UErrorCode status = U_ZERO_ERROR;
90     ucontent.insert(matcher->end(status), ureplacement);
91     std::string tmp;
92     ucontent.toUTF8String(tmp);
93     return tmp;
94   }
95 
96   return content;
97 }
98