1 /*
2 * Copyright © 2012 Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 *
17 */
18
19 #include "stringutils.h"
20 #include <string>
21 #include <algorithm>
22
23 #ifdef HAVE_DEE_ICU
24 #include <dee-icu.h>
25 #endif
26
27 using namespace std;
28
29 namespace ZeitgeistFTS {
30
31 namespace StringUtils {
32
/**
 * Return the longest prefix of 's' that is at most 'nbytes' bytes long and
 * does not end in the middle of a UTF-8 encoded character.
 *
 * NOTE: It is assumed the input string is valid UTF-8. Untrusted text
 *       should be validated with g_utf8_validate().
 *
 * This function is useful for working with Xapian terms because Xapian has
 * a max term length of 245 (which is not very well documented, but see
 * http://xapian.org/docs/omega/termprefixes.html).
 *
 * @param s      UTF-8 encoded input string.
 * @param nbytes Maximum size of the returned string, in bytes.
 * @return A copy of 's' when it already fits, otherwise the truncated prefix.
 *         The result is never longer than 'nbytes' bytes.
 */
std::string Truncate (std::string const& s, unsigned int nbytes)
{
  // The end of the string is always a character boundary, nothing to cut.
  if (s.length () <= nbytes) return s;

  // From here on nbytes < s.length (), so s[cut] is always a valid index.
  // UTF-8 continuation bytes have the bit pattern 10xxxxxx. While the byte
  // right after the cut is one of those, the cut would split a character,
  // so step back until the cut sits in front of a lead (or ASCII) byte.
  std::string::size_type cut = nbytes;
  while (cut > 0 && (static_cast<unsigned char> (s[cut]) & 0xC0) == 0x80)
    {
      --cut;
    }

  return s.substr (0, cut);
}
61
/**
 * Converts a URI into an index- and query friendly string. The problem
 * is that Xapian doesn't handle CAPITAL letters or most non-alphanumeric
 * symbols in a boolean term when it does prefix matching. The mangled
 * URIs returned from this function are suitable for boolean prefix searches.
 *
 * ASCII upper case letters are folded to lower case and each of the
 * characters ':', ' ', '/', '-', '.' and '%' is replaced by '_'. All other
 * bytes are copied unchanged.
 *
 * IMPORTANT: This is a 1-way function! You can not convert back.
 *
 * @param orig The URI to mangle.
 * @return The mangled copy, always exactly as long as 'orig'.
 */
std::string MangleUri (std::string const& orig)
{
  // The input is supposed to be a URI, so plain ASCII folding is enough;
  // doing it by hand keeps the result independent of the current locale
  // and avoids a temporary C buffer that would have to be freed manually.
  std::string mangled (orig);

  for (std::string::iterator it = mangled.begin (); it != mangled.end (); ++it)
    {
      const char c = *it;
      if (c >= 'A' && c <= 'Z')
        {
          *it = static_cast<char> (c - 'A' + 'a');
        }
      else if (c == ':' || c == ' ' || c == '/' ||
               c == '-' || c == '.' || c == '%')
        {
          *it = '_';
        }
    }

  return mangled;
}
86
/**
 * This method expects a valid uri and tries to split it into authority,
 * path and query.
 *
 * Everything up to the first ':' (plus a directly following "//", if
 * present) is skipped. The authority then runs up to the first '/' or '?',
 * whichever comes first, the path from there up to the first '?', and the
 * query is everything after that '?'. A '/' that only appears after the '?'
 * is part of the query and does not start a path. A '#' fragment is not
 * treated specially.
 *
 * Note that any and all parts may be left untouched: when 'uri' contains
 * no ':' none of the output arguments is modified, and 'query' is only
 * assigned when the uri contains a '?'.
 *
 * @param uri       The uri to split.
 * @param authority Receives the authority part (may be empty).
 * @param path      Receives the path part, including its leading '/'
 *                  (may be empty).
 * @param query     Receives the text after the '?', without the '?' itself.
 */
void SplitUri (std::string const& uri, std::string &authority,
               std::string &path, std::string &query)
{
  const std::string::size_type colon_pos = uri.find (':');
  if (colon_pos == std::string::npos) return; // not an uri?

  const bool has_double_slash = uri.length () > colon_pos + 2 &&
                                uri.compare (colon_pos + 1, 2, "//") == 0;
  const std::string::size_type start_pos =
      has_double_slash ? colon_pos + 3 : colon_pos + 1;

  const std::string::size_type first_slash = uri.find ('/', start_pos);
  const std::string::size_type question_mark_pos = uri.find ('?', start_pos);

  // The authority stops at the first '/' or '?' or, failing both, at the end
  // of the uri. npos is the largest size_type value, so a delimiter that was
  // not found never wins the std::min.
  const std::string::size_type authority_end =
      std::min (std::min (first_slash, question_mark_pos), uri.length ());

  authority = uri.substr (start_pos, authority_end - start_pos);

  if (question_mark_pos != std::string::npos)
    {
      // Empty when the '?' directly follows the authority.
      path = uri.substr (authority_end, question_mark_pos - authority_end);
      query = uri.substr (question_mark_pos + 1);
    }
  else
    {
      path = uri.substr (authority_end);
    }
}
132
/**
 * Return a copy of 'input' in which every underscore has been replaced by
 * a space. All other characters are copied unchanged.
 */
std::string RemoveUnderscores (std::string const &input)
{
  std::string spaced;
  spaced.reserve (input.size ());

  for (std::string::const_iterator it = input.begin (); it != input.end (); ++it)
    {
      spaced.push_back (*it == '_' ? ' ' : *it);
    }

  return spaced;
}
140
/* True for the ASCII decimal digits '0' through '9' only. */
static bool is_digit (char c)
{
  return '0' <= c && c <= '9';
}

/**
 * Count how many characters of 'input' are ASCII decimal digits.
 */
size_t CountDigits (std::string const &input)
{
  size_t digits = 0;

  for (std::string::const_iterator it = input.begin (); it != input.end (); ++it)
    {
      if (is_digit (*it)) ++digits;
    }

  return digits;
}
147
148 static GRegex *camelcase_matcher = NULL;
149
150 static gboolean
matcher_cb(const GMatchInfo * match_info,GString * result,gpointer user_data)151 matcher_cb (const GMatchInfo *match_info, GString *result, gpointer user_data)
152 {
153 gint start_pos;
154 g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
155 if (start_pos != 0) g_string_append_c (result, ' ');
156 gchar *word = g_match_info_fetch (match_info, 0);
157 g_string_append (result, word);
158 g_free (word);
159
160 return FALSE;
161 }
162
UnCamelcase(string const & input)163 string UnCamelcase (string const &input)
164 {
165 if (camelcase_matcher == NULL)
166 {
167 camelcase_matcher = g_regex_new ("(?<=^|[[:lower:]])[[:upper:]]+[^[:upper:]]+", G_REGEX_OPTIMIZE, (GRegexMatchFlags) 0, NULL);
168 if (camelcase_matcher == NULL) g_critical ("Unable to create matcher!");
169 }
170
171 gchar *result = g_regex_replace_eval (camelcase_matcher, input.c_str (),
172 input.length (), 0,
173 (GRegexMatchFlags) 0,
174 matcher_cb, NULL, NULL);
175
176 string ret (result);
177 g_free (result);
178 return ret;
179 }
180
181 #ifdef HAVE_DEE_ICU
182 static DeeICUTermFilter *icu_filter = NULL;
183
184 /**
185 * Use ascii folding filter on the input text and return folded version
186 * of the original string.
187 *
188 * Note that if the folded version is exactly the same as the original
189 * empty string will be returned.
190 */
AsciiFold(string const & input)191 string AsciiFold (string const& input)
192 {
193 if (icu_filter == NULL)
194 {
195 icu_filter = dee_icu_term_filter_new_ascii_folder ();
196 if (icu_filter == NULL) return "";
197 }
198
199 // FIXME: check first if the input contains any non-ascii chars?
200
201 gchar *folded = dee_icu_term_filter_apply (icu_filter, input.c_str ());
202 string result (folded);
203 g_free (folded);
204
205 return result == input ? "" : result;
206 }
207 #else
/**
 * Variant used when HAVE_DEE_ICU is not defined: no folding is done and
 * an empty string is returned for every input.
 */
std::string AsciiFold (std::string const& input)
{
  (void) input; // intentionally unused in this variant
  return std::string ();
}
212 #endif
213
214 } /* namespace StringUtils */
215
216 } /* namespace ZeitgeistFTS */
217