1 /*
2  * Copyright © 2012 Mikkel Kamstrup Erlandsen <mikkel.kamstrup@gmail.com>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  *
17  */
18 
19 #include "stringutils.h"
20 #include <string>
21 #include <algorithm>
22 
23 #ifdef HAVE_DEE_ICU
24 #include <dee-icu.h>
25 #endif
26 
27 using namespace std;
28 
29 namespace ZeitgeistFTS {
30 
31 namespace StringUtils {
32 
/**
 * Return the longest prefix of 's' that is at most 'nbytes' bytes long and
 * does not end in the middle of a multi-byte UTF-8 sequence.
 *
 * NOTE: It is assumed the input string is valid UTF-8. Untrusted text
 * should be validated with g_utf8_validate().
 *
 * This function is useful for working with Xapian terms because Xapian has
 * a max term length of 245 (which is not very well documented, but see
 * http://xapian.org/docs/omega/termprefixes.html).
 *
 * @param s       valid UTF-8 text
 * @param nbytes  maximum length of the result, in bytes
 * @return 's' itself if it already fits, otherwise the prefix of 's' that
 *         ends on the last character boundary at or before 'nbytes'
 */
string Truncate (string const& s, unsigned int nbytes)
{
  // Nothing to cut off: the whole string fits.
  if (s.length () <= nbytes) return s;

  // Bytes of the form 10xxxxxx are UTF-8 continuation bytes. If the byte
  // right after the cut is one of them, the cut would split a character,
  // so back up to that character's lead byte. For valid UTF-8 this takes
  // at most three steps, so there is no need to walk the whole string.
  string::size_type cut = nbytes;
  while (cut > 0 && (static_cast<unsigned char> (s[cut]) & 0xC0) == 0x80)
  {
    --cut;
  }

  return s.substr (0, cut);
}
61 
/**
 * Map one byte of a URI to its mangled form: ASCII upper-case letters are
 * lower-cased and each of the characters in ": /-.%" becomes '_'. Every
 * other byte is passed through unchanged.
 */
static char mangle_uri_char (char c)
{
  if (c >= 'A' && c <= 'Z') return static_cast<char> (c - 'A' + 'a');

  switch (c)
  {
    case ':':
    case ' ':
    case '/':
    case '-':
    case '.':
    case '%':
      return '_';
    default:
      return c;
  }
}

/**
 * Converts a URI into an index- and query friendly string. The problem
 * is that Xapian doesn't handle CAPITAL letters or most non-alphanumeric
 * symbols in a boolean term when it does prefix matching. The mangled
 * URIs returned from this function are suitable for boolean prefix searches.
 *
 * IMPORTANT: This is a 1-way function! You can not convert back.
 *
 * @param orig  the URI to mangle; only ASCII letters are case folded, any
 *              other bytes are copied as they are
 * @return a string of the same length as 'orig' with ASCII letters in
 *         lower case and every one of ": /-.%" replaced by '_'
 */
string MangleUri (string const& orig)
{
  // Work on a copy in one pass; no temporary C buffer that would need
  // freeing, and the result is not cut short at an embedded NUL byte.
  string mangled (orig);
  std::transform (mangled.begin (), mangled.end (), mangled.begin (),
                  mangle_uri_char);

  return mangled;
}
86 
/**
 * This method expects a valid uri and tries to split it into authority,
 * path and query.
 *
 * The authority starts right after "scheme://" (or right after "scheme:"
 * when there is no "//") and ends at the first '/' or '?', whichever comes
 * first. The path runs from there up to the first '?', and the query is
 * everything after that '?'. A '#' is not treated specially.
 *
 * Note that any and all parts may be left untouched: if 'uri' contains no
 * ':' none of the output arguments is modified, and 'query' is only
 * assigned when the uri contains a '?'.
 *
 * @param uri        the uri to split
 * @param authority  receives the authority part (may be empty)
 * @param path       receives the path, including its leading '/' (may be
 *                   empty)
 * @param query      receives the text after the '?', without the '?' itself
 */
void SplitUri (string const& uri, string &authority,
               string &path, string &query)
{
  size_t colon_pos = uri.find (':');
  if (colon_pos == string::npos) return; // not an uri?

  // compare() clamps the length to what is left of the string, so this is
  // only true when exactly "//" follows the colon.
  bool has_double_slash = uri.compare (colon_pos + 1, 2, "//") == 0;
  size_t start_pos = colon_pos + (has_double_slash ? 3 : 1);

  // The authority is terminated by the path or by the query, whichever
  // comes first. Searching for '/' alone would mistake a slash inside the
  // query ("http://host?next=/home") for the start of the path.
  size_t authority_end = uri.find_first_of ("/?", start_pos);
  if (authority_end == string::npos)
  {
    // Neither path nor query: the rest of the uri is the authority.
    authority = uri.substr (start_pos);
    path.clear ();
    return;
  }

  authority = uri.substr (start_pos, authority_end - start_pos);

  size_t question_mark_pos = uri.find ('?', authority_end);
  if (question_mark_pos == string::npos)
  {
    path = uri.substr (authority_end);
  }
  else
  {
    // Empty when the authority is directly followed by the '?'.
    path = uri.substr (authority_end, question_mark_pos - authority_end);
    query = uri.substr (question_mark_pos + 1);
  }
}
132 
/**
 * Return a copy of 'input' in which every '_' has been turned into a
 * space. All other characters are copied unchanged.
 */
string RemoveUnderscores (string const &input)
{
  string spaced;
  spaced.reserve (input.size ());

  for (string::const_iterator it = input.begin (); it != input.end (); ++it)
  {
    spaced.push_back (*it == '_' ? ' ' : *it);
  }

  return spaced;
}
140 
/** True for the ASCII decimal digits '0' through '9', false otherwise. */
static bool is_digit (char c)
{
  return '0' <= c && c <= '9';
}

/**
 * Count how many characters of 'input' are ASCII decimal digits.
 */
size_t CountDigits (string const &input)
{
  size_t digits = 0;

  for (string::const_iterator it = input.begin (); it != input.end (); ++it)
  {
    if (is_digit (*it)) ++digits;
  }

  return digits;
}
147 
// Compiled regular expression used by UnCamelcase(). It starts out as NULL
// and is created there on first use, then reused by later calls.
static GRegex *camelcase_matcher = NULL;
149 
150 static gboolean
matcher_cb(const GMatchInfo * match_info,GString * result,gpointer user_data)151 matcher_cb (const GMatchInfo *match_info, GString *result, gpointer user_data)
152 {
153   gint start_pos;
154   g_match_info_fetch_pos (match_info, 0, &start_pos, NULL);
155   if (start_pos != 0) g_string_append_c (result, ' ');
156   gchar *word = g_match_info_fetch (match_info, 0);
157   g_string_append (result, word);
158   g_free (word);
159 
160   return FALSE;
161 }
162 
UnCamelcase(string const & input)163 string UnCamelcase (string const &input)
164 {
165   if (camelcase_matcher == NULL)
166   {
167     camelcase_matcher = g_regex_new ("(?<=^|[[:lower:]])[[:upper:]]+[^[:upper:]]+", G_REGEX_OPTIMIZE, (GRegexMatchFlags) 0, NULL);
168     if (camelcase_matcher == NULL) g_critical ("Unable to create matcher!");
169   }
170 
171   gchar *result = g_regex_replace_eval (camelcase_matcher, input.c_str (),
172                                         input.length (), 0,
173                                         (GRegexMatchFlags) 0,
174                                         matcher_cb, NULL, NULL);
175 
176   string ret (result);
177   g_free (result);
178   return ret;
179 }
180 
181 #ifdef HAVE_DEE_ICU
// Term filter used by AsciiFold(). It starts out as NULL and is created
// there on first use with dee_icu_term_filter_new_ascii_folder(), then
// reused by later calls.
static DeeICUTermFilter *icu_filter = NULL;
183 
184 /**
185  * Use ascii folding filter on the input text and return folded version
186  * of the original string.
187  *
188  * Note that if the folded version is exactly the same as the original
189  * empty string will be returned.
190  */
AsciiFold(string const & input)191 string AsciiFold (string const& input)
192 {
193   if (icu_filter == NULL)
194   {
195     icu_filter = dee_icu_term_filter_new_ascii_folder ();
196     if (icu_filter == NULL) return "";
197   }
198 
199   // FIXME: check first if the input contains any non-ascii chars?
200 
201   gchar *folded = dee_icu_term_filter_apply (icu_filter, input.c_str ());
202   string result (folded);
203   g_free (folded);
204 
205   return result == input ? "" : result;
206 }
207 #else
/**
 * Variant used when HAVE_DEE_ICU is not defined: no folding is done and
 * the result is always the empty string, whatever 'input' is.
 */
string AsciiFold (string const& input)
{
  return string ();
}
212 #endif
213 
214 } /* namespace StringUtils */
215 
216 } /* namespace ZeitgeistFTS */
217