1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #ifndef CSUTIL_HXX_
72 #define CSUTIL_HXX_
73
74 #include "hunvisapi.h"
75
76 // First some base level utility routines
77
78 #include <fstream>
79 #include <string>
80 #include <vector>
81 #include <string.h>
82 #include "w_char.hxx"
83 #include "htypes.hxx"
84
85 #ifdef MOZILLA_CLIENT
86 #include "nscore.h" // for mozalloc headers
87 #endif
88
89 // casing
90 #define NOCAP 0
91 #define INITCAP 1
92 #define ALLCAP 2
93 #define HUHCAP 3
94 #define HUHINITCAP 4
95
96 // default encoding and keystring
97 #define SPELL_ENCODING "ISO8859-1"
98 #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
99
100 // default morphological fields
101 #define MORPH_STEM "st:"
102 #define MORPH_ALLOMORPH "al:"
103 #define MORPH_POS "po:"
104 #define MORPH_DERI_PFX "dp:"
105 #define MORPH_INFL_PFX "ip:"
106 #define MORPH_TERM_PFX "tp:"
107 #define MORPH_DERI_SFX "ds:"
108 #define MORPH_INFL_SFX "is:"
109 #define MORPH_TERM_SFX "ts:"
110 #define MORPH_SURF_PFX "sp:"
111 #define MORPH_FREQ "fr:"
112 #define MORPH_PHON "ph:"
113 #define MORPH_HYPH "hy:"
114 #define MORPH_PART "pa:"
115 #define MORPH_FLAG "fl:"
116 #define MORPH_HENTRY "_H:"
117 #define MORPH_TAG_LEN strlen(MORPH_STEM)
118
119 #define MSEP_FLD ' '
120 #define MSEP_REC '\n'
121 #define MSEP_ALT '\v'
122
123 // default flags
124 #define DEFAULTFLAGS 65510
125 #define FORBIDDENWORD 65510
126 #define ONLYUPCASEFLAG 65511
127
128 // fix long pathname problem of WIN32 by using w_char std::fstream::open override
129 LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
130 std::ios_base::openmode mode);
131
132 // convert UTF-16 characters to UTF-8
133 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
134 const std::vector<w_char>& src);
135
136 // convert UTF-8 characters to UTF-16
137 LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
138 const std::string& src);
139
140 // remove end of line char(s)
141 LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
142
143 // duplicate string
144 LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
145
// parse str into tokens, continuing from start (no delimiter argument is taken)
147 LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
148 std::string::const_iterator& start);
149
// replace search by replace in str and return str
151 LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
152 const std::string& search,
153 const std::string& replace);
154
// append apd to the end of every line in str and return str
156 LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
157 const std::string& apd);
158
// tokenize text into lines, splitting at breakchar
160 LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
161 char breakchar);
162
// tokenize text into lines at breakchar and uniq them in place
164 LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
165
166 LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
167
168 // reverse word
169 LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
170
171 // reverse word
172 LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
173
174 // remove duplicates
175 LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
176
// Character encoding information: one record describes a single character.
// get_current_cs() returns a pointer to these records, and the mk*() case
// conversion helpers receive such a pointer as their csconv argument.
struct cs_info {
  // Case indicator for the character.
  // NOTE(review): exact encoding of this value is not visible here - confirm
  // against the tables that fill it in.
  unsigned char ccase;
  // Presumably the lowercase counterpart of the character - confirm.
  unsigned char clower;
  // Presumably the uppercase counterpart of the character - confirm.
  unsigned char cupper;
};
183
184 LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
185 LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
186 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
187 int langnum);
188 LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
189 LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
190 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
191 int langnum);
192 LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
193
194 LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
195
196 // get language identifiers of language codes
197 LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
198
199 // get characters of the given 8bit encoding with lower- and uppercase forms
200 LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
201
202 // convert std::string to all caps
203 LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
204 const struct cs_info* csconv);
205
// convert std::string to all lowercase
207 LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
208 const struct cs_info* csconv);
209
// convert first letter of string to lowercase
211 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
212 const struct cs_info* csconv);
213
214 // convert first letter of string to capital
215 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
216 const struct cs_info* csconv);
217
// convert first letter of UTF-16 (w_char) string to capital
219 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
220 mkinitcap_utf(std::vector<w_char>& u, int langnum);
221
// convert UTF-16 (w_char) string to lowercase
223 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
224 mkallsmall_utf(std::vector<w_char>& u, int langnum);
225
// convert first letter of UTF-16 (w_char) string to lowercase
227 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
228 mkinitsmall_utf(std::vector<w_char>& u, int langnum);
229
// convert UTF-16 (w_char) string to capital
231 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
232 mkallcap_utf(std::vector<w_char>& u, int langnum);
233
234 // get type of capitalization
235 LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
236
// get type of capitalization (word given as a w_char vector)
238 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
239
240 // strip all ignored characters in the string
241 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
242 std::string& word,
243 const std::vector<w_char>& ignored_chars);
244
245 // strip all ignored characters in the string
246 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
247 std::string& word,
248 const std::string& ignored_chars);
249
250 LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
251 std::string& out,
252 int ln);
253
254 LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
255 std::string& out,
256 std::vector<w_char>& out_utf16,
257 int utf8,
258 int ln);
259
260 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
261
262 LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
263 const std::string& morph,
264 const std::string& var);
265
266 // conversion function for protected memory
267 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
268
269 // conversion function for protected memory
270 LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
271
272
// Reports whether word is free of every character listed in ignored_chars.
//
// To avoid unnecessary string copies and Unicode conversions the check is
// done directly on the chars of both strings. For UTF-8 encoded strings a
// "false" result therefore only means "likely false" when ignored_chars
// contains non-ASCII characters.
//
// Returns true when no char of ignored_chars occurs in word (this includes
// the cases where either string is empty), false otherwise.
inline bool has_no_ignored_chars(const std::string& word,
                                 const std::string& ignored_chars) {
  return word.find_first_of(ignored_chars) == std::string::npos;
}
284
285 // hash entry macros
HENTRY_DATA(struct hentry * h)286 inline char* HENTRY_DATA(struct hentry* h) {
287 char* ret;
288 if (!(h->var & H_OPT))
289 ret = NULL;
290 else if (h->var & H_OPT_ALIASM)
291 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
292 else
293 ret = HENTRY_WORD(h) + h->blen + 1;
294 return ret;
295 }
296
HENTRY_DATA(const struct hentry * h)297 inline const char* HENTRY_DATA(
298 const struct hentry* h) {
299 const char* ret;
300 if (!(h->var & H_OPT))
301 ret = NULL;
302 else if (h->var & H_OPT_ALIASM)
303 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
304 else
305 ret = HENTRY_WORD(h) + h->blen + 1;
306 return ret;
307 }
308
309 // NULL-free version for warning-free OOo build
HENTRY_DATA2(const struct hentry * h)310 inline const char* HENTRY_DATA2(
311 const struct hentry* h) {
312 const char* ret;
313 if (!(h->var & H_OPT))
314 ret = "";
315 else if (h->var & H_OPT_ALIASM)
316 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
317 else
318 ret = HENTRY_WORD(h) + h->blen + 1;
319 return ret;
320 }
321
HENTRY_FIND(struct hentry * h,const char * p)322 inline char* HENTRY_FIND(struct hentry* h,
323 const char* p) {
324 return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
325 }
326
327 #endif
328