1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37 /*
38 * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39 * And Contributors. All rights reserved.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 *
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 *
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 *
52 * 3. All modifications to the source code must be clearly marked as
53 * such. Binary redistributions based on modified source code
54 * must be clearly marked as modified versions in the documentation
55 * and/or other materials provided with the distribution.
56 *
57 * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
61 * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 #ifndef CSUTIL_HXX_
72 #define CSUTIL_HXX_
73
74 #include "hunvisapi.h"
75
76 // First some base level utility routines
77
78 #include <fstream>
79 #include <string>
80 #include <vector>
81 #include <string.h>
82 #include "w_char.hxx"
83 #include "htypes.hxx"
84
85 // casing
86 #define NOCAP 0
87 #define INITCAP 1
88 #define ALLCAP 2
89 #define HUHCAP 3
90 #define HUHINITCAP 4
91
92 // default encoding and keystring
93 #define SPELL_ENCODING "ISO8859-1"
94 #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
95
// default morphological fields
// Every field tag below is a two-letter code followed by ':'.
#define MORPH_STEM "st:"
#define MORPH_ALLOMORPH "al:"
#define MORPH_POS "po:"
#define MORPH_DERI_PFX "dp:"
#define MORPH_INFL_PFX "ip:"
#define MORPH_TERM_PFX "tp:"
#define MORPH_DERI_SFX "ds:"
#define MORPH_INFL_SFX "is:"
#define MORPH_TERM_SFX "ts:"
#define MORPH_SURF_PFX "sp:"
#define MORPH_FREQ "fr:"
#define MORPH_PHON "ph:"
#define MORPH_HYPH "hy:"
#define MORPH_PART "pa:"
#define MORPH_FLAG "fl:"
#define MORPH_HENTRY "_H:"
// Length of a field tag, not counting the terminating NUL.
// Computed with sizeof on the string literal so that it is a compile-time
// constant of type size_t (same value and type as strlen(MORPH_STEM), but
// without a run-time call and usable in constant expressions).
#define MORPH_TAG_LEN (sizeof(MORPH_STEM) - 1)
114
115 #define MSEP_FLD ' '
116 #define MSEP_REC '\n'
117 #define MSEP_ALT '\v'
118
119 // default flags
120 #define DEFAULTFLAGS 65510
121 #define FORBIDDENWORD 65510
122 #define ONLYUPCASEFLAG 65511
123
124 // fix long pathname problem of WIN32 by using w_char std::fstream::open override
125 LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
126 std::ios_base::openmode mode);
127
128 // convert UTF-16 characters to UTF-8
129 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
130 const std::vector<w_char>& src);
131
132 // convert UTF-8 characters to UTF-16
133 LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
134 const std::string& src);
135
136 // remove end of line char(s)
137 LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
138
139 // duplicate string
140 LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
141
142 // parse into tokens with char delimiter
143 LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
144 std::string::const_iterator& start);
145
// replace search by replace in str and return str
147 LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
148 const std::string& search,
149 const std::string& replace);
150
// append apd to the end of every line in str
152 LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
153 const std::string& apd);
154
// tokenize text into lines, splitting at breakchar
156 LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
157 char breakchar);
158
// tokenize text into lines at breakchar and make the lines unique in place
160 LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
161
162 LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
163
164 // reverse word
165 LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
166
// reverse word (UTF-8 encoded)
168 LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
169
170 // remove duplicates
171 LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
172
// character encoding information
// Case-conversion record for one character of an 8-bit encoding.
// NOTE(review): presumably used as a table indexed by character code (see
// get_current_cs() and the csconv parameters of mkallcap()/mkallsmall()) --
// confirm against the implementation file.
struct cs_info {
  unsigned char ccase;   // case flag; presumably non-zero for uppercase -- TODO confirm
  unsigned char clower;  // presumably the lowercase form of the character
  unsigned char cupper;  // presumably the uppercase form of the character
};
179
180 LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
181 LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
182 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
183 int langnum);
184 LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
185 LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
186 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
187 int langnum);
188 LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
189
190 LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
191
192 // get language identifiers of language codes
193 LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
194
195 // get characters of the given 8bit encoding with lower- and uppercase forms
196 LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
197
198 // convert std::string to all caps
199 LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
200 const struct cs_info* csconv);
201
// convert std::string to all lowercase
203 LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
204 const struct cs_info* csconv);
205
// convert first letter of string to lowercase
207 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
208 const struct cs_info* csconv);
209
210 // convert first letter of string to capital
211 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
212 const struct cs_info* csconv);
213
// convert first letter of UTF-16 (w_char) string to capital
215 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
216 mkinitcap_utf(std::vector<w_char>& u, int langnum);
217
// convert UTF-16 (w_char) string to lowercase
219 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
220 mkallsmall_utf(std::vector<w_char>& u, int langnum);
221
// convert first letter of UTF-16 (w_char) string to lowercase
223 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
224 mkinitsmall_utf(std::vector<w_char>& u, int langnum);
225
// convert UTF-16 (w_char) string to capital
227 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
228 mkallcap_utf(std::vector<w_char>& u, int langnum);
229
230 // get type of capitalization
231 LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
232
233 // get type of capitalization (UTF-8)
234 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
235
236 // strip all ignored characters in the string
237 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
238 std::string& word,
239 const std::vector<w_char>& ignored_chars);
240
241 // strip all ignored characters in the string
242 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
243 std::string& word,
244 const std::string& ignored_chars);
245
246 LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
247 std::string& out,
248 int ln);
249
250 LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
251 std::string& out,
252 std::vector<w_char>& out_utf16,
253 int utf8,
254 int ln);
255
256 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
257
258 LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
259 const std::string& morph,
260 const std::string& var);
261
262 // conversion function for protected memory
263 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
264
265 // conversion function for protected memory
266 LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
267
268
// Cheap pre-check that lets callers skip string copies and Unicode
// conversions: reports whether word is free of every character listed in
// ignored_chars.
//
// The comparison is done byte by byte. For UTF-8 encoded strings whose
// ignored characters are not ASCII, a "false" result therefore only means
// "likely false" (a byte of an ignored character occurs somewhere in word).
//
// Returns true when no byte of ignored_chars occurs in word (this includes
// the case of an empty ignored_chars or an empty word), false otherwise.
inline bool has_no_ignored_chars(const std::string& word,
                                 const std::string& ignored_chars) {
  return word.find_first_of(ignored_chars) == std::string::npos;
}
280
281 // hash entry macros
HENTRY_DATA(struct hentry * h)282 inline char* HENTRY_DATA(struct hentry* h) {
283 char* ret;
284 if (!(h->var & H_OPT))
285 ret = NULL;
286 else if (h->var & H_OPT_ALIASM)
287 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
288 else
289 ret = HENTRY_WORD(h) + h->blen + 1;
290 return ret;
291 }
292
HENTRY_DATA(const struct hentry * h)293 inline const char* HENTRY_DATA(
294 const struct hentry* h) {
295 const char* ret;
296 if (!(h->var & H_OPT))
297 ret = NULL;
298 else if (h->var & H_OPT_ALIASM)
299 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
300 else
301 ret = HENTRY_WORD(h) + h->blen + 1;
302 return ret;
303 }
304
305 // NULL-free version for warning-free OOo build
HENTRY_DATA2(const struct hentry * h)306 inline const char* HENTRY_DATA2(
307 const struct hentry* h) {
308 const char* ret;
309 if (!(h->var & H_OPT))
310 ret = "";
311 else if (h->var & H_OPT_ALIASM)
312 ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
313 else
314 ret = HENTRY_WORD(h) + h->blen + 1;
315 return ret;
316 }
317
HENTRY_FIND(struct hentry * h,const char * p)318 inline char* HENTRY_FIND(struct hentry* h,
319 const char* p) {
320 return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
321 }
322
323 #endif
324