1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #ifndef CSUTIL_HXX_
72 #define CSUTIL_HXX_
73 
74 #include "hunvisapi.h"
75 
76 // First some base level utility routines
77 
78 #include <fstream>
79 #include <string>
80 #include <vector>
81 #include <string.h>
82 #include "w_char.hxx"
83 #include "htypes.hxx"
84 
85 // casing
86 #define NOCAP 0
87 #define INITCAP 1
88 #define ALLCAP 2
89 #define HUHCAP 3
90 #define HUHINITCAP 4
91 
92 // default encoding and keystring
93 #define SPELL_ENCODING "ISO8859-1"
94 #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
95 
96 // default morphological fields
97 #define MORPH_STEM "st:"
98 #define MORPH_ALLOMORPH "al:"
99 #define MORPH_POS "po:"
100 #define MORPH_DERI_PFX "dp:"
101 #define MORPH_INFL_PFX "ip:"
102 #define MORPH_TERM_PFX "tp:"
103 #define MORPH_DERI_SFX "ds:"
104 #define MORPH_INFL_SFX "is:"
105 #define MORPH_TERM_SFX "ts:"
106 #define MORPH_SURF_PFX "sp:"
107 #define MORPH_FREQ "fr:"
108 #define MORPH_PHON "ph:"
109 #define MORPH_HYPH "hy:"
110 #define MORPH_PART "pa:"
111 #define MORPH_FLAG "fl:"
112 #define MORPH_HENTRY "_H:"
113 #define MORPH_TAG_LEN strlen(MORPH_STEM)
114 
115 #define MSEP_FLD ' '
116 #define MSEP_REC '\n'
117 #define MSEP_ALT '\v'
118 
119 // default flags
120 #define DEFAULTFLAGS 65510
121 #define FORBIDDENWORD 65510
122 #define ONLYUPCASEFLAG 65511
123 
124 // fix long pathname problem of WIN32 by using w_char std::fstream::open override
125 LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
126                                      std::ios_base::openmode mode);
127 
128 // convert UTF-16 characters to UTF-8
129 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
130                                              const std::vector<w_char>& src);
131 
132 // convert UTF-8 characters to UTF-16
133 LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
134                                     const std::string& src);
135 
136 // remove end of line char(s)
137 LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
138 
139 // duplicate string
140 LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
141 
142 // parse into tokens with char delimiter
143 LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
144                                                               std::string::const_iterator& start);
145 
146 // replace search by replace in str and return str
147 LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
148                                                const std::string& search,
149                                                const std::string& replace);
150 
151 // append apd to the end of every line in str
152 LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
153                                                  const std::string& apd);
154 
155 // tokenize text into lines separated by breakchar
156 LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
157                                                            char breakchar);
158 
159 // tokenize text into lines separated by breakchar and uniq in place
160 LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
161 
162 LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
163 
164 // reverse word
165 LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
166 
167 // reverse UTF-8 encoded word
168 LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
169 
170 // remove duplicates
171 LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
172 
// character encoding information
// One record of case data for a single character; get_current_cs() returns
// a pointer to records of this type for a named encoding.
// NOTE(review): the field meanings below are inferred from the field names
// only - confirm against the conversion tables in the implementation file.
struct cs_info {
  unsigned char ccase;   // presumably a case flag for the character
  unsigned char clower;  // presumably the lowercase form of the character
  unsigned char cupper;  // presumably the uppercase form of the character
};
179 
180 LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
181 LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
182 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
183                                                        int langnum);
184 LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
185 LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
186 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
187                                                        int langnum);
188 LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
189 
190 LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
191 
192 // get language identifiers of language codes
193 LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
194 
195 // get characters of the given 8bit encoding with lower- and uppercase forms
196 LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
197 
198 // convert std::string to all caps
199 LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
200                                                const struct cs_info* csconv);
201 
202 // convert std::string to all lowercase
203 LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
204                                                  const struct cs_info* csconv);
205 
206 // convert first letter of string to lowercase
207 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
208                                                  const struct cs_info* csconv);
209 
210 // convert first letter of string to capital
211 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
212                                                 const struct cs_info* csconv);
213 
214 // convert first letter of UTF-16 string to capital
215 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
216 mkinitcap_utf(std::vector<w_char>& u, int langnum);
217 
218 // convert UTF-16 string to lowercase
219 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
220 mkallsmall_utf(std::vector<w_char>& u, int langnum);
221 
222 // convert first letter of UTF-16 string to lowercase
223 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
224 mkinitsmall_utf(std::vector<w_char>& u, int langnum);
225 
226 // convert UTF-16 string to all caps
227 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
228 mkallcap_utf(std::vector<w_char>& u, int langnum);
229 
230 // get type of capitalization
231 LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
232 
233 // get type of capitalization (UTF-8)
234 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
235 
236 // strip all ignored characters in the string
237 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
238     std::string& word,
239     const std::vector<w_char>& ignored_chars);
240 
241 // strip all ignored characters in the string
242 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
243     std::string& word,
244     const std::string& ignored_chars);
245 
246 LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
247                                            std::string& out,
248                                            int ln);
249 
250 LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
251                                           std::string& out,
252                                           std::vector<w_char>& out_utf16,
253                                           int utf8,
254                                           int ln);
255 
256 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
257 
258 LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
259                                          const std::string& morph,
260                                          const std::string& var);
261 
262 // conversion function for protected memory
263 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
264 
265 // conversion function for protected memory
266 LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
267 
268 
// Cheap pre-check used to avoid unnecessary string copies and Unicode
// conversions: the chars of ignored_chars are simply looked up in word.
//
// Returns true when no char of ignored_chars occurs anywhere in word (this
// includes an empty ignored_chars or an empty word), false as soon as one
// of them is found.
//
// The comparison is done on raw chars, so for UTF-8 encoded strings whose
// ignored_chars are not ASCII, "false" only means "likely false": a single
// byte of a multi-byte sequence can match without the whole character
// being present.
inline bool has_no_ignored_chars(const std::string& word,
                                 const std::string& ignored_chars) {
  return word.find_first_of(ignored_chars) == std::string::npos;
}
280 
281 // hash entry macros
HENTRY_DATA(struct hentry * h)282 inline char* HENTRY_DATA(struct hentry* h) {
283   char* ret;
284   if (!(h->var & H_OPT))
285     ret = NULL;
286   else if (h->var & H_OPT_ALIASM)
287     ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
288   else
289     ret = HENTRY_WORD(h) + h->blen + 1;
290   return ret;
291 }
292 
HENTRY_DATA(const struct hentry * h)293 inline const char* HENTRY_DATA(
294     const struct hentry* h) {
295   const char* ret;
296   if (!(h->var & H_OPT))
297     ret = NULL;
298   else if (h->var & H_OPT_ALIASM)
299     ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
300   else
301     ret = HENTRY_WORD(h) + h->blen + 1;
302   return ret;
303 }
304 
305 // NULL-free version for warning-free OOo build
HENTRY_DATA2(const struct hentry * h)306 inline const char* HENTRY_DATA2(
307     const struct hentry* h) {
308   const char* ret;
309   if (!(h->var & H_OPT))
310     ret = "";
311   else if (h->var & H_OPT_ALIASM)
312     ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
313   else
314     ret = HENTRY_WORD(h) + h->blen + 1;
315   return ret;
316 }
317 
HENTRY_FIND(struct hentry * h,const char * p)318 inline char* HENTRY_FIND(struct hentry* h,
319                                                   const char* p) {
320   return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
321 }
322 
323 #endif
324