1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 /*
38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
39  * And Contributors.  All rights reserved.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  *
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  *
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  *
52  * 3. All modifications to the source code must be clearly marked as
53  *    such.  Binary redistributions based on modified source code
54  *    must be clearly marked as modified versions in the documentation
55  *    and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68  * SUCH DAMAGE.
69  */
70 
71 #ifndef CSUTIL_HXX_
72 #define CSUTIL_HXX_
73 
74 #include "hunvisapi.h"
75 
76 // First some base level utility routines
77 
78 #include <fstream>
79 #include <string>
80 #include <vector>
81 #include <string.h>
82 #include "w_char.hxx"
83 #include "htypes.hxx"
84 
85 #ifdef MOZILLA_CLIENT
86 #include "nscore.h"  // for mozalloc headers
87 #endif
88 
89 // casing
90 #define NOCAP 0
91 #define INITCAP 1
92 #define ALLCAP 2
93 #define HUHCAP 3
94 #define HUHINITCAP 4
95 
96 // default encoding and keystring
97 #define SPELL_ENCODING "ISO8859-1"
98 #define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
99 
100 // default morphological fields
101 #define MORPH_STEM "st:"
102 #define MORPH_ALLOMORPH "al:"
103 #define MORPH_POS "po:"
104 #define MORPH_DERI_PFX "dp:"
105 #define MORPH_INFL_PFX "ip:"
106 #define MORPH_TERM_PFX "tp:"
107 #define MORPH_DERI_SFX "ds:"
108 #define MORPH_INFL_SFX "is:"
109 #define MORPH_TERM_SFX "ts:"
110 #define MORPH_SURF_PFX "sp:"
111 #define MORPH_FREQ "fr:"
112 #define MORPH_PHON "ph:"
113 #define MORPH_HYPH "hy:"
114 #define MORPH_PART "pa:"
115 #define MORPH_FLAG "fl:"
116 #define MORPH_HENTRY "_H:"
117 #define MORPH_TAG_LEN strlen(MORPH_STEM)
118 
119 #define MSEP_FLD ' '
120 #define MSEP_REC '\n'
121 #define MSEP_ALT '\v'
122 
123 // default flags
124 #define DEFAULTFLAGS 65510
125 #define FORBIDDENWORD 65510
126 #define ONLYUPCASEFLAG 65511
127 
128 // fix long pathname problem of WIN32 by using w_char std::fstream::open override
129 LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
130                                      std::ios_base::openmode mode);
131 
132 // convert UTF-16 characters to UTF-8
133 LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
134                                              const std::vector<w_char>& src);
135 
136 // convert UTF-8 characters to UTF-16
137 LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
138                                     const std::string& src);
139 
140 // remove end of line char(s)
141 LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
142 
143 // duplicate string
144 LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
145 
146 // parse into tokens with char delimiter
147 LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
148                                                               std::string::const_iterator& start);
149 
// replace search by replace in str and return str
151 LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
152                                                const std::string& search,
153                                                const std::string& replace);
154 
// append apd to the end of every line in str
156 LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
157                                                  const std::string& apd);
158 
// tokenize text into lines at breakchar
160 LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
161                                                            char breakchar);
162 
// tokenize text into lines at breakchar and uniq in place
164 LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
165 
166 LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
167 
168 // reverse word
169 LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
170 
// reverse word (UTF-8 encoded)
172 LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
173 
174 // remove duplicates
175 LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
176 
// character encoding information
// NOTE(review): the field meanings below are inferred from the field names
// only -- confirm against the code that fills and reads these entries.
struct cs_info {
  unsigned char ccase;   // presumably a flag marking an uppercase character
  unsigned char clower;  // presumably the lowercase form of the character
  unsigned char cupper;  // presumably the uppercase form of the character
};
183 
184 LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
185 LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
186 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
187                                                        int langnum);
188 LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
189 LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
190 LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
191                                                        int langnum);
192 LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
193 
194 LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
195 
196 // get language identifiers of language codes
197 LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
198 
199 // get characters of the given 8bit encoding with lower- and uppercase forms
200 LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
201 
202 // convert std::string to all caps
203 LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
204                                                const struct cs_info* csconv);
205 
// convert std::string to all lowercase
207 LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
208                                                  const struct cs_info* csconv);
209 
// convert first letter of std::string to lowercase
211 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
212                                                  const struct cs_info* csconv);
213 
214 // convert first letter of string to capital
215 LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
216                                                 const struct cs_info* csconv);
217 
// convert first letter of UTF-16 (w_char) string to capital
219 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
220 mkinitcap_utf(std::vector<w_char>& u, int langnum);
221 
// convert UTF-16 (w_char) string to lowercase
223 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
224 mkallsmall_utf(std::vector<w_char>& u, int langnum);
225 
// convert first letter of UTF-16 (w_char) string to lowercase
227 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
228 mkinitsmall_utf(std::vector<w_char>& u, int langnum);
229 
// convert UTF-16 (w_char) string to capital
231 LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
232 mkallcap_utf(std::vector<w_char>& u, int langnum);
233 
234 // get type of capitalization
235 LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
236 
237 // get type of capitalization (UTF-8)
238 LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
239 
240 // strip all ignored characters in the string
241 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
242     std::string& word,
243     const std::vector<w_char>& ignored_chars);
244 
245 // strip all ignored characters in the string
246 LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
247     std::string& word,
248     const std::string& ignored_chars);
249 
250 LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
251                                            std::string& out,
252                                            int ln);
253 
254 LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
255                                           std::string& out,
256                                           std::vector<w_char>& out_utf16,
257                                           int utf8,
258                                           int ln);
259 
260 LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
261 
262 LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
263                                          const std::string& morph,
264                                          const std::string& var);
265 
266 // conversion function for protected memory
267 LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
268 
269 // conversion function for protected memory
270 LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
271 
272 
// Cheap pre-check that lets callers avoid unnecessary string copies and
// Unicode conversions: looks for any byte of ignored_chars inside word.
//
// Returns true when word contains none of the bytes of ignored_chars
// (always true when either string is empty), false otherwise.
// The comparison is byte-wise, so for UTF-8 encoded strings "false" only
// means "likely false" if the ignored_chars characters are not ASCII.
inline bool has_no_ignored_chars(const std::string& word,
                                 const std::string& ignored_chars) {
  return word.find_first_of(ignored_chars) == std::string::npos;
}
284 
285 // hash entry macros
HENTRY_DATA(struct hentry * h)286 inline char* HENTRY_DATA(struct hentry* h) {
287   char* ret;
288   if (!(h->var & H_OPT))
289     ret = NULL;
290   else if (h->var & H_OPT_ALIASM)
291     ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
292   else
293     ret = HENTRY_WORD(h) + h->blen + 1;
294   return ret;
295 }
296 
HENTRY_DATA(const struct hentry * h)297 inline const char* HENTRY_DATA(
298     const struct hentry* h) {
299   const char* ret;
300   if (!(h->var & H_OPT))
301     ret = NULL;
302   else if (h->var & H_OPT_ALIASM)
303     ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
304   else
305     ret = HENTRY_WORD(h) + h->blen + 1;
306   return ret;
307 }
308 
309 // NULL-free version for warning-free OOo build
HENTRY_DATA2(const struct hentry * h)310 inline const char* HENTRY_DATA2(
311     const struct hentry* h) {
312   const char* ret;
313   if (!(h->var & H_OPT))
314     ret = "";
315   else if (h->var & H_OPT_ALIASM)
316     ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
317   else
318     ret = HENTRY_WORD(h) + h->blen + 1;
319   return ret;
320 }
321 
HENTRY_FIND(struct hentry * h,const char * p)322 inline char* HENTRY_FIND(struct hentry* h,
323                                                   const char* p) {
324   return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
325 }
326 
327 #endif
328