1 /* AbiSource Application Framework
2 * Copyright (C) 1998,1999 AbiSource, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA.
18 */
19
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "ut_assert.h"
24 #include "ut_debugmsg.h"
25 #include "ut_string.h"
26 #include "ut_growbuf.h"
27 #include "ut_hash.h"
28 #include "ut_vector.h"
29 #include "ut_string_class.h"
30
31 #include "xap_Dictionary.h"
32
33 /*****************************************************************/
34 /*****************************************************************/
35
36 /*
37 Dictionary is an alphabetic list of words, one per line.
38 Import/export as a plain text file formatted in UTF-8.
39 We allow either LF or CR or CRLF line termination.
40
41 (This code shamelessly cribbed from the impexp for UTF-8.)
42 */
43
44 /*****************************************************************/
45 /*****************************************************************/
46
XAP_Dictionary(const char * szFilename)47 XAP_Dictionary::XAP_Dictionary(const char * szFilename)
48 : m_hashWords(29)
49 {
50 UT_ASSERT(szFilename && *szFilename);
51 m_szFilename = g_strdup(szFilename);
52
53 m_fp = 0;
54 m_bDirty = false;
55 }
56
~XAP_Dictionary()57 XAP_Dictionary::~XAP_Dictionary()
58 {
59 if (m_fp)
60 _closeFile();
61
62 FREEP(m_szFilename);
63
64 //UT_HASH_PURGEDATA(UT_UCSChar *, (&m_hashWords), g_free);
65 m_hashWords.freeData();
66 #if 0
67 UT_StringPtrMap::UT_Cursor _hc1(&m_hashWords);
68 for (UT_UCSChar * _hval1 = const_cast<UT_UCSChar *>(reinterpret_cast<const UT_UCSChar *>(_hc1.first())); _hc1.is_valid(); _hval1 = const_cast<UT_UCSChar *>(reinterpret_cast<const UT_UCSChar *>(_hc1.next())) )
69 {
70 if (_hval1)
71 g_free (_hval1);
72 }
73 #endif
74 }
75
getShortName(void) const76 const char * XAP_Dictionary::getShortName(void) const
77 {
78 // TODO: get just the filename (no path), for use in UI
79 return NULL;
80 }
81
82 /*****************************************************************/
83 /*****************************************************************/
84
_openFile(const char * mode)85 bool XAP_Dictionary::_openFile(const char * mode)
86 {
87 UT_ASSERT(!m_fp);
88
89 // TODO add code to make a backup of the original file, if it exists.
90
91 m_fp = fopen(m_szFilename,mode);
92 return (m_fp != 0);
93 }
94
_writeBytes(const UT_Byte * pBytes,UT_uint32 length)95 UT_uint32 XAP_Dictionary::_writeBytes(const UT_Byte * pBytes, UT_uint32 length)
96 {
97 UT_ASSERT(m_fp);
98 UT_ASSERT(pBytes);
99 UT_ASSERT(length);
100
101 return fwrite(pBytes,sizeof(UT_Byte),length,m_fp);
102 }
103
_writeBytes(const UT_Byte * sz)104 bool XAP_Dictionary::_writeBytes(const UT_Byte * sz)
105 {
106 UT_ASSERT(m_fp);
107 UT_ASSERT(sz);
108 int length = strlen(reinterpret_cast<const char *>(sz));
109 UT_ASSERT(length);
110
111 return (_writeBytes(sz,length)==static_cast<UT_uint32>(length));
112 }
113
_closeFile(void)114 bool XAP_Dictionary::_closeFile(void)
115 {
116 if (m_fp)
117 fclose(m_fp);
118 m_fp = 0;
119 return true;
120 }
121
_abortFile(void)122 void XAP_Dictionary::_abortFile(void)
123 {
124 // abort the write.
125 // TODO close the file and do any restore and/or cleanup.
126
127 _closeFile();
128 return;
129 }
130
131 //////////////////////////////////////////////////////////////////
132 //////////////////////////////////////////////////////////////////
133
load(void)134 bool XAP_Dictionary::load(void)
135 {
136 UT_ASSERT(m_hashWords.size() == 0);
137
138 if (!_openFile("r"))
139 return false;
140
141 if (!_parseUTF8())
142 _abortFile();
143 else
144 _closeFile();
145
146 m_bDirty = false;
147 //
148 // Hardwire in some words that should be in the English Language :-)
149 //
150 addWord("AbiWord");
151 addWord("AbiSource");
152 return true;
153 }
154
155 #define X_ReturnIfFail(exp) do { bool b = (exp); if (!b) return (false); } while (0)
156
_parseUTF8(void)157 bool XAP_Dictionary::_parseUTF8(void)
158 {
159 UT_GrowBuf gbBlock(1024);
160 bool bEatLF = false;
161 gchar buf[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
162 gint len;
163
164 while (fread(buf, 1, sizeof(gchar), m_fp) > 0)
165 {
166 switch (buf[0])
167 {
168 case '\r':
169 case '\n':
170 if ((buf[0] == '\n') && bEatLF)
171 {
172 bEatLF = false;
173 break;
174 }
175
176 if (buf[0] == '\r')
177 {
178 bEatLF = true;
179 }
180
181 // we interprete either CRLF, CR, or LF as a word delimiter.
182
183 if (gbBlock.getLength() > 0)
184 {
185 X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength()));
186 gbBlock.truncate(0);
187 }
188 break;
189
190 default:
191 bEatLF = false;
192
193 len = g_utf8_next_char(buf) - buf;
194 if (len > 1) {
195 fread (buf + 1, len - 1, sizeof (gchar), m_fp);
196 }
197 UT_UCSChar uc = g_utf8_get_char(buf);
198 X_ReturnIfFail(gbBlock.ins(gbBlock.getLength(),reinterpret_cast<UT_GrowBufElement*>(&uc),1));
199 break;
200 }
201 }
202
203 if (gbBlock.getLength() > 0)
204 {
205 X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength()));
206 }
207
208 return true;
209 }
210
211 //////////////////////////////////////////////////////////////////
212 //////////////////////////////////////////////////////////////////
213
save(void)214 bool XAP_Dictionary::save(void)
215 {
216 if (!m_bDirty)
217 return true;
218
219 if (!_openFile("w"))
220 return false;
221
222 UT_GenericVector<UT_UCSChar *> * pVec = m_hashWords.enumerate();
223 UT_ASSERT(pVec);
224
225 UT_uint32 size = pVec->size();
226
227 for (UT_uint32 i = 0; i < size; i++)
228 {
229 UT_UCSChar * pWord = pVec->getNthItem(i);
230 _outputUTF8(pWord, UT_UCS4_strlen(pWord));
231 _writeBytes(reinterpret_cast<const UT_Byte *>("\n"));
232 }
233
234 _closeFile();
235
236 delete pVec;
237 m_bDirty = false;
238
239 return true;
240 }
241
_outputUTF8(const UT_UCSChar * data,UT_uint32 length)242 void XAP_Dictionary::_outputUTF8(const UT_UCSChar * data, UT_uint32 length)
243 {
244 UT_String buf;
245 const UT_UCSChar * pData;
246
247 for (pData = data; (pData<data+length); /**/)
248 {
249 if (*pData > 0x007f)
250 {
251 gchar outbuf[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
252 g_unichar_to_utf8(*pData++, outbuf);
253 buf += outbuf;
254 }
255 else
256 {
257 buf += (char)*pData++;
258 }
259 }
260
261 _writeBytes(reinterpret_cast<const UT_Byte *>(buf.c_str()),buf.size());
262 }
263
264 //////////////////////////////////////////////////////////////////
265 //////////////////////////////////////////////////////////////////
266
addWord(const UT_UCSChar * pWord,UT_uint32 len)267 bool XAP_Dictionary::addWord(const UT_UCSChar * pWord, UT_uint32 len)
268 {
269 char * key = static_cast<char *>(UT_calloc(len+1, sizeof(char)));
270 UT_UCSChar * copy = static_cast<UT_UCSChar *>(UT_calloc(len+1, sizeof(UT_UCSChar)));
271 if (!key || !copy)
272 {
273 UT_DEBUGMSG(("mem failure adding word to dictionary\n"));
274 FREEP(key);
275 FREEP(copy);
276 return false;
277 }
278 UT_uint32 i = 0;
279 for (i = 0; i < len; i++)
280 {
281 UT_UCSChar currentChar;
282 currentChar = pWord[i];
283 // map smart quote apostrophe to ASCII right single quote
284 if (currentChar == UCS_RQUOTE) currentChar = '\'';
285 key[i] = static_cast<char>(static_cast<unsigned char>(pWord[i]));
286 copy[i] = currentChar;
287 xxx_UT_DEBUGMSG(("addWord: key[%d] = %c %d \n",i,key[i],key[i]));
288 if(key[i] == 0)
289 {
290 break;
291 }
292 }
293 key[i] = 0;
294 //
295 // Get exactly the same length.
296 //
297 char * key2 = g_strdup(key);
298 copy[i] = 0;
299 #if 0
300 //
301 // Useful debugging code
302 //
303 char * ucs_dup = static_cast<char *>(UT_calloc(2*len+1, sizeof(char)));
304 UT_UCS4_strcpy_to_char( ucs_dup, copy);
305 UT_DEBUGMSG(("Inserting word %s with key %s into hash \n",ucs_dup,key));
306 FREEP(ucs_dup);
307
308 #endif
309 if(!m_hashWords.insert(key2,copy))
310 FREEP(copy);
311
312 FREEP(key);
313 FREEP(key2);
314
315 // TODO: is this right?
316 m_bDirty = true;
317 return true;
318 }
319
addWord(const char * word)320 bool XAP_Dictionary::addWord(const char * word)
321 {
322 UT_sint32 len = strlen(word);
323 if(len <=0)
324 {
325 return false;
326 }
327 UT_UCSChar * ucs_dup = static_cast<UT_UCSChar *>(UT_calloc(len+1, sizeof(UT_UCSChar)));
328 UT_UCS4_strcpy_char(ucs_dup, word);
329 addWord(ucs_dup,len);
330 FREEP(ucs_dup);
331 return true;
332 }
333
334
335 /*!
336 * Returns true if the word given is found in the users custom dictionary.
337 \param const UT_UCSChar * pWord the word to look for suggestion for
338 \param UT_uint32 len the length of the word
339 \returns UT_Vector * pVecSuggestions this a vector of suggestions.
340 The returner is responsible for deleting these words.
341 */
suggestWord(UT_GenericVector<UT_UCSChar * > * pVecSuggestions,const UT_UCSChar * pWord,UT_uint32 len)342 void XAP_Dictionary::suggestWord(UT_GenericVector<UT_UCSChar *> * pVecSuggestions, const UT_UCSChar * pWord, UT_uint32 len)
343 {
344 //
345 // Get the words in the local dictionary
346 //
347 UT_GenericVector<UT_UCSChar *> * pVec = m_hashWords.enumerate();
348 UT_ASSERT(pVec);
349 UT_uint32 i=0;
350 UT_uint32 count = pVec->getItemCount();
351 //
352 // Turn our word into a NULL teminated string
353 //
354 UT_UCSChar * pszWord = static_cast<UT_UCSChar*>(UT_calloc(len+1, sizeof(UT_UCSChar)));
355 for(i=0; i< len; i++)
356 {
357 pszWord[i] = pWord[i];
358 }
359 pszWord[len] = 0;
360 //
361 // Loop over all the words in our custom doctionary and add them to the
362 //the suggestions if they're possibilities.
363 //
364 for(i=0; i< count; i++)
365 {
366 UT_UCSChar * pszDict = pVec->getNthItem(i);
367 UT_UCSChar * pszReturn = NULL;
368 float lenDict = static_cast<float>(UT_UCS4_strlen(pszDict));
369 UT_uint32 wordInDict = countCommonChars(pszDict,pszWord);
370 UT_uint32 dictInWord = countCommonChars(pszWord,pszDict);
371 float flen = static_cast<float>(len);
372 float frac1 = (static_cast<float>(wordInDict)) / flen;
373 float frac2 = (static_cast<float>(dictInWord)) / lenDict;
374
375 if((frac1 > 0.8) && (frac2 > 0.8))
376 {
377 UT_UCS4_cloneString(&pszReturn, pszDict);
378 pVecSuggestions->addItem(pszReturn);
379 }
380 }
381 FREEP(pszWord);
382 DELETEP(pVec);
383 }
384
385 /*!
386 * This method counts the number of common characters in pszNeedle found in
387 * pszHaystack. Every time character in pszNeedle is found in pszHaystack the
388 * score is incremented by 1.
389 */
countCommonChars(UT_UCSChar * pszHaystack,UT_UCSChar * pszNeedle)390 UT_uint32 XAP_Dictionary::countCommonChars( UT_UCSChar *pszHaystack,UT_UCSChar * pszNeedle)
391 {
392 UT_uint32 lenNeedle = UT_UCS4_strlen(pszNeedle);
393 UT_UCSChar oneChar[2];
394 oneChar[1] = 0;
395 UT_uint32 i=0;
396 UT_uint32 score =0;
397 for(i=0; i< lenNeedle; i++)
398 {
399 oneChar[0] = pszNeedle[i];
400 if(UT_UCS4_strstr(pszHaystack,oneChar) != 0)
401 {
402 score++;
403 }
404 }
405 return score;
406 }
407
isWord(const UT_UCSChar * pWord,UT_uint32 len) const408 bool XAP_Dictionary::isWord(const UT_UCSChar * pWord, UT_uint32 len) const
409 {
410 char * key = static_cast<char*>(UT_calloc(len+1, sizeof(char)));
411 if (!key)
412 {
413 UT_DEBUGMSG(("mem failure looking up word in dictionary\n"));
414 FREEP(key);
415 return false;
416 }
417 UT_uint32 i =0;
418 for (i = 0; i < len; i++)
419 {
420 key[i] = static_cast<char>(static_cast<unsigned char>( pWord[i] ));
421 xxx_UT_DEBUGMSG(("isword key[%d] = %c %d \n",i,key[i],key[i]));
422 if(key[i] == 0)
423 break;
424 }
425 key[i] = 0;
426 char * key2 = g_strdup(key);
427 bool contains = m_hashWords.contains (key2, NULL);
428 FREEP(key);
429 FREEP(key2);
430 return contains;
431 }
432
433