1 /* AbiSource Application Framework
2  * Copyright (C) 1998,1999 AbiSource, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17  * 02110-1301 USA.
18  */
19 
20 #include <stdlib.h>
21 #include <string.h>
22 
23 #include "ut_assert.h"
24 #include "ut_debugmsg.h"
25 #include "ut_string.h"
26 #include "ut_growbuf.h"
27 #include "ut_hash.h"
28 #include "ut_vector.h"
29 #include "ut_string_class.h"
30 
31 #include "xap_Dictionary.h"
32 
33 /*****************************************************************/
34 /*****************************************************************/
35 
36 /*
37 	Dictionary is an alphabetic list of words, one per line.
38 	Import/export as a plain text file formatted in UTF-8.
39 	We allow either LF or CR or CRLF line termination.
40 
41 	(This code shamelessly cribbed from the impexp for UTF-8.)
42 */
43 
44 /*****************************************************************/
45 /*****************************************************************/
46 
XAP_Dictionary(const char * szFilename)47 XAP_Dictionary::XAP_Dictionary(const char * szFilename)
48 	: m_hashWords(29)
49 {
50 	UT_ASSERT(szFilename && *szFilename);
51 	m_szFilename = g_strdup(szFilename);
52 
53 	m_fp = 0;
54 	m_bDirty = false;
55 }
56 
~XAP_Dictionary()57 XAP_Dictionary::~XAP_Dictionary()
58 {
59 	if (m_fp)
60 		_closeFile();
61 
62 	FREEP(m_szFilename);
63 
64   	//UT_HASH_PURGEDATA(UT_UCSChar *, (&m_hashWords), g_free);
65 	m_hashWords.freeData();
66 #if 0
67 	UT_StringPtrMap::UT_Cursor _hc1(&m_hashWords);
68 	for (UT_UCSChar * _hval1 = const_cast<UT_UCSChar *>(reinterpret_cast<const UT_UCSChar *>(_hc1.first())); _hc1.is_valid(); _hval1 = const_cast<UT_UCSChar *>(reinterpret_cast<const UT_UCSChar *>(_hc1.next())) )
69 	{
70 		if (_hval1)
71 			g_free (_hval1);
72 	}
73 #endif
74 }
75 
getShortName(void) const76 const char * XAP_Dictionary::getShortName(void) const
77 {
78 	// TODO: get just the filename (no path), for use in UI
79 	return NULL;
80 }
81 
82 /*****************************************************************/
83 /*****************************************************************/
84 
_openFile(const char * mode)85 bool XAP_Dictionary::_openFile(const char * mode)
86 {
87 	UT_ASSERT(!m_fp);
88 
89 	// TODO add code to make a backup of the original file, if it exists.
90 
91 	m_fp = fopen(m_szFilename,mode);
92 	return (m_fp != 0);
93 }
94 
_writeBytes(const UT_Byte * pBytes,UT_uint32 length)95 UT_uint32 XAP_Dictionary::_writeBytes(const UT_Byte * pBytes, UT_uint32 length)
96 {
97 	UT_ASSERT(m_fp);
98 	UT_ASSERT(pBytes);
99 	UT_ASSERT(length);
100 
101 	return fwrite(pBytes,sizeof(UT_Byte),length,m_fp);
102 }
103 
_writeBytes(const UT_Byte * sz)104 bool XAP_Dictionary::_writeBytes(const UT_Byte * sz)
105 {
106 	UT_ASSERT(m_fp);
107 	UT_ASSERT(sz);
108 	int length = strlen(reinterpret_cast<const char *>(sz));
109 	UT_ASSERT(length);
110 
111 	return (_writeBytes(sz,length)==static_cast<UT_uint32>(length));
112 }
113 
_closeFile(void)114 bool XAP_Dictionary::_closeFile(void)
115 {
116 	if (m_fp)
117 		fclose(m_fp);
118 	m_fp = 0;
119 	return true;
120 }
121 
_abortFile(void)122 void XAP_Dictionary::_abortFile(void)
123 {
124 	// abort the write.
125 	// TODO close the file and do any restore and/or cleanup.
126 
127 	_closeFile();
128 	return;
129 }
130 
131 //////////////////////////////////////////////////////////////////
132 //////////////////////////////////////////////////////////////////
133 
load(void)134 bool XAP_Dictionary::load(void)
135 {
136 	UT_ASSERT(m_hashWords.size() == 0);
137 
138 	if (!_openFile("r"))
139 		return false;
140 
141 	if (!_parseUTF8())
142 		_abortFile();
143 	else
144 		_closeFile();
145 
146 	m_bDirty = false;
147 //
148 // Hardwire in some words that should be in the English Language :-)
149 //
150 	addWord("AbiWord");
151 	addWord("AbiSource");
152 	return true;
153 }
154 
155 #define X_ReturnIfFail(exp)		do { bool b = (exp); if (!b) return (false); } while (0)
156 
_parseUTF8(void)157 bool XAP_Dictionary::_parseUTF8(void)
158 {
159 	UT_GrowBuf gbBlock(1024);
160 	bool bEatLF = false;
161 	gchar buf[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
162 	gint len;
163 
164 	while (fread(buf, 1, sizeof(gchar), m_fp) > 0)
165 	{
166 		switch (buf[0])
167 		{
168 		case '\r':
169 		case '\n':
170 			if ((buf[0] == '\n') && bEatLF)
171 			{
172 				bEatLF = false;
173 				break;
174 			}
175 
176 			if (buf[0] == '\r')
177 			{
178 				bEatLF = true;
179 			}
180 
181 			// we interprete either CRLF, CR, or LF as a word delimiter.
182 
183 			if (gbBlock.getLength() > 0)
184 			{
185 				X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength()));
186 				gbBlock.truncate(0);
187 			}
188 			break;
189 
190 		default:
191 			bEatLF = false;
192 
193 			len = g_utf8_next_char(buf) - buf;
194 			if (len > 1) {
195 				fread (buf + 1, len - 1, sizeof (gchar), m_fp);
196 			}
197 			UT_UCSChar uc = g_utf8_get_char(buf);
198 			X_ReturnIfFail(gbBlock.ins(gbBlock.getLength(),reinterpret_cast<UT_GrowBufElement*>(&uc),1));
199 			break;
200 		}
201 	}
202 
203 	if (gbBlock.getLength() > 0)
204 	{
205 		X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength()));
206 	}
207 
208 	return true;
209 }
210 
211 //////////////////////////////////////////////////////////////////
212 //////////////////////////////////////////////////////////////////
213 
save(void)214 bool XAP_Dictionary::save(void)
215 {
216 	if (!m_bDirty)
217 		return true;
218 
219 	if (!_openFile("w"))
220 		return false;
221 
222 	UT_GenericVector<UT_UCSChar *> * pVec = m_hashWords.enumerate();
223 	UT_ASSERT(pVec);
224 
225 	UT_uint32 size = pVec->size();
226 
227 	for (UT_uint32 i = 0; i < size; i++)
228 	{
229 		UT_UCSChar * pWord = pVec->getNthItem(i);
230 		_outputUTF8(pWord, UT_UCS4_strlen(pWord));
231 		_writeBytes(reinterpret_cast<const UT_Byte *>("\n"));
232 	}
233 
234 	_closeFile();
235 
236 	delete pVec;
237 	m_bDirty = false;
238 
239 	return true;
240 }
241 
_outputUTF8(const UT_UCSChar * data,UT_uint32 length)242 void XAP_Dictionary::_outputUTF8(const UT_UCSChar * data, UT_uint32 length)
243 {
244 	UT_String buf;
245 	const UT_UCSChar * pData;
246 
247 	for (pData = data; (pData<data+length); /**/)
248 	{
249 		if (*pData > 0x007f)
250 		{
251 			gchar outbuf[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
252 			g_unichar_to_utf8(*pData++, outbuf);
253 			buf += outbuf;
254 		}
255 		else
256 		{
257 			buf += (char)*pData++;
258 		}
259 	}
260 
261 	_writeBytes(reinterpret_cast<const UT_Byte *>(buf.c_str()),buf.size());
262 }
263 
264 //////////////////////////////////////////////////////////////////
265 //////////////////////////////////////////////////////////////////
266 
addWord(const UT_UCSChar * pWord,UT_uint32 len)267 bool XAP_Dictionary::addWord(const UT_UCSChar * pWord, UT_uint32 len)
268 {
269 	char * key = static_cast<char *>(UT_calloc(len+1, sizeof(char)));
270 	UT_UCSChar * copy = static_cast<UT_UCSChar *>(UT_calloc(len+1, sizeof(UT_UCSChar)));
271     if (!key || !copy)
272 	{
273 		UT_DEBUGMSG(("mem failure adding word to dictionary\n"));
274 		FREEP(key);
275 		FREEP(copy);
276 		return false;
277 	}
278 	UT_uint32 i = 0;
279 	for (i = 0; i < len; i++)
280 	{
281 		UT_UCSChar currentChar;
282 		currentChar = pWord[i];
283 		// map smart quote apostrophe to ASCII right single quote
284 		if (currentChar == UCS_RQUOTE) currentChar = '\'';
285 		key[i] = static_cast<char>(static_cast<unsigned char>(pWord[i]));
286 		copy[i] = currentChar;
287 		xxx_UT_DEBUGMSG(("addWord: key[%d] = %c %d \n",i,key[i],key[i]));
288 		if(key[i] == 0)
289 		{
290 			break;
291 		}
292 	}
293 	key[i] = 0;
294 //
295 // Get exactly the same length.
296 //
297 	char * key2 = g_strdup(key);
298 	copy[i] = 0;
299 #if 0
300 //
301 // Useful debugging code
302 //
303 	char * ucs_dup = static_cast<char *>(UT_calloc(2*len+1, sizeof(char)));
304 	UT_UCS4_strcpy_to_char( ucs_dup, copy);
305 	UT_DEBUGMSG(("Inserting word %s with key %s into hash \n",ucs_dup,key));
306 	FREEP(ucs_dup);
307 
308 #endif
309 	if(!m_hashWords.insert(key2,copy))
310 		FREEP(copy);
311 
312 	FREEP(key);
313 	FREEP(key2);
314 
315 	// TODO: is this right?
316 	m_bDirty = true;
317 	return true;
318 }
319 
addWord(const char * word)320 bool XAP_Dictionary::addWord(const char * word)
321 {
322 	UT_sint32 len = strlen(word);
323 	if(len <=0)
324 	{
325 		return false;
326 	}
327 	UT_UCSChar * ucs_dup = static_cast<UT_UCSChar *>(UT_calloc(len+1, sizeof(UT_UCSChar)));
328 	UT_UCS4_strcpy_char(ucs_dup, word);
329 	addWord(ucs_dup,len);
330 	FREEP(ucs_dup);
331 	return true;
332 }
333 
334 
335 /*!
336  * Returns true if the word given is found in the users custom dictionary.
337 \param const UT_UCSChar * pWord the word to look for suggestion for
338 \param UT_uint32 len the length of the word
339 \returns UT_Vector * pVecSuggestions this a vector of suggestions.
340 The returner is responsible for deleting these words.
341 */
suggestWord(UT_GenericVector<UT_UCSChar * > * pVecSuggestions,const UT_UCSChar * pWord,UT_uint32 len)342 void XAP_Dictionary::suggestWord(UT_GenericVector<UT_UCSChar *> * pVecSuggestions, const UT_UCSChar * pWord, UT_uint32 len)
343 {
344   //
345   // Get the words in the local dictionary
346   //
347   UT_GenericVector<UT_UCSChar *> * pVec = m_hashWords.enumerate();
348   UT_ASSERT(pVec);
349   UT_uint32 i=0;
350   UT_uint32 count = pVec->getItemCount();
351   //
352   // Turn our word into a NULL teminated string
353   //
354   UT_UCSChar * pszWord = static_cast<UT_UCSChar*>(UT_calloc(len+1, sizeof(UT_UCSChar)));
355   for(i=0; i< len; i++)
356   {
357     pszWord[i] = pWord[i];
358   }
359   pszWord[len] = 0;
360   //
361   // Loop over all the words in our custom doctionary and add them to the
362   //the suggestions if they're possibilities.
363   //
364   for(i=0; i< count; i++)
365   {
366     UT_UCSChar * pszDict = pVec->getNthItem(i);
367     UT_UCSChar * pszReturn = NULL;
368     float lenDict = static_cast<float>(UT_UCS4_strlen(pszDict));
369     UT_uint32 wordInDict = countCommonChars(pszDict,pszWord);
370     UT_uint32 dictInWord = countCommonChars(pszWord,pszDict);
371     float flen = static_cast<float>(len);
372     float frac1 = (static_cast<float>(wordInDict)) / flen;
373     float frac2 = (static_cast<float>(dictInWord)) / lenDict;
374 
375     if((frac1 > 0.8) && (frac2 > 0.8))
376     {
377 	  UT_UCS4_cloneString(&pszReturn, pszDict);
378 	  pVecSuggestions->addItem(pszReturn);
379     }
380   }
381   FREEP(pszWord);
382   DELETEP(pVec);
383 }
384 
385 /*!
386  * This method counts the number of common characters in pszNeedle found in
387  * pszHaystack. Every time character in pszNeedle is found in pszHaystack the
388  * score is incremented by 1.
389  */
countCommonChars(UT_UCSChar * pszHaystack,UT_UCSChar * pszNeedle)390 UT_uint32 XAP_Dictionary::countCommonChars( UT_UCSChar *pszHaystack,UT_UCSChar * pszNeedle)
391 {
392     UT_uint32 lenNeedle =  UT_UCS4_strlen(pszNeedle);
393     UT_UCSChar oneChar[2];
394     oneChar[1] = 0;
395     UT_uint32 i=0;
396     UT_uint32 score =0;
397     for(i=0; i< lenNeedle; i++)
398     {
399       oneChar[0] = pszNeedle[i];
400       if(UT_UCS4_strstr(pszHaystack,oneChar) != 0)
401       {
402 	  score++;
403       }
404     }
405     return score;
406 }
407 
isWord(const UT_UCSChar * pWord,UT_uint32 len) const408 bool XAP_Dictionary::isWord(const UT_UCSChar * pWord, UT_uint32 len) const
409 {
410 	char * key = static_cast<char*>(UT_calloc(len+1, sizeof(char)));
411 	if (!key)
412 	{
413 		UT_DEBUGMSG(("mem failure looking up word in dictionary\n"));
414 		FREEP(key);
415 		return false;
416 	}
417 	UT_uint32 i =0;
418 	for (i = 0; i < len; i++)
419 	{
420 		key[i] = static_cast<char>(static_cast<unsigned char>( pWord[i] ));
421 		xxx_UT_DEBUGMSG(("isword key[%d] = %c %d \n",i,key[i],key[i]));
422 		if(key[i] == 0)
423 			break;
424 	}
425 	key[i] = 0;
426 	char * key2 = g_strdup(key);
427 	bool contains = m_hashWords.contains (key2, NULL);
428 	FREEP(key);
429 	FREEP(key2);
430 	return contains;
431 }
432 
433