1 /** \file hyphman.cpp
2 \brief AlReader hyphenation manager
3
4 (c) Alan, adapted TeX hyphenation dictionaries code: http://alreader.kms.ru/
    (c) Mark Lipsman -- hyphenation algorithm, modified by Mike & SeNS
6
7 Adapted for CREngine by Vadim Lopatin
8
9 This source code is distributed under the terms of
10 GNU General Public License.
11
12 See LICENSE file for details.
13
14 */
15
// Debug dump switches: change "#if 0" to "#if 1" to define both macros as 1.
// DUMP_PATTERNS==1 logs every loaded pattern and every pattern match;
// DUMP_HYPHENATION_WORDS==1 traces the words being hyphenated and the result.
#if 0
#define DUMP_HYPHENATION_WORDS 1
#define DUMP_PATTERNS 1
#else
#define DUMP_HYPHENATION_WORDS 0
#define DUMP_PATTERNS 0
#endif
24
25 #include "../include/crsetup.h"
26
27 #include <stdlib.h>
28 #include <string.h>
29 #include "../include/lvxml.h"
30
31 #if !defined(__SYMBIAN32__)
32 #include <stdio.h>
33 #endif
34
35 #include "../include/lvtypes.h"
36 #include "../include/lvstream.h"
37 #include "../include/hyphman.h"
38 #include "../include/lvfnt.h"
39 #include "../include/lvstring.h"
40 #include "../include/lvstring32collection.h"
41 #include "../include/crlog.h"
42 #include "../include/textlang.h"
43
44
45 #ifdef ANDROID
46
47 #define _32(x) lString32(x)
48
49 #else
50
51 #include "../include/cri18n.h"
52
53 #endif
54
// Global hyphenation settings (changed through the HyphMan::set*() functions below).
int HyphMan::_LeftHyphenMin = HYPH_DEFAULT_HYPHEN_MIN;
int HyphMan::_RightHyphenMin = HYPH_DEFAULT_HYPHEN_MIN;
int HyphMan::_TrustSoftHyphens = HYPH_DEFAULT_TRUST_SOFT_HYPHENS;
// Cache of dictionary-based methods already loaded, keyed by dictionary id;
// the stored pointers are deleted by HyphMan::uninit().
LVHashTable<lString32, HyphMethod*> HyphMan::_loaded_hyph_methods(16);
// Provider of dictionary data streams; deleted by uninit() and setDataLoader().
HyphDataLoader* HyphMan::_dataLoader = NULL;


// Obsolete: now fetched from TextLangMan main lang TextLangCfg
// HyphDictionary * HyphMan::_selectedDictionary = NULL;

// List of known dictionaries; created by initDictionaries(), deleted by uninit().
HyphDictionaryList * HyphMan::_dictList = NULL;
66
// MAX_PATTERN_SIZE is actually the max size of a word (pattern stripped
// from all the numbers that give the quality of a split after previous char)
// (35 is needed for German.pattern)
#define MAX_PATTERN_SIZE 35
// Number of buckets in the TexHyph pattern hash table
#define PATTERN_HASH_SIZE 16384
// Defined further down; TexHyph only needs pointers to it here.
class TexPattern;
73 class TexHyph : public HyphMethod
74 {
75 TexPattern * table[PATTERN_HASH_SIZE];
76 lUInt32 _hash;
77 lUInt32 _pattern_count;
78 public:
79 int largest_overflowed_word;
80 bool match( const lChar32 * str, char * mask );
81 virtual bool hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize );
82 void addPattern( TexPattern * pattern );
83 TexHyph( lString32 id=HYPH_DICT_ID_DICTIONARY, int leftHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN, int rightHyphenMin=HYPHMETHOD_DEFAULT_HYPHEN_MIN );
84 virtual ~TexHyph();
85 bool load( LVStreamRef stream );
86 bool load( lString32 fileName );
getHash()87 virtual lUInt32 getHash() { return _hash; }
getCount()88 virtual lUInt32 getCount() { return _pattern_count; }
89 virtual lUInt32 getSize();
90 };
91
92 class AlgoHyph : public HyphMethod
93 {
94 public:
AlgoHyph()95 AlgoHyph(): HyphMethod(HYPH_DICT_ID_ALGORITHM) {};
96 virtual bool hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize );
97 virtual ~AlgoHyph();
98 };
99
100 class SoftHyphensHyph : public HyphMethod
101 {
102 public:
SoftHyphensHyph()103 SoftHyphensHyph(): HyphMethod(HYPH_DICT_ID_SOFTHYPHENS) {};
104 virtual bool hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize );
105 virtual ~SoftHyphensHyph();
106 };
107
108 class NoHyph : public HyphMethod
109 {
110 public:
NoHyph()111 NoHyph(): HyphMethod(HYPH_DICT_ID_NONE) {};
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)112 virtual bool hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
113 {
114 CR_UNUSED6(str, len, widths, flags, hyphCharWidth, maxWidth);
115 return false;
116 }
~NoHyph()117 virtual ~NoHyph() { }
118 };
119
// Shared instances of the methods that need no dictionary data.
// HyphMan::getHyphMethodForDictionary() returns their addresses directly;
// they are not stored in HyphMan::_loaded_hyph_methods.
static NoHyph NO_HYPH;
static AlgoHyph ALGO_HYPH;
static SoftHyphensHyph SOFTHYPHENS_HYPH;
123
124 // Obsolete: provided by TextLangMan main lang
125 // HyphMethod * HyphMan::_method = &NO_HYPH;
126
127 #pragma pack(push, 1)
128 typedef struct {
129 lUInt16 wl;
130 lUInt16 wu;
131 char al;
132 char au;
133
134 unsigned char mask0[2];
135 lUInt16 aux[256];
136
137 lUInt16 len;
138 } thyph;
139
140 typedef struct {
141 lUInt16 start;
142 lUInt16 len;
143 } hyph_index_item_t;
144 #pragma pack(pop)
145
146 class HyphDataLoaderFromFile: public HyphDataLoader
147 {
148 public:
HyphDataLoaderFromFile()149 HyphDataLoaderFromFile() : HyphDataLoader() {}
~HyphDataLoaderFromFile()150 virtual ~HyphDataLoaderFromFile() {}
loadData(lString32 id)151 virtual LVStreamRef loadData(lString32 id) {
152 HyphDictionaryList* dictList = HyphMan::getDictList();
153 HyphDictionary * p = dictList->find(id);
154 if ( !p )
155 return LVStreamRef();
156 if ( p->getType() == HDT_NONE ||
157 p->getType() == HDT_ALGORITHM ||
158 p->getType() == HDT_SOFTHYPHENS ||
159 ( p->getType() != HDT_DICT_ALAN && p->getType() != HDT_DICT_TEX) )
160 return LVStreamRef();
161 lString32 filename = p->getFilename();
162 return LVOpenFileStream( filename.c_str(), LVOM_READ );
163 }
164 };
165
166
167
uninit()168 void HyphMan::uninit()
169 {
170 // Avoid existing frontend code to have to call it:
171 TextLangMan::uninit();
172 // Clean up _loaded_hyph_methods
173 LVHashTable<lString32, HyphMethod*>::iterator it = _loaded_hyph_methods.forwardIterator();
174 LVHashTable<lString32, HyphMethod*>::pair* pair;
175 while ((pair = it.next())) {
176 delete pair->value;
177 }
178 _loaded_hyph_methods.clear();
179 if ( _dictList )
180 delete _dictList;
181 _dictList = NULL;
182 if ( _dataLoader )
183 delete _dataLoader;
184 _dataLoader = NULL;
185 /* Obsolete:
186 _selectedDictionary = NULL;
187 if ( HyphMan::_method != &ALGO_HYPH && HyphMan::_method != &NO_HYPH && HyphMan::_method != &SOFTHYPHENS_HYPH )
188 delete HyphMan::_method;
189 _method = &NO_HYPH;
190 */
191 }
192
initDictionaries(lString32 dir,bool clear)193 bool HyphMan::initDictionaries(lString32 dir, bool clear)
194 {
195 if (clear && _dictList)
196 delete _dictList;
197 if (clear || !_dictList)
198 _dictList = new HyphDictionaryList();
199 if (NULL == _dataLoader)
200 _dataLoader = new HyphDataLoaderFromFile;
201 if (_dictList->open(dir, clear)) {
202 if ( !_dictList->activate( lString32(DEF_HYPHENATION_DICT) ) )
203 _dictList->activate( lString32(HYPH_DICT_ID_ALGORITHM) );
204 return true;
205 } else {
206 _dictList->activate( lString32(HYPH_DICT_ID_ALGORITHM) );
207 return false;
208 }
209 }
210
211 // for android
addDictionaryItem(HyphDictionary * dict)212 bool HyphMan::addDictionaryItem(HyphDictionary* dict)
213 {
214 if (_dictList->find(dict->getId()))
215 return false;
216 _dictList->add(dict);
217 return true;
218 }
219
setDataLoader(HyphDataLoader * loader)220 void HyphMan::setDataLoader(HyphDataLoader* loader) {
221 if (_dataLoader)
222 delete _dataLoader;
223 _dataLoader = loader;
224 }
225
setLeftHyphenMin(int left_hyphen_min)226 bool HyphMan::setLeftHyphenMin( int left_hyphen_min ) {
227 if (left_hyphen_min >= HYPH_MIN_HYPHEN_MIN && left_hyphen_min <= HYPH_MAX_HYPHEN_MIN) {
228 HyphMan::_LeftHyphenMin = left_hyphen_min;
229 return true;
230 }
231 return false;
232 }
233
setRightHyphenMin(int right_hyphen_min)234 bool HyphMan::setRightHyphenMin( int right_hyphen_min ) {
235 if (right_hyphen_min >= HYPH_MIN_HYPHEN_MIN && right_hyphen_min <= HYPH_MAX_HYPHEN_MIN) {
236 HyphMan::_RightHyphenMin = right_hyphen_min;
237 return true;
238 }
239 return false;
240 }
241
setTrustSoftHyphens(int trust_soft_hyphens)242 bool HyphMan::setTrustSoftHyphens( int trust_soft_hyphens ) {
243 HyphMan::_TrustSoftHyphens = trust_soft_hyphens;
244 return true;
245 }
246
isEnabled()247 bool HyphMan::isEnabled() {
248 return TextLangMan::getHyphenationEnabled();
249 /* Obsolete:
250 return _selectedDictionary != NULL && _selectedDictionary->getId() != HYPH_DICT_ID_NONE;
251 */
252 }
253
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)254 bool HyphMan::hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
255 {
256 return TextLangMan::getMainLangHyphMethod()->hyphenate( str, len, widths, flags, hyphCharWidth, maxWidth, flagSize );
257 /* Obsolete:
258 return _method->hyphenate( str, len, widths, flags, hyphCharWidth, maxWidth, flagSize );
259 */
260 }
261
getSelectedDictionary()262 HyphDictionary * HyphMan::getSelectedDictionary() {
263 lString32 id = TextLangMan::getTextLangCfg()->getHyphMethod()->getId();
264 HyphDictionary * dict = _dictList->find( id );
265 return dict;
266 }
267
getHyphMethodForDictionary(lString32 id,int leftHyphenMin,int rightHyphenMin)268 HyphMethod * HyphMan::getHyphMethodForDictionary( lString32 id, int leftHyphenMin, int rightHyphenMin ) {
269 if ( id.empty() || NULL == _dataLoader)
270 return &NO_HYPH;
271 HyphDictionary * p = _dictList->find(id);
272 if ( !p || p->getType() == HDT_NONE )
273 return &NO_HYPH;
274 if ( p->getType() == HDT_ALGORITHM )
275 return &ALGO_HYPH;
276 if ( p->getType() == HDT_SOFTHYPHENS )
277 return &SOFTHYPHENS_HYPH;
278 if ( p->getType() != HDT_DICT_ALAN && p->getType() != HDT_DICT_TEX )
279 return &NO_HYPH;
280 HyphMethod * method;
281 if ( _loaded_hyph_methods.get(id, method) ) {
282 // printf("getHyphMethodForDictionary reusing cached %s\n", UnicodeToUtf8(p->getFilename()).c_str());
283 return method;
284 }
285 LVStreamRef stream = _dataLoader->loadData(id);
286 if ( stream.isNull() ) {
287 CRLog::error("Cannot open hyphenation dictionary %s", UnicodeToUtf8(id).c_str() );
288 return &NO_HYPH;
289 }
290 TexHyph * newmethod = new TexHyph(id, leftHyphenMin, rightHyphenMin);
291 if ( !newmethod->load( stream ) ) {
292 CRLog::error("Cannot open hyphenation dictionary %s", UnicodeToUtf8(id).c_str() );
293 delete newmethod;
294 return &NO_HYPH;
295 }
296 // printf("CRE: loaded hyphenation dict %s\n", UnicodeToUtf8(id).c_str());
297 if ( newmethod->largest_overflowed_word )
298 CRLog::warn("%s: some hyphenation patterns were too long and have been ignored: increase MAX_PATTERN_SIZE from %d to %d\n", UnicodeToUtf8(id).c_str(), MAX_PATTERN_SIZE, newmethod->largest_overflowed_word);
299 _loaded_hyph_methods.set(id, newmethod);
300 return newmethod;
301 }
302
activate()303 bool HyphDictionary::activate()
304 {
305 TextLangMan::setMainLangFromHyphDict( getId() );
306 return true;
307 /* Obsolete:
308 if (HyphMan::_selectedDictionary == this)
309 return true; // already active
310 if ( getType() == HDT_ALGORITHM ) {
311 CRLog::info("Turn on algorythmic hyphenation" );
312 if ( HyphMan::_method != &ALGO_HYPH ) {
313 if ( HyphMan::_method != &SOFTHYPHENS_HYPH && HyphMan::_method != &NO_HYPH )
314 delete HyphMan::_method;
315 HyphMan::_method = &ALGO_HYPH;
316 }
317 } else if ( getType() == HDT_SOFTHYPHENS ) {
318 CRLog::info("Turn on soft-hyphens hyphenation" );
319 if ( HyphMan::_method != &SOFTHYPHENS_HYPH ) {
320 if ( HyphMan::_method != &ALGO_HYPH && HyphMan::_method != &NO_HYPH )
321 delete HyphMan::_method;
322 HyphMan::_method = &SOFTHYPHENS_HYPH;
323 }
324 } else if ( getType() == HDT_NONE ) {
325 CRLog::info("Disabling hyphenation" );
326 if ( HyphMan::_method != &NO_HYPH ) {
327 if ( HyphMan::_method != &ALGO_HYPH && HyphMan::_method != &SOFTHYPHENS_HYPH )
328 delete HyphMan::_method;
329 HyphMan::_method = &NO_HYPH;
330 }
331 } else if ( getType() == HDT_DICT_ALAN || getType() == HDT_DICT_TEX ) {
332 if ( HyphMan::_method != &NO_HYPH && HyphMan::_method != &ALGO_HYPH && HyphMan::_method != &SOFTHYPHENS_HYPH ) {
333 delete HyphMan::_method;
334 HyphMan::_method = &NO_HYPH;
335 }
336 CRLog::info("Selecting hyphenation dictionary %s", UnicodeToUtf8(_filename).c_str() );
337 LVStreamRef stream = LVOpenFileStream( getFilename().c_str(), LVOM_READ );
338 if ( stream.isNull() ) {
339 CRLog::error("Cannot open hyphenation dictionary %s", UnicodeToUtf8(_filename).c_str() );
340 return false;
341 }
342 TexHyph * method = new TexHyph();
343 if ( !method->load( stream ) ) {
344 CRLog::error("Cannot open hyphenation dictionary %s", UnicodeToUtf8(_filename).c_str() );
345 delete method;
346 return false;
347 }
348 if (method->largest_overflowed_word)
349 printf("CRE WARNING: %s: some hyphenation patterns were too long and have been ignored: increase MAX_PATTERN_SIZE from %d to %d\n", UnicodeToUtf8(_filename).c_str(), MAX_PATTERN_SIZE, method->largest_overflowed_word);
350 HyphMan::_method = method;
351 }
352 HyphMan::_selectedDictionary = this;
353 return true;
354 */
355 }
356
activate(lString32 id)357 bool HyphDictionaryList::activate( lString32 id )
358 {
359 CRLog::trace("HyphDictionaryList::activate(%s)", LCSTR(id));
360 HyphDictionary * p = find(id);
361 if ( p )
362 return p->activate();
363 else
364 return false;
365 }
366
addDefault()367 void HyphDictionaryList::addDefault()
368 {
369 if ( !find( lString32( HYPH_DICT_ID_NONE ) ) ) {
370 _list.add( new HyphDictionary( HDT_NONE, _32("[No Hyphenation]"), lString32(HYPH_DICT_ID_NONE), lString32(HYPH_DICT_ID_NONE) ) );
371 }
372 if ( !find( lString32( HYPH_DICT_ID_ALGORITHM ) ) ) {
373 _list.add( new HyphDictionary( HDT_ALGORITHM, _32("[Algorithmic Hyphenation]"), lString32(HYPH_DICT_ID_ALGORITHM), lString32(HYPH_DICT_ID_ALGORITHM) ) );
374 }
375 if ( !find( lString32( HYPH_DICT_ID_SOFTHYPHENS ) ) ) {
376 _list.add( new HyphDictionary( HDT_SOFTHYPHENS, _32("[Soft-hyphens Hyphenation]"), lString32(HYPH_DICT_ID_SOFTHYPHENS), lString32(HYPH_DICT_ID_SOFTHYPHENS) ) );
377 }
378
379 }
380
find(const lString32 & id)381 HyphDictionary * HyphDictionaryList::find( const lString32& id )
382 {
383 for ( int i=0; i<_list.length(); i++ ) {
384 if ( _list[i]->getId() == id )
385 return _list[i];
386 }
387 return NULL;
388 }
389
HyphDictionary_comparator(const HyphDictionary ** item1,const HyphDictionary ** item2)390 static int HyphDictionary_comparator(const HyphDictionary ** item1, const HyphDictionary ** item2)
391 {
392 if ( ( (*item1)->getType() == HDT_DICT_ALAN || (*item1)->getType() == HDT_DICT_TEX) &&
393 ( (*item2)->getType() == HDT_DICT_ALAN || (*item2)->getType() == HDT_DICT_TEX) )
394 return (*item1)->getTitle().compare((*item2)->getTitle());
395 return (int)((*item1)->getType() - (*item2)->getType());
396 }
397
open(lString32 hyphDirectory,bool clear)398 bool HyphDictionaryList::open(lString32 hyphDirectory, bool clear)
399 {
400 CRLog::info("HyphDictionaryList::open(%s)", LCSTR(hyphDirectory) );
401 if (clear) {
402 _list.clear();
403 addDefault();
404 }
405 if ( hyphDirectory.empty() )
406 return true;
407 //LVAppendPathDelimiter( hyphDirectory );
408 LVContainerRef container;
409 LVStreamRef stream;
410 if ( (hyphDirectory.endsWith("/") || hyphDirectory.endsWith("\\")) && LVDirectoryExists(hyphDirectory) ) {
411 container = LVOpenDirectory( hyphDirectory.c_str(), U"*.*" );
412 } else if ( LVFileExists(hyphDirectory) ) {
413 stream = LVOpenFileStream( hyphDirectory.c_str(), LVOM_READ );
414 if ( !stream.isNull() )
415 container = LVOpenArchieve( stream );
416 }
417
418 if ( !container.isNull() ) {
419 int len = container->GetObjectCount();
420 int count = 0;
421 CRLog::info("%d items found in hyph directory", len);
422 for ( int i=0; i<len; i++ ) {
423 const LVContainerItemInfo * item = container->GetObjectInfo( i );
424 lString32 name = item->GetName();
425 lString32 suffix;
426 lString32 suffix2add;
427 HyphDictType t = HDT_NONE;
428 if ( name.endsWith("_hyphen_(Alan).pdb") ) {
429 suffix = "_hyphen_(Alan).pdb";
430 suffix2add = " (Alan)";
431 t = HDT_DICT_ALAN;
432 } else if ( name.endsWith(".pattern") ) {
433 suffix = ".pattern";
434 t = HDT_DICT_TEX;
435 } else
436 continue;
437
438
439
440 lString32 filename = hyphDirectory + name;
441 lString32 id = name;
442 lString32 title = name;
443 if ( title.endsWith( suffix ) )
444 title.erase( title.length() - suffix.length(), suffix.length() );
445 if (!suffix2add.empty())
446 title.append(suffix2add);
447 _list.add( new HyphDictionary( t, title, id, filename ) );
448 count++;
449 }
450 _list.sort(HyphDictionary_comparator);
451 CRLog::info("%d dictionaries added to list", _list.length());
452 return true;
453 } else {
454 CRLog::info("no hyphenation dictionary items found in hyph directory %s", LCSTR(hyphDirectory));
455 }
456 return false;
457 }
458
HyphMan()459 HyphMan::HyphMan()
460 {
461 }
462
~HyphMan()463 HyphMan::~HyphMan()
464 {
465 }
466
467 // Used by SoftHyphensHyph::hyphenate(), but also possibly (when
468 // TrustSoftHyphens is true) as a first step by TexHyph::hyphenate()
469 // and AlgoHyph::hyphenate(): if soft hyphens are found in the
470 // provided word, trust and use them; don't do the regular patterns
471 // and algorithm matching.
softhyphens_hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)472 static bool softhyphens_hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
473 {
474 bool soft_hyphens_found = false;
475 for ( int i = 0; i<len; i++ ) {
476 if ( widths[i] + hyphCharWidth > maxWidth )
477 break;
478 if ( str[i] == UNICODE_SOFT_HYPHEN_CODE ) {
479 if ( flagSize == 2 ) {
480 lUInt16* flags16 = (lUInt16*) flags;
481 flags16[i] |= LCHAR_ALLOW_HYPH_WRAP_AFTER;
482 }
483 else {
484 flags[i] |= LCHAR_ALLOW_HYPH_WRAP_AFTER;
485 }
486 soft_hyphens_found = true;
487 }
488 }
489 return soft_hyphens_found;
490 }
491
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)492 bool SoftHyphensHyph::hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
493 {
494 return softhyphens_hyphenate(str, len, widths, flags, hyphCharWidth, maxWidth, flagSize);
495 }
496
~SoftHyphensHyph()497 SoftHyphensHyph::~SoftHyphensHyph()
498 {
499 }
500
501 struct tPDBHdr
502 {
503 char filename[36];
504 lUInt32 dw1;
505 lUInt32 dw2;
506 lUInt32 dw4[4];
507 char type[8];
508 lUInt32 dw44;
509 lUInt32 dw48;
510 lUInt16 numrec;
511 };
512
isCorrectHyphFile(LVStream * stream)513 static int isCorrectHyphFile(LVStream * stream)
514 {
515 if (!stream)
516 return false;
517 lvsize_t dw;
518 int w = 0;
519 tPDBHdr HDR;
520 stream->SetPos(0);
521 stream->Read( &HDR, 78, &dw);
522 stream->SetPos(0);
523 lvByteOrderConv cnv;
524 w=cnv.msf(HDR.numrec);
525 if (dw!=78 || w>0xff)
526 w = 0;
527
528 if (strncmp((const char*)&HDR.type, "HypHAlR4", 8) != 0)
529 w = 0;
530
531 return w;
532 }
533
534 class TexPattern {
535 public:
536 lChar32 word[MAX_PATTERN_SIZE+1];
537 char attr[MAX_PATTERN_SIZE+2];
538 int overflowed; // 0, or size of complete word if larger than MAX_PATTERN_SIZE
539 TexPattern * next;
540
cmp(TexPattern * v)541 int cmp( TexPattern * v )
542 {
543 return lStr_cmp( word, v->word );
544 }
545
hash(const lChar32 * s)546 static int hash( const lChar32 * s )
547 {
548 return ((lUInt32)(((s[0] *31 + s[1])*31 + s[2]) * 31 + s[3])) % PATTERN_HASH_SIZE;
549 }
550
hash3(const lChar32 * s)551 static int hash3( const lChar32 * s )
552 {
553 return ((lUInt32)(((s[0] *31 + s[1])*31 + s[2]) * 31 + 0)) % PATTERN_HASH_SIZE;
554 }
555
hash2(const lChar32 * s)556 static int hash2( const lChar32 * s )
557 {
558 return ((lUInt32)(((s[0] *31 + s[1])*31 + 0) * 31 + 0)) % PATTERN_HASH_SIZE;
559 }
560
hash1(const lChar32 * s)561 static int hash1( const lChar32 * s )
562 {
563 return ((lUInt32)(((s[0] *31 + 0)*31 + 0) * 31 + 0)) % PATTERN_HASH_SIZE;
564 }
565
hash()566 int hash()
567 {
568 return ((lUInt32)(((word[0] *31 + word[1])*31 + word[2]) * 31 + word[3])) % PATTERN_HASH_SIZE;
569 }
570
match(const lChar32 * s,char * mask)571 bool match( const lChar32 * s, char * mask )
572 {
573 TexPattern * p = this;
574 bool found = false;
575 while ( p ) {
576 bool res = true;
577 for ( int i=2; p->word[i]; i++ )
578 if ( p->word[i]!=s[i] ) {
579 res = false;
580 break;
581 }
582 if ( res ) {
583 if ( p->word[0]==s[0] && (p->word[1]==0 || p->word[1]==s[1]) ) {
584 #if DUMP_PATTERNS==1
585 CRLog::debug("Pattern matched: %s %s on %s %s", LCSTR(lString32(p->word)), p->attr, LCSTR(lString32(s)), mask);
586 #endif
587 p->apply(mask);
588 found = true;
589 }
590 }
591 p = p->next;
592 }
593 return found;
594 }
595
apply(char * mask)596 void apply( char * mask )
597 {
598 ;
599 for ( char * p = attr; *p && *mask; p++, mask++ ) {
600 if ( *mask < *p )
601 *mask = *p;
602 }
603 }
604
TexPattern(const lString32 & s)605 TexPattern( const lString32 &s ) : next( NULL )
606 {
607 overflowed = 0;
608 memset( word, 0, sizeof(word) );
609 memset( attr, '0', sizeof(attr) );
610 attr[sizeof(attr)-1] = 0;
611 int n = 0;
612 for ( int i=0; i<(int)s.length(); i++ ) {
613 lChar32 ch = s[i];
614 if (n > MAX_PATTERN_SIZE) {
615 if ( ch<'0' || ch>'9' ) {
616 overflowed = n++;
617 }
618 continue;
619 }
620 if ( ch>='0' && ch<='9' ) {
621 attr[n] = (char)ch;
622 // if (n>0)
623 // attr[n-1] = (char)ch;
624 } else {
625 if (n == MAX_PATTERN_SIZE) { // we previously reached max word size
626 // Let the last 0 (string termination) in
627 // word[MAX_PATTERN_SIZE] and mark it as overflowed
628 overflowed = n++;
629 }
630 else {
631 word[n++] = ch;
632 }
633 }
634 }
635 // if n==MAX_PATTERN_SIZE (or >), attr[MAX_PATTERN_SIZE] is either the
636 // memset '0', or a 0-9 we got on next iteration, and
637 // attr[MAX_PATTERN_SIZE+1] is the 0 set by attr[sizeof(attr)-1] = 0
638 if (n < MAX_PATTERN_SIZE)
639 attr[n+1] = 0;
640
641 if (overflowed)
642 overflowed = overflowed + 1; // convert counter to number of things counted
643 }
644
TexPattern(const unsigned char * s,int sz,const lChar32 * charMap)645 TexPattern( const unsigned char * s, int sz, const lChar32 * charMap )
646 {
647 overflowed = 0;
648 if ( sz > MAX_PATTERN_SIZE ) {
649 overflowed = sz;
650 sz = MAX_PATTERN_SIZE;
651 }
652 memset( word, 0, sizeof(word) );
653 memset( attr, 0, sizeof(attr) );
654 for ( int i=0; i<sz; i++ )
655 word[i] = charMap[ s[i] ];
656 memcpy( attr, s+sz, sz+1 );
657 }
658 };
659
660 class HyphPatternReader : public LVXMLParserCallback
661 {
662 protected:
663 bool insidePatternTag;
664 lString32Collection & data;
665 public:
HyphPatternReader(lString32Collection & result)666 HyphPatternReader(lString32Collection & result) : insidePatternTag(false), data(result)
667 {
668 result.clear();
669 }
670 /// called on parsing end
OnStop()671 virtual void OnStop() { }
672 /// called on opening tag end
OnTagBody()673 virtual void OnTagBody() {}
674 /// called on opening tag
OnTagOpen(const lChar32 * nsname,const lChar32 * tagname)675 virtual ldomNode * OnTagOpen( const lChar32 * nsname, const lChar32 * tagname)
676 {
677 CR_UNUSED(nsname);
678 if (!lStr_cmp(tagname, "pattern")) {
679 insidePatternTag = true;
680 }
681 return NULL;
682 }
683 /// called on closing
OnTagClose(const lChar32 * nsname,const lChar32 * tagname,bool self_closing_tag=false)684 virtual void OnTagClose( const lChar32 * nsname, const lChar32 * tagname, bool self_closing_tag=false )
685 {
686 CR_UNUSED2(nsname, tagname);
687 insidePatternTag = false;
688 }
689 /// called on element attribute
OnAttribute(const lChar32 * nsname,const lChar32 * attrname,const lChar32 * attrvalue)690 virtual void OnAttribute( const lChar32 * nsname, const lChar32 * attrname, const lChar32 * attrvalue )
691 {
692 CR_UNUSED3(nsname, attrname, attrvalue);
693 }
694 /// called on text
OnText(const lChar32 * text,int len,lUInt32 flags)695 virtual void OnText( const lChar32 * text, int len, lUInt32 flags )
696 {
697 CR_UNUSED(flags);
698 if ( insidePatternTag )
699 data.add( lString32(text, len) );
700 }
701 /// add named BLOB data to document
OnBlob(lString32 name,const lUInt8 * data,int size)702 virtual bool OnBlob(lString32 name, const lUInt8 * data, int size) {
703 CR_UNUSED3(name, data, size);
704 return false;
705 }
706
707 };
708
TexHyph(lString32 id,int leftHyphenMin,int rightHyphenMin)709 TexHyph::TexHyph(lString32 id, int leftHyphenMin, int rightHyphenMin) : HyphMethod(id, leftHyphenMin, rightHyphenMin)
710 {
711 memset( table, 0, sizeof(table) );
712 _hash = 123456;
713 _pattern_count = 0;
714 largest_overflowed_word = 0;
715 }
716
~TexHyph()717 TexHyph::~TexHyph()
718 {
719 for ( int i=0; i<PATTERN_HASH_SIZE; i++ ) {
720 TexPattern * p = table[i];
721 while (p) {
722 TexPattern * tmp = p;
723 p = p->next;
724 delete tmp;
725 }
726 }
727 }
728
addPattern(TexPattern * pattern)729 void TexHyph::addPattern( TexPattern * pattern )
730 {
731 int h = pattern->hash();
732 TexPattern * * p = &table[h];
733 while ( *p && pattern->cmp(*p)<0 )
734 p = &((*p)->next);
735 pattern->next = *p;
736 *p = pattern;
737 _pattern_count++;
738 }
739
getSize()740 lUInt32 TexHyph::getSize() {
741 return _pattern_count * sizeof(TexPattern);
742 }
743
load(LVStreamRef stream)744 bool TexHyph::load( LVStreamRef stream )
745 {
746 int w = isCorrectHyphFile(stream.get());
747 int patternCount = 0;
748 if (w) {
749 _hash = stream->getcrc32();
750 int i;
751 lvsize_t dw;
752
753 lvByteOrderConv cnv;
754
755 int hyph_count = w;
756 thyph hyph;
757
758 lvpos_t p = 78 + (hyph_count * 8 + 2);
759 stream->SetPos(p);
760 if ( stream->SetPos(p)!=p )
761 return false;
762 lChar32 charMap[256] = { 0 };
763 unsigned char buf[0x10000];
764 // make char map table
765 for (i=0; i<hyph_count; i++)
766 {
767 if ( stream->Read( &hyph, 522, &dw )!=LVERR_OK || dw!=522 )
768 return false;
769 cnv.msf( &hyph.len ); //rword(_main_hyph[i].len);
770 lvpos_t newPos;
771 if ( stream->Seek( hyph.len, LVSEEK_CUR, &newPos )!=LVERR_OK )
772 return false;
773
774 cnv.msf( hyph.wl );
775 cnv.msf( hyph.wu );
776 charMap[ (unsigned char)hyph.al ] = hyph.wl;
777 charMap[ (unsigned char)hyph.au ] = hyph.wu;
778 // lChar32 ch = hyph.wl;
779 // CRLog::debug("wl=%s mask=%c%c", LCSTR(lString32(&ch, 1)), hyph.mask0[0], hyph.mask0[1]);
780 if (hyph.mask0[0]!='0'||hyph.mask0[1]!='0') {
781 unsigned char pat[4];
782 pat[0] = hyph.al;
783 pat[1] = hyph.mask0[0];
784 pat[2] = hyph.mask0[1];
785 pat[3] = 0;
786 TexPattern * pattern = new TexPattern(pat, 1, charMap);
787 #if DUMP_PATTERNS==1
788 CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr );
789 #endif
790 if (pattern->overflowed) {
791 // don't use truncated words
792 CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString32(pattern->word)));
793 if (pattern->overflowed > largest_overflowed_word)
794 largest_overflowed_word = pattern->overflowed;
795 delete pattern;
796 }
797 else {
798 addPattern( pattern );
799 patternCount++;
800 }
801 }
802 }
803
804 if ( stream->SetPos(p)!=p )
805 return false;
806
807 for (i=0; i<hyph_count; i++)
808 {
809 stream->Read( &hyph, 522, &dw );
810 if (dw!=522)
811 return false;
812 cnv.msf( &hyph.len );
813
814 stream->Read(buf, hyph.len, &dw);
815 if (dw!=hyph.len)
816 return false;
817
818 unsigned char * p = buf;
819 unsigned char * end_p = p + hyph.len;
820 while ( p < end_p ) {
821 lUInt8 sz = *p++;
822 if ( p + sz > end_p )
823 break;
824 TexPattern * pattern = new TexPattern( p, sz, charMap );
825 #if DUMP_PATTERNS==1
826 CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr);
827 #endif
828 if (pattern->overflowed) {
829 // don't use truncated words
830 CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString32(pattern->word)));
831 if (pattern->overflowed > largest_overflowed_word)
832 largest_overflowed_word = pattern->overflowed;
833 delete pattern;
834 }
835 else {
836 addPattern( pattern );
837 patternCount++;
838 }
839 p += sz + sz + 1;
840 }
841 }
842
843 return patternCount>0;
844 } else {
845 // tex xml format as for FBReader
846 lString32Collection data;
847 HyphPatternReader reader( data );
848 LVXMLParser parser( stream, &reader );
849 if ( !parser.CheckFormat() )
850 return false;
851 if ( !parser.Parse() )
852 return false;
853 if ( !data.length() )
854 return false;
855 for ( int i=0; i<(int)data.length(); i++ ) {
856 data[i].lowercase();
857 TexPattern * pattern = new TexPattern( data[i] );
858 #if DUMP_PATTERNS==1
859 CRLog::debug("Pattern: (%s) '%s' - %s", LCSTR(data[i]), LCSTR(lString32(pattern->word)), pattern->attr);
860 #endif
861 if (pattern->overflowed) {
862 // don't use truncated words
863 CRLog::warn("Pattern overflowed (%d > %d) and ignored: (%s) '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(data[i]), LCSTR(lString32(pattern->word)));
864 if (pattern->overflowed > largest_overflowed_word)
865 largest_overflowed_word = pattern->overflowed;
866 delete pattern;
867 }
868 else {
869 addPattern( pattern );
870 patternCount++;
871 }
872 }
873 return patternCount>0;
874 }
875 }
876
load(lString32 fileName)877 bool TexHyph::load( lString32 fileName )
878 {
879 LVStreamRef stream = LVOpenFileStream( fileName.c_str(), LVOM_READ );
880 if ( stream.isNull() )
881 return false;
882 return load( stream );
883 }
884
885
match(const lChar32 * str,char * mask)886 bool TexHyph::match( const lChar32 * str, char * mask )
887 {
888 bool found = false;
889 TexPattern * res = table[ TexPattern::hash( str ) ];
890 if ( res ) {
891 found = res->match( str, mask ) || found;
892 }
893 res = table[ TexPattern::hash3( str ) ];
894 if ( res ) {
895 found = res->match( str, mask ) || found;
896 }
897 res = table[ TexPattern::hash2( str ) ];
898 if ( res ) {
899 found = res->match( str, mask ) || found;
900 }
901 res = table[ TexPattern::hash1( str ) ];
902 if ( res ) {
903 found = res->match( str, mask ) || found;
904 }
905 return found;
906 }
907
908 //TODO: do we need it?
909 ///// returns false if there is rule disabling hyphenation at specified point
910 //static bool checkHyphenRules( const lChar32 * str, int len, int pos )
911 //{
912 // if ( pos<1 || pos>len-3 )
913 // return false;
914 // lUInt16 props[2] = { 0, 0 };
915 // lStr_getCharProps( str+pos+1, 1, props);
916 // if ( props[0]&CH_PROP_ALPHA_SIGN )
917 // return false;
918 // if ( pos==len-3 ) {
919 // lStr_getCharProps( str+len-2, 2, props);
920 // return (props[0]&CH_PROP_VOWEL) || (props[1]&CH_PROP_VOWEL);
921 // }
922 // if ( pos==1 ) {
923 // lStr_getCharProps( str, 2, props);
924 // return (props[0]&CH_PROP_VOWEL) || (props[1]&CH_PROP_VOWEL);
925 // }
926 // return true;
927 //}
928
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)929 bool TexHyph::hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
930 {
931 if ( HyphMan::_TrustSoftHyphens ) {
932 if ( softhyphens_hyphenate(str, len, widths, flags, hyphCharWidth, maxWidth, flagSize) )
933 return true;
934 }
935 if ( len<=3 )
936 return false;
937 if ( len>=WORD_LENGTH )
938 len = WORD_LENGTH - 2;
939 lChar32 word[WORD_LENGTH+4] = { 0 };
940 char mask[WORD_LENGTH+4] = { 0 };
941
942 // Make word from str, with soft-hyphens stripped out.
943 // Prepend and append a space so patterns can match word boundaries.
944 int wlen;
945 word[0] = ' ';
946 int w = 1;
947 for ( int i=0; i<len; i++ ) {
948 if ( str[i] != UNICODE_SOFT_HYPHEN_CODE ) {
949 word[w++] = str[i];
950 }
951 }
952 wlen = w-1;
953 word[w++] = ' ';
954 if ( wlen<=3 )
955 return false;
956 lStr_lowercase(word+1, wlen);
957 // printf("word:%s => #%s# (%d => %d)\n", LCSTR(lString32(str, len)), LCSTR(lString32(word)), len, wlen);
958
959 #if DUMP_HYPHENATION_WORDS==1
960 CRLog::trace("word to hyphenate: '%s'", LCSTR(lString32(word)));
961 #endif
962
963 // Find matches from dict patterns, at any position in word.
964 // Places where hyphenation is allowed are put into 'mask'.
965 memset( mask, '0', wlen+3 ); // 0x30!
966 bool found = false;
967 for ( int i=0; i<=wlen; i++ ) {
968 found = match( word + i, mask + i ) || found;
969 }
970 if ( !found )
971 return false;
972
973 #if DUMP_HYPHENATION_WORDS==1
974 lString32 buf;
975 lString32 buf2;
976 bool boundFound = false;
977 for ( int i=0; i<wlen; i++ ) {
978 buf << word[i+1];
979 buf2 << word[i+1];
980 buf2 << (lChar32)mask[i+2];
981 // This maxWidth check may be wrong here (in the dump only) because
982 // of a +1 shift and possible more shifts due to soft-hyphens.
983 int nw = widths[i]+hyphCharWidth;
984 if ( (mask[i+2]&1) ) {
985 buf << (lChar32)'-';
986 buf2 << (lChar32)'-';
987 }
988 if ( nw>maxWidth && !boundFound ) {
989 buf << (lChar32)'|';
990 buf2 << (lChar32)'|';
991 boundFound = true;
992 // buf << (lChar32)'-';
993 // buf2 << (lChar32)'-';
994 }
995 }
996 CRLog::trace("Hyphenate: %s %s", LCSTR(buf), LCSTR(buf2) );
997 #endif
998
999 // Use HyphMan global left/right hyphen min, unless set to 0 (the default)
1000 // which means we should use the HyphMethod specific values.
1001 int left_hyphen_min = HyphMan::_LeftHyphenMin ? HyphMan::_LeftHyphenMin : _left_hyphen_min;
1002 int right_hyphen_min = HyphMan::_RightHyphenMin ? HyphMan::_RightHyphenMin : _right_hyphen_min;
1003
1004 // Moves allowed hyphenation positions from 'mask' to the provided 'flags',
1005 // taking soft-hyphen shifts into account
1006 int soft_hyphens_skipped = 0;
1007 bool res = false;
1008 for ( int p=0 ; p<=len-2; p++ ) {
1009 // printf(" char %c\n", str[p]);
1010 if ( str[p] == UNICODE_SOFT_HYPHEN_CODE ) {
1011 soft_hyphens_skipped++;
1012 continue;
1013 }
1014 if (p-soft_hyphens_skipped < left_hyphen_min - 1)
1015 continue;
1016 if (p > len - right_hyphen_min - 1)
1017 continue;
1018 // hyphenate
1019 //00010030100
1020 int nw = widths[p]+hyphCharWidth;
1021 // printf(" word %c\n", word[p+1-soft_hyphens_skipped]);
1022 // p+2 because: +1 because word has a space prepended, and +1 because
1023 // mask[] holds the flag for char n on slot n+1
1024 if ( (mask[p+2-soft_hyphens_skipped]&1) && nw <= maxWidth ) {
1025 if ( flagSize == 2 ) {
1026 lUInt16* flags16 = (lUInt16*) flags;
1027 flags16[p] |= LCHAR_ALLOW_HYPH_WRAP_AFTER;
1028 }
1029 else {
1030 flags[p] |= LCHAR_ALLOW_HYPH_WRAP_AFTER;
1031 }
1032 // printf(" allowed after %c\n", str[p]);
1033 res = true;
1034 }
1035 }
1036 return res;
1037 }
1038
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)1039 bool AlgoHyph::hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
1040 {
1041 if ( HyphMan::_TrustSoftHyphens ) {
1042 if ( softhyphens_hyphenate(str, len, widths, flags, hyphCharWidth, maxWidth, flagSize) )
1043 return true;
1044 }
1045
1046 // Use HyphMan global left/right hyphen min, unless set to 0 (the default)
1047 // which means we should use the HyphMethod specific values.
1048 int left_hyphen_min = HyphMan::_LeftHyphenMin ? HyphMan::_LeftHyphenMin : _left_hyphen_min;
1049 int right_hyphen_min = HyphMan::_RightHyphenMin ? HyphMan::_RightHyphenMin : _right_hyphen_min;
1050
1051 lUInt16 chprops[WORD_LENGTH];
1052 if ( len > WORD_LENGTH-2 )
1053 len = WORD_LENGTH - 2;
1054 lStr_getCharProps( str, len, chprops );
1055 int start, end, i, j;
1056 #define MIN_WORD_LEN_TO_HYPHEN 2
1057 for ( start = 0; start<len; ) {
1058 // find start of word
1059 while (start<len && !(chprops[start] & CH_PROP_ALPHA) )
1060 ++start;
1061 // find end of word
1062 for ( end=start+1; end<len && (chprops[start] & CH_PROP_ALPHA); ++end )
1063 ;
1064 // now look over word, placing hyphens
1065 if ( end-start > MIN_WORD_LEN_TO_HYPHEN ) { // word must be long enough
1066 for (i=start;i<end-MIN_WORD_LEN_TO_HYPHEN;++i) {
1067 if (i-start < left_hyphen_min - 1)
1068 continue;
1069 if (end-i < right_hyphen_min + 1)
1070 continue;
1071 if ( widths[i] > maxWidth )
1072 break;
1073 if ( chprops[i] & CH_PROP_VOWEL ) {
1074 for ( j=i+1; j<end; ++j ) {
1075 if ( chprops[j] & CH_PROP_VOWEL ) {
1076 int next = i+1;
1077 while ( (chprops[next] & CH_PROP_HYPHEN) && next<end-MIN_WORD_LEN_TO_HYPHEN) {
1078 // printf("next++\n");
1079 next++;
1080 }
1081 int next2 = next+1;
1082 while ( (chprops[next2] & CH_PROP_HYPHEN) && next2<end-MIN_WORD_LEN_TO_HYPHEN) {
1083 // printf("next2++\n");
1084 next2++;
1085 }
1086 if ( (chprops[next] & CH_PROP_CONSONANT) && (chprops[next2] & CH_PROP_CONSONANT) )
1087 i = next;
1088 else if ( (chprops[next] & CH_PROP_CONSONANT) && ( chprops[next2] & CH_PROP_ALPHA_SIGN ) )
1089 i = next2;
1090 if ( i-start>=1 && end-i>2 ) {
1091 // insert hyphenation mark
1092 lUInt16 nw = widths[i] + hyphCharWidth;
1093 if ( nw<maxWidth )
1094 {
1095 bool disabled = false;
1096 const char * dblSequences[] = {
1097 "sh", "th", "ph", "ch", NULL
1098 };
1099 next = i+1;
1100 while ( (chprops[next] & CH_PROP_HYPHEN) && next<end-MIN_WORD_LEN_TO_HYPHEN) {
1101 // printf("next3++\n");
1102 next++;
1103 }
1104 for (int k=0; dblSequences[k]; k++)
1105 if (str[i]==dblSequences[k][0] && str[next]==dblSequences[k][1]) {
1106 disabled = true;
1107 break;
1108 }
1109 if (!disabled) {
1110 if ( flagSize == 2 ) {
1111 lUInt16* flags16 = (lUInt16*) flags;
1112 flags16[i] |= LCHAR_ALLOW_HYPH_WRAP_AFTER;
1113 }
1114 else {
1115 flags[i] |= LCHAR_ALLOW_HYPH_WRAP_AFTER;
1116 }
1117 }
1118 //widths[i] = nw; // don't add hyph width
1119 }
1120 }
1121 break;
1122 }
1123 }
1124 }
1125 }
1126 }
1127 start=end;
1128 }
1129 return true;
1130 }
1131
~AlgoHyph()1132 AlgoHyph::~AlgoHyph()
1133 {
1134 }
1135
1136
1137
1138