1 
2 // Copied from coolreader-3.2.49 (crengine/src/hyphman.cpp)
3 
4 #include "my_texhyph.h"
5 #include "my_hyphpatternreader.h"
6 
7 #include <string.h>
8 #include <stdlib.h>
9 
10 #include "lvfnt.h"
11 #include "lvstring.h"
12 #include "lvstring32collection.h"
13 #include "crlog.h"
14 
15 //#define DUMP_PATTERNS 1
16 
// Leading part of the binary dictionary file header.
// isCorrectHyphFile() reads the first 78 bytes of the stream into this
// struct, then inspects only `type` and `numrec`; the remaining fields are
// not referenced anywhere in this file.
// NOTE(review): declared outside the pack(1) region below, so the in-memory
// layout relies on natural alignment matching the on-disk layout (the fields
// add up to 78 bytes if lUInt32 is 4 bytes and lUInt16 is 2 bytes) — confirm.
struct tPDBHdr
{
    char filename[36];
    lUInt32 dw1;
    lUInt32 dw2;
    lUInt32 dw4[4];
    char type[8];       // format signature; must equal "HypHAlR4" (8 chars, no terminator)
    lUInt32 dw44;
    lUInt32 dw48;
    lUInt16 numrec;     // record count; passed through lvByteOrderConv::msf() before use
};
28 
// The structures below are read straight from the file, so they are packed
// to 1-byte alignment.
#pragma pack(push, 1)
// Per-character record of the binary dictionary. MyTexHyph::load() reads it
// as a fixed 522-byte block, immediately followed in the stream by `len`
// bytes of pattern data.
typedef struct {
    lUInt16         wl;         // character code stored into charMap[al] (presumably the lower-case letter)
    lUInt16         wu;         // character code stored into charMap[au] (presumably the upper-case letter)
    char            al;         // 8-bit code used as the charMap index for wl
    char            au;         // 8-bit code used as the charMap index for wu

    unsigned char   mask0[2];   // two attribute characters for the single-letter pattern; "00" means none
    lUInt16         aux[256];   // not referenced in this file

    lUInt16         len;        // size in bytes of the pattern data that follows this record
} thyph;

// Start/length pair; not referenced in this file.
typedef struct {
    lUInt16 start;
    lUInt16 len;
} hyph_index_item_t;
#pragma pack(pop)
47 
48 
isCorrectHyphFile(LVStream * stream)49 static int isCorrectHyphFile(LVStream * stream)
50 {
51     if (!stream)
52         return false;
53     lvsize_t   dw;
54     int    w = 0;
55     tPDBHdr    HDR;
56     stream->SetPos(0);
57     stream->Read( &HDR, 78, &dw);
58     stream->SetPos(0);
59     lvByteOrderConv cnv;
60     w=cnv.msf(HDR.numrec);
61     if (dw!=78 || w>0xff)
62         w = 0;
63 
64     if (strncmp((const char*)&HDR.type, "HypHAlR4", 8) != 0)
65         w = 0;
66 
67     return w;
68 }
69 
70 
71 
MyTexHyph(lString32 id,int leftHyphenMin,int rightHyphenMin)72 MyTexHyph::MyTexHyph( lString32 id, int leftHyphenMin, int rightHyphenMin )
73  : HyphMethod(id, leftHyphenMin, rightHyphenMin)
74 {
75     memset( table, 0, sizeof(table) );
76     _hash = 123456;
77     _pattern_count = 0;
78     largest_overflowed_word = 0;
79 }
80 
~MyTexHyph()81 MyTexHyph::~MyTexHyph()
82 {
83     for ( int i=0; i<PATTERN_HASH_SIZE; i++ ) {
84         MyTexPattern * p = table[i];
85         while (p) {
86             MyTexPattern * tmp = p;
87             p = p->next;
88             delete tmp;
89         }
90     }
91 }
92 
match(const lChar32 * str,char * mask)93 bool MyTexHyph::match( const lChar32 * str, char * mask )
94 {
95     bool found = false;
96     MyTexPattern * res = table[ MyTexPattern::hash( str ) ];
97     if ( res ) {
98         found = res->match( str, mask ) || found;
99     }
100     res = table[ MyTexPattern::hash3( str ) ];
101     if ( res ) {
102         found = res->match( str, mask ) || found;
103     }
104     res = table[ MyTexPattern::hash2( str ) ];
105     if ( res ) {
106         found = res->match( str, mask ) || found;
107     }
108     res = table[ MyTexPattern::hash1( str ) ];
109     if ( res ) {
110         found = res->match( str, mask ) || found;
111     }
112     return found;
113 }
114 
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)115 bool MyTexHyph::hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
116 {
117     // stub
118     return false;
119 }
120 
addPattern(MyTexPattern * pattern)121 void MyTexHyph::addPattern( MyTexPattern * pattern )
122 {
123     int h = pattern->hash();
124     MyTexPattern * * p = &table[h];
125     while ( *p && pattern->cmp(*p)<0 )
126         p = &((*p)->next);
127     pattern->next = *p;
128     *p = pattern;
129     _pattern_count++;
130 }
131 
load(LVStreamRef stream)132 bool MyTexHyph::load( LVStreamRef stream )
133 {
134     int w = isCorrectHyphFile(stream.get());
135     int patternCount = 0;
136     if (w) {
137         _hash = stream->getcrc32();
138         int        i;
139         lvsize_t   dw;
140 
141         lvByteOrderConv cnv;
142 
143         int hyph_count = w;
144         thyph hyph;
145 
146         lvpos_t p = 78 + (hyph_count * 8 + 2);
147         stream->SetPos(p);
148         if ( stream->SetPos(p)!=p )
149             return false;
150         lChar32 charMap[256] = { 0 };
151         unsigned char buf[0x10000];
152         // make char map table
153         for (i=0; i<hyph_count; i++)
154         {
155             if ( stream->Read( &hyph, 522, &dw )!=LVERR_OK || dw!=522 )
156                 return false;
157             cnv.msf( &hyph.len ); //rword(_main_hyph[i].len);
158             lvpos_t newPos;
159             if ( stream->Seek( hyph.len, LVSEEK_CUR, &newPos )!=LVERR_OK )
160                 return false;
161 
162             cnv.msf( hyph.wl );
163             cnv.msf( hyph.wu );
164             charMap[ (unsigned char)hyph.al ] = hyph.wl;
165             charMap[ (unsigned char)hyph.au ] = hyph.wu;
166 //            lChar32 ch = hyph.wl;
167 //            CRLog::debug("wl=%s mask=%c%c", LCSTR(lString32(&ch, 1)), hyph.mask0[0], hyph.mask0[1]);
168             if (hyph.mask0[0]!='0'||hyph.mask0[1]!='0') {
169                 unsigned char pat[4];
170                 pat[0] = hyph.al;
171                 pat[1] = hyph.mask0[0];
172                 pat[2] = hyph.mask0[1];
173                 pat[3] = 0;
174                 MyTexPattern * pattern = new MyTexPattern(pat, 1, charMap);
175 #if DUMP_PATTERNS==1
176                 CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr );
177 #endif
178                 if (pattern->overflowed) {
179                     // don't use truncated words
180                     CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString32(pattern->word)));
181                     if (pattern->overflowed > largest_overflowed_word)
182                         largest_overflowed_word = pattern->overflowed;
183                     delete pattern;
184                 }
185                 else {
186                     addPattern( pattern );
187                     patternCount++;
188                 }
189             }
190         }
191 
192         if ( stream->SetPos(p)!=p )
193             return false;
194 
195         for (i=0; i<hyph_count; i++)
196         {
197             stream->Read( &hyph, 522, &dw );
198             if (dw!=522)
199                 return false;
200             cnv.msf( &hyph.len );
201 
202             stream->Read(buf, hyph.len, &dw);
203             if (dw!=hyph.len)
204                 return false;
205 
206             unsigned char * p = buf;
207             unsigned char * end_p = p + hyph.len;
208             while ( p < end_p ) {
209                 lUInt8 sz = *p++;
210                 if ( p + sz > end_p )
211                     break;
212                 MyTexPattern * pattern = new MyTexPattern( p, sz, charMap );
213 #if DUMP_PATTERNS==1
214                 CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr);
215 #endif
216                 if (pattern->overflowed) {
217                     // don't use truncated words
218                     CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString32(pattern->word)));
219                     if (pattern->overflowed > largest_overflowed_word)
220                         largest_overflowed_word = pattern->overflowed;
221                     delete pattern;
222                 }
223                 else {
224                     addPattern( pattern );
225                     patternCount++;
226                 }
227                 p += sz + sz + 1;
228             }
229         }
230 
231 #if DUMP_PATTERNS==1
232         CRLog::debug("Patterns count = %d", patternCount);
233 #endif
234         return patternCount>0;
235     } else {
236         // tex xml format as for FBReader
237         lString32Collection data;
238         MyHyphPatternReader reader( data );
239         LVXMLParser parser( stream, &reader );
240         if ( !parser.CheckFormat() )
241             return false;
242         if ( !parser.Parse() )
243             return false;
244         if ( !data.length() )
245             return false;
246         for ( int i=0; i<(int)data.length(); i++ ) {
247             data[i].lowercase();
248             MyTexPattern * pattern = new MyTexPattern( data[i] );
249 #if DUMP_PATTERNS==1
250             CRLog::debug("Pattern: (%s) '%s' - %s", LCSTR(data[i]), LCSTR(lString32(pattern->word)), pattern->attr);
251 #endif
252             if (pattern->overflowed) {
253                 // don't use truncated words
254                 CRLog::warn("Pattern overflowed (%d > %d) and ignored: (%s) '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(data[i]), LCSTR(lString32(pattern->word)));
255                 if (pattern->overflowed > largest_overflowed_word)
256                     largest_overflowed_word = pattern->overflowed;
257                 delete pattern;
258             }
259             else {
260                 addPattern( pattern );
261                 patternCount++;
262             }
263         }
264         return patternCount>0;
265     }
266 }
267 
load(lString32 fileName)268 bool MyTexHyph::load( lString32 fileName )
269 {
270     LVStreamRef stream = LVOpenFileStream( fileName.c_str(), LVOM_READ );
271     if ( stream.isNull() )
272         return false;
273     return load( stream );
274 }
275 
getSize()276 lUInt32 MyTexHyph::getSize()
277 {
278     return _pattern_count * sizeof(MyTexPattern);
279 }
280 
281 
dump(LVStreamRef stream,const lString8 & title)282 bool MyTexHyph::dump(LVStreamRef stream, const lString8& title)
283 {
284     if (stream.isNull()) {
285         CRLog::error("Output stream is null!");
286         return false;
287     }
288     CRLog::info("Dictionary contains %d patterns.", _pattern_count);
289 
290     lString32Collection strPatterns;
291     MyTexPattern* pattern;
292     for (lUInt32 i = 0; i < PATTERN_HASH_SIZE; i++) {
293         pattern = table[i];
294         while (pattern != 0) {
295             //CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr);
296             lString32 str;
297             int k = 0;
298             for (int j = 0; pattern->attr[j] && j < MAX_PATTERN_SIZE + 2 && k < MAX_PATTERN_SIZE + 1; j++) {
299                 if (pattern->attr[j] > '0' && pattern->attr[j] <= '9') {
300                     str << pattern->attr[j];
301                     if (pattern->word[k])
302                         str << pattern->word[k++];
303                 } else {
304                     if (pattern->word[k])
305                         str << pattern->word[k++];
306                 }
307             }
308             strPatterns.add(str);
309             pattern = pattern->next;
310         }
311     }
312     if (strPatterns.length() != _pattern_count) {
313         CRLog::error("Not all patterns processed!");
314         return false;
315     }
316     // sort
317     strPatterns.sort();
318     // Write to stream
319     lString8 decStr;
320     *stream << "<?xml version=\"1.0\" encoding=\"utf8\"?>\n";
321     *stream << "<!-- dump of hyphenation dictionary \"";
322     *stream << title << "\"\n";
323     *stream << "  pattern's count ";
324     decStr.appendDecimal(_pattern_count);
325     *stream << decStr;
326     *stream << " -->\n";
327     *stream << "<HyphenationDescription>\n";
328     for (int i = 0; i < strPatterns.length(); i++) {
329         *stream << "<pattern>";
330         *stream << UnicodeToUtf8(strPatterns[i]);
331         *stream << "</pattern>\n";
332     }
333     *stream << "</HyphenationDescription>\n";
334     return true;
335 }
336