1
2 // Copied from coolreader-3.2.49 (crengine/src/hyphman.cpp)
3
4 #include "my_texhyph.h"
5 #include "my_hyphpatternreader.h"
6
7 #include <string.h>
8 #include <stdlib.h>
9
10 #include "lvfnt.h"
11 #include "lvstring.h"
12 #include "lvstring32collection.h"
13 #include "crlog.h"
14
15 //#define DUMP_PATTERNS 1
16
// Header found at the very start of a binary hyphenation dictionary.
// isCorrectHyphFile() reads the first 78 bytes of the stream directly into
// this struct, so the field order and sizes must mirror the file layout.
// NOTE(review): the struct is declared outside the #pragma pack region below,
// so the raw read relies on the compiler inserting no padding between the
// fields (assumes lUInt32 is 4 bytes and lUInt16 is 2 bytes -- confirm).
// Only `type` and `numrec` are consulted in this file.
struct tPDBHdr
{
    char filename[36];
    lUInt32 dw1;
    lUInt32 dw2;
    lUInt32 dw4[4];
    char type[8];       // format signature; must equal "HypHAlR4"
    lUInt32 dw44;
    lUInt32 dw48;
    lUInt16 numrec;     // record count, stored most-significant-byte first
};
28
29 #pragma pack(push, 1)
30 typedef struct {
31 lUInt16 wl;
32 lUInt16 wu;
33 char al;
34 char au;
35
36 unsigned char mask0[2];
37 lUInt16 aux[256];
38
39 lUInt16 len;
40 } thyph;
41
42 typedef struct {
43 lUInt16 start;
44 lUInt16 len;
45 } hyph_index_item_t;
46 #pragma pack(pop)
47
48
isCorrectHyphFile(LVStream * stream)49 static int isCorrectHyphFile(LVStream * stream)
50 {
51 if (!stream)
52 return false;
53 lvsize_t dw;
54 int w = 0;
55 tPDBHdr HDR;
56 stream->SetPos(0);
57 stream->Read( &HDR, 78, &dw);
58 stream->SetPos(0);
59 lvByteOrderConv cnv;
60 w=cnv.msf(HDR.numrec);
61 if (dw!=78 || w>0xff)
62 w = 0;
63
64 if (strncmp((const char*)&HDR.type, "HypHAlR4", 8) != 0)
65 w = 0;
66
67 return w;
68 }
69
70
71
MyTexHyph(lString32 id,int leftHyphenMin,int rightHyphenMin)72 MyTexHyph::MyTexHyph( lString32 id, int leftHyphenMin, int rightHyphenMin )
73 : HyphMethod(id, leftHyphenMin, rightHyphenMin)
74 {
75 memset( table, 0, sizeof(table) );
76 _hash = 123456;
77 _pattern_count = 0;
78 largest_overflowed_word = 0;
79 }
80
~MyTexHyph()81 MyTexHyph::~MyTexHyph()
82 {
83 for ( int i=0; i<PATTERN_HASH_SIZE; i++ ) {
84 MyTexPattern * p = table[i];
85 while (p) {
86 MyTexPattern * tmp = p;
87 p = p->next;
88 delete tmp;
89 }
90 }
91 }
92
match(const lChar32 * str,char * mask)93 bool MyTexHyph::match( const lChar32 * str, char * mask )
94 {
95 bool found = false;
96 MyTexPattern * res = table[ MyTexPattern::hash( str ) ];
97 if ( res ) {
98 found = res->match( str, mask ) || found;
99 }
100 res = table[ MyTexPattern::hash3( str ) ];
101 if ( res ) {
102 found = res->match( str, mask ) || found;
103 }
104 res = table[ MyTexPattern::hash2( str ) ];
105 if ( res ) {
106 found = res->match( str, mask ) || found;
107 }
108 res = table[ MyTexPattern::hash1( str ) ];
109 if ( res ) {
110 found = res->match( str, mask ) || found;
111 }
112 return found;
113 }
114
hyphenate(const lChar32 * str,int len,lUInt16 * widths,lUInt8 * flags,lUInt16 hyphCharWidth,lUInt16 maxWidth,size_t flagSize)115 bool MyTexHyph::hyphenate( const lChar32 * str, int len, lUInt16 * widths, lUInt8 * flags, lUInt16 hyphCharWidth, lUInt16 maxWidth, size_t flagSize )
116 {
117 // stub
118 return false;
119 }
120
addPattern(MyTexPattern * pattern)121 void MyTexHyph::addPattern( MyTexPattern * pattern )
122 {
123 int h = pattern->hash();
124 MyTexPattern * * p = &table[h];
125 while ( *p && pattern->cmp(*p)<0 )
126 p = &((*p)->next);
127 pattern->next = *p;
128 *p = pattern;
129 _pattern_count++;
130 }
131
load(LVStreamRef stream)132 bool MyTexHyph::load( LVStreamRef stream )
133 {
134 int w = isCorrectHyphFile(stream.get());
135 int patternCount = 0;
136 if (w) {
137 _hash = stream->getcrc32();
138 int i;
139 lvsize_t dw;
140
141 lvByteOrderConv cnv;
142
143 int hyph_count = w;
144 thyph hyph;
145
146 lvpos_t p = 78 + (hyph_count * 8 + 2);
147 stream->SetPos(p);
148 if ( stream->SetPos(p)!=p )
149 return false;
150 lChar32 charMap[256] = { 0 };
151 unsigned char buf[0x10000];
152 // make char map table
153 for (i=0; i<hyph_count; i++)
154 {
155 if ( stream->Read( &hyph, 522, &dw )!=LVERR_OK || dw!=522 )
156 return false;
157 cnv.msf( &hyph.len ); //rword(_main_hyph[i].len);
158 lvpos_t newPos;
159 if ( stream->Seek( hyph.len, LVSEEK_CUR, &newPos )!=LVERR_OK )
160 return false;
161
162 cnv.msf( hyph.wl );
163 cnv.msf( hyph.wu );
164 charMap[ (unsigned char)hyph.al ] = hyph.wl;
165 charMap[ (unsigned char)hyph.au ] = hyph.wu;
166 // lChar32 ch = hyph.wl;
167 // CRLog::debug("wl=%s mask=%c%c", LCSTR(lString32(&ch, 1)), hyph.mask0[0], hyph.mask0[1]);
168 if (hyph.mask0[0]!='0'||hyph.mask0[1]!='0') {
169 unsigned char pat[4];
170 pat[0] = hyph.al;
171 pat[1] = hyph.mask0[0];
172 pat[2] = hyph.mask0[1];
173 pat[3] = 0;
174 MyTexPattern * pattern = new MyTexPattern(pat, 1, charMap);
175 #if DUMP_PATTERNS==1
176 CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr );
177 #endif
178 if (pattern->overflowed) {
179 // don't use truncated words
180 CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString32(pattern->word)));
181 if (pattern->overflowed > largest_overflowed_word)
182 largest_overflowed_word = pattern->overflowed;
183 delete pattern;
184 }
185 else {
186 addPattern( pattern );
187 patternCount++;
188 }
189 }
190 }
191
192 if ( stream->SetPos(p)!=p )
193 return false;
194
195 for (i=0; i<hyph_count; i++)
196 {
197 stream->Read( &hyph, 522, &dw );
198 if (dw!=522)
199 return false;
200 cnv.msf( &hyph.len );
201
202 stream->Read(buf, hyph.len, &dw);
203 if (dw!=hyph.len)
204 return false;
205
206 unsigned char * p = buf;
207 unsigned char * end_p = p + hyph.len;
208 while ( p < end_p ) {
209 lUInt8 sz = *p++;
210 if ( p + sz > end_p )
211 break;
212 MyTexPattern * pattern = new MyTexPattern( p, sz, charMap );
213 #if DUMP_PATTERNS==1
214 CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr);
215 #endif
216 if (pattern->overflowed) {
217 // don't use truncated words
218 CRLog::warn("Pattern overflowed (%d > %d) and ignored: '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(lString32(pattern->word)));
219 if (pattern->overflowed > largest_overflowed_word)
220 largest_overflowed_word = pattern->overflowed;
221 delete pattern;
222 }
223 else {
224 addPattern( pattern );
225 patternCount++;
226 }
227 p += sz + sz + 1;
228 }
229 }
230
231 #if DUMP_PATTERNS==1
232 CRLog::debug("Patterns count = %d", patternCount);
233 #endif
234 return patternCount>0;
235 } else {
236 // tex xml format as for FBReader
237 lString32Collection data;
238 MyHyphPatternReader reader( data );
239 LVXMLParser parser( stream, &reader );
240 if ( !parser.CheckFormat() )
241 return false;
242 if ( !parser.Parse() )
243 return false;
244 if ( !data.length() )
245 return false;
246 for ( int i=0; i<(int)data.length(); i++ ) {
247 data[i].lowercase();
248 MyTexPattern * pattern = new MyTexPattern( data[i] );
249 #if DUMP_PATTERNS==1
250 CRLog::debug("Pattern: (%s) '%s' - %s", LCSTR(data[i]), LCSTR(lString32(pattern->word)), pattern->attr);
251 #endif
252 if (pattern->overflowed) {
253 // don't use truncated words
254 CRLog::warn("Pattern overflowed (%d > %d) and ignored: (%s) '%s'", pattern->overflowed, MAX_PATTERN_SIZE, LCSTR(data[i]), LCSTR(lString32(pattern->word)));
255 if (pattern->overflowed > largest_overflowed_word)
256 largest_overflowed_word = pattern->overflowed;
257 delete pattern;
258 }
259 else {
260 addPattern( pattern );
261 patternCount++;
262 }
263 }
264 return patternCount>0;
265 }
266 }
267
load(lString32 fileName)268 bool MyTexHyph::load( lString32 fileName )
269 {
270 LVStreamRef stream = LVOpenFileStream( fileName.c_str(), LVOM_READ );
271 if ( stream.isNull() )
272 return false;
273 return load( stream );
274 }
275
getSize()276 lUInt32 MyTexHyph::getSize()
277 {
278 return _pattern_count * sizeof(MyTexPattern);
279 }
280
281
dump(LVStreamRef stream,const lString8 & title)282 bool MyTexHyph::dump(LVStreamRef stream, const lString8& title)
283 {
284 if (stream.isNull()) {
285 CRLog::error("Output stream is null!");
286 return false;
287 }
288 CRLog::info("Dictionary contains %d patterns.", _pattern_count);
289
290 lString32Collection strPatterns;
291 MyTexPattern* pattern;
292 for (lUInt32 i = 0; i < PATTERN_HASH_SIZE; i++) {
293 pattern = table[i];
294 while (pattern != 0) {
295 //CRLog::debug("Pattern: '%s' - %s", LCSTR(lString32(pattern->word)), pattern->attr);
296 lString32 str;
297 int k = 0;
298 for (int j = 0; pattern->attr[j] && j < MAX_PATTERN_SIZE + 2 && k < MAX_PATTERN_SIZE + 1; j++) {
299 if (pattern->attr[j] > '0' && pattern->attr[j] <= '9') {
300 str << pattern->attr[j];
301 if (pattern->word[k])
302 str << pattern->word[k++];
303 } else {
304 if (pattern->word[k])
305 str << pattern->word[k++];
306 }
307 }
308 strPatterns.add(str);
309 pattern = pattern->next;
310 }
311 }
312 if (strPatterns.length() != _pattern_count) {
313 CRLog::error("Not all patterns processed!");
314 return false;
315 }
316 // sort
317 strPatterns.sort();
318 // Write to stream
319 lString8 decStr;
320 *stream << "<?xml version=\"1.0\" encoding=\"utf8\"?>\n";
321 *stream << "<!-- dump of hyphenation dictionary \"";
322 *stream << title << "\"\n";
323 *stream << " pattern's count ";
324 decStr.appendDecimal(_pattern_count);
325 *stream << decStr;
326 *stream << " -->\n";
327 *stream << "<HyphenationDescription>\n";
328 for (int i = 0; i < strPatterns.length(); i++) {
329 *stream << "<pattern>";
330 *stream << UnicodeToUtf8(strPatterns[i]);
331 *stream << "</pattern>\n";
332 }
333 *stream << "</HyphenationDescription>\n";
334 return true;
335 }
336