1 /*
2 * Kchmviewer - a CHM and EPUB file viewer with broad language support
3 * Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <QApplication>
20 #include <QTextCodec>
21
22 #include "ebook.h"
23 #include "ebook_search.h"
24 #include "helper_search_index.h"
25
26 static const int DICT_VERSION = 4;
27
28 namespace QtAs {
29
30 // Those characters are splitters (i.e. split the word), but added themselves into dictionary too.
31 // This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both
32 // search for "print" and "->print" will find it.
33 static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";
34
35 // Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug.
36 static const char WORD_CHARACTERS[] = "$_";
37
38
39 struct Term
40 {
TermQtAs::Term41 Term() : frequency(-1) {}
TermQtAs::Term42 Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {}
43 QString term;
44 int frequency;
45 QVector<Document>documents;
operator <QtAs::Term46 bool operator<( const Term &i2 ) const { return frequency < i2.frequency; }
47 };
48
49
operator >>(QDataStream & s,Document & l)50 QDataStream &operator>>( QDataStream &s, Document &l )
51 {
52 s >> l.docNumber;
53 s >> l.frequency;
54 return s;
55 }
56
operator <<(QDataStream & s,const Document & l)57 QDataStream &operator<<( QDataStream &s, const Document &l )
58 {
59 s << (short)l.docNumber;
60 s << (short)l.frequency;
61 return s;
62 }
63
Index()64 Index::Index()
65 : QObject( 0 )
66 {
67 lastWindowClosed = false;
68 connect( qApp, SIGNAL( lastWindowClosed() ), this, SLOT( setLastWinClosed() ) );
69 }
70
setLastWinClosed()71 void Index::setLastWinClosed()
72 {
73 lastWindowClosed = true;
74 }
75
76
makeIndex(const QList<QUrl> & docs,EBook * chmFile)77 bool Index::makeIndex(const QList< QUrl >& docs, EBook *chmFile )
78 {
79 if ( docs.isEmpty() )
80 return false;
81
82 docList = docs;
83
84 if ( chmFile->hasFeature( EBook::FEATURE_ENCODING ) )
85 entityDecoder.changeEncoding( QTextCodec::codecForName( chmFile->currentEncoding().toUtf8() ) );
86
87 QList< QUrl >::ConstIterator it = docList.begin();
88 int steps = docList.count() / 100;
89
90 if ( !steps )
91 steps++;
92
93 int prog = 0;
94
95 for ( int i = 0; it != docList.end(); ++it, ++i )
96 {
97 if ( lastWindowClosed )
98 return false;
99
100 QUrl filename = *it;
101 QStringList terms;
102
103 if ( parseDocumentToStringlist( chmFile, filename, terms ) )
104 {
105 for ( QStringList::ConstIterator tit = terms.begin(); tit != terms.end(); ++tit )
106 insertInDict( *tit, i );
107 }
108
109 if ( i%steps == 0 )
110 {
111 prog++;
112 prog = qMin( prog, 99 );
113 emit indexingProgress( prog, tr("Processing document %1") .arg( (*it).path() ) );
114 }
115 }
116
117 emit indexingProgress( 100, tr("Processing completed") );
118 return true;
119 }
120
121
insertInDict(const QString & str,int docNum)122 void Index::insertInDict( const QString &str, int docNum )
123 {
124 Entry *e = 0;
125 if ( dict.count() )
126 e = dict[ str ];
127
128 if ( e )
129 {
130 if ( e->documents.last().docNumber != docNum )
131 e->documents.append( Document(docNum, 1 ) );
132 else
133 e->documents.last().frequency++;
134 }
135 else
136 {
137 dict.insert( str, new Entry( docNum ) );
138 }
139 }
140
141
parseDocumentToStringlist(EBook * chmFile,const QUrl & filename,QStringList & tokenlist)142 bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl& filename, QStringList& tokenlist )
143 {
144 QString parsedbuf, parseentity, text;
145
146 if ( !chmFile->getFileContentAsString( text, filename )
147 || text.isEmpty() )
148 {
149 qWarning( "Search index generator: could not retrieve the document content for %s", qPrintable( filename.toString() ) );
150 return false;
151 }
152
153 m_charssplit = SPLIT_CHARACTERS;
154 m_charsword = WORD_CHARACTERS;
155
156 tokenlist.clear();
157
158 // State machine states
159 enum state_t
160 {
161 STATE_OUTSIDE_TAGS, // outside HTML tags; parse text
162 STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag
163 STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
164 STATE_IN_HTML_ENTITY // inside HTML entity; parse the entity
165 };
166
167 state_t state = STATE_OUTSIDE_TAGS;
168 QChar QuoteChar; // used in STATE_IN_QUOTES
169
170 for ( int j = 0; j < text.length(); j++ )
171 {
172 QChar ch = text[j];
173
174 if ( (j % 20000) == 0 )
175 qApp->processEvents( QEventLoop::ExcludeUserInputEvents );
176
177 if ( state == STATE_IN_HTML_TAG )
178 {
179 // We are inside HTML tag.
180 // Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
181 if ( ch == '"' || ch == '\'' )
182 {
183 state = STATE_IN_QUOTES;
184 QuoteChar = ch;
185 }
186 else if ( ch == '>' )
187 state = STATE_OUTSIDE_TAGS;
188
189 continue;
190 }
191 else if ( state == STATE_IN_QUOTES )
192 {
193 // We are inside quoted text inside HTML tag.
194 // Ignore everything until we see the quote character again
195 if ( ch == QuoteChar )
196 state = STATE_IN_HTML_TAG;
197
198 continue;
199 }
200 else if ( state == STATE_IN_HTML_ENTITY )
201 {
202 // We are inside encoded HTML entity (like ).
203 // Collect to parsedbuf everything until we see ;
204 if ( ch.isLetterOrNumber() )
205 {
206 // get next character of this entity
207 parseentity.append( ch );
208 continue;
209 }
210
211 // The entity ended
212 state = STATE_OUTSIDE_TAGS;
213
214 // Some shitty HTML does not terminate entities correctly. Screw it.
215 if ( ch != ';' && ch != '<' )
216 {
217 if ( parseentity.isEmpty() )
218 {
219 // straight '&' symbol. Add and continue.
220 parsedbuf += "&";
221 }
222 else
223 qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable( parseentity ), ch.toLatin1() );
224
225 j--; // parse this character again, but in different state
226 continue;
227 }
228
229 // Don't we have a space?
230 if ( parseentity.toLower() != "nbsp" )
231 {
232 QString entity = entityDecoder.decode( parseentity );
233
234 if ( entity.isNull() )
235 {
236 // decodeEntity() already printed error message
237 //qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
238 continue;
239 }
240
241 parsedbuf += entity;
242 continue;
243 }
244 else
245 ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf
246 }
247
248 //
249 // Now process STATE_OUTSIDE_TAGS
250 //
251
252 // Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
253 if ( ch == '<' )
254 {
255 state = STATE_IN_HTML_TAG;
256 goto tokenize_buf;
257 }
258
259 // Check for start of HTML entity
260 if ( ch == '&' )
261 {
262 state = STATE_IN_HTML_ENTITY;
263 parseentity = QString::null;
264 continue;
265 }
266
267 // Replace quote by ' - quotes are used in search window to set the phrase
268 if ( ch == '"' )
269 ch = '\'';
270
271 // Ok, we have a valid character outside HTML tags, and probably some in buffer already.
272 // If it is char or letter, add it and continue
273 if ( ch.isLetterOrNumber() || m_charsword.indexOf( ch ) != -1 )
274 {
275 parsedbuf.append( ch );
276 continue;
277 }
278
279 // If it is a split char, add the word to the dictionary, and then add the char itself.
280 if ( m_charssplit.indexOf( ch ) != -1 )
281 {
282 if ( !parsedbuf.isEmpty() )
283 tokenlist.push_back( parsedbuf.toLower() );
284
285 tokenlist.push_back( ch.toLower() );
286 parsedbuf = QString::null;
287 continue;
288 }
289
290 tokenize_buf:
291 // Just add the word; it is most likely a space or terminated by tokenizer.
292 if ( !parsedbuf.isEmpty() )
293 {
294 tokenlist.push_back( parsedbuf.toLower() );
295 parsedbuf = QString::null;
296 }
297 }
298
299 // Add the last word if still here - for broken htmls.
300 if ( !parsedbuf.isEmpty() )
301 tokenlist.push_back( parsedbuf.toLower() );
302
303 return true;
304 }
305
306
writeDict(QDataStream & stream)307 void Index::writeDict( QDataStream& stream )
308 {
309 stream << DICT_VERSION;
310 stream << m_charssplit;
311 stream << m_charsword;
312
313 // Document list
314 stream << docList;
315
316 // Dictionary
317 for( QHash<QString, Entry *>::ConstIterator it = dict.begin(); it != dict.end(); ++it )
318 {
319 stream << it.key();
320 stream << (int) it.value()->documents.count();
321 stream << it.value()->documents;
322 }
323 }
324
325
readDict(QDataStream & stream)326 bool Index::readDict( QDataStream& stream )
327 {
328 dict.clear();
329 docList.clear();
330
331 QString key;
332 int version, numOfDocs;
333
334 stream >> version;
335
336 if ( version < 2 )
337 return false;
338
339 stream >> m_charssplit;
340 stream >> m_charsword;
341
342 // Read the document list
343 stream >> docList;
344
345 while ( !stream.atEnd() )
346 {
347 stream >> key;
348 stream >> numOfDocs;
349
350 QVector<Document> docs( numOfDocs );
351
352 stream >> docs;
353 dict.insert( key, new Entry( docs ) );
354 }
355
356 return dict.size() > 0;
357 }
358
359
query(const QStringList & terms,const QStringList & termSeq,const QStringList & seqWords,EBook * chmFile)360 QList< QUrl > Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile )
361 {
362 QList<Term> termList;
363
364 QStringList::ConstIterator it = terms.begin();
365 for ( it = terms.begin(); it != terms.end(); ++it )
366 {
367 Entry *e = 0;
368
369 if ( dict[ *it ] )
370 {
371 e = dict[ *it ];
372 termList.append( Term( *it, e->documents.count(), e->documents ) );
373 }
374 else
375 {
376 return QList< QUrl >();
377 }
378 }
379
380 if ( !termList.count() )
381 return QList< QUrl >();
382
383 qSort( termList );
384
385 QVector<Document> minDocs = termList.takeFirst().documents;
386 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
387 Term *t = &(*it);
388 QVector<Document> docs = t->documents;
389 for(QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) {
390 bool found = false;
391 for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) {
392 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
393 (*minDoc_it).frequency += (*doc_it).frequency;
394 found = true;
395 break;
396 }
397 }
398 if ( !found )
399 minDoc_it = minDocs.erase( minDoc_it );
400 else
401 ++minDoc_it;
402 }
403 }
404
405 QList< QUrl > results;
406 qSort( minDocs );
407 if ( termSeq.isEmpty() ) {
408 for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it)
409 results << docList.at((int)(*it).docNumber);
410 return results;
411 }
412
413 QUrl fileName;
414 for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) {
415 fileName = docList[ (int)(*it).docNumber ];
416 if ( searchForPhrases( termSeq, seqWords, fileName, chmFile ) )
417 results << fileName;
418 }
419
420 return results;
421 }
422
423
searchForPhrases(const QStringList & phrases,const QStringList & words,const QUrl & filename,EBook * chmFile)424 bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook * chmFile )
425 {
426 QStringList parsed_document;
427
428 if ( !parseDocumentToStringlist( chmFile, filename, parsed_document ) )
429 return false;
430
431 miniDict.clear();
432
433 // Initialize the dictionary with the words in phrase(s)
434 for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt )
435 miniDict.insert( *cIt, new PosEntry( 0 ) );
436
437 // Fill the dictionary with the words from the document
438 unsigned int word_offset = 3;
439 for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ )
440 {
441 PosEntry * entry = miniDict[ *it ];
442
443 if ( entry )
444 entry->positions.append( word_offset );
445 }
446
447 // Dump it
448 /*
449 QDictIterator<PosEntry> it( miniDict );
450 for( ; it.current(); ++it )
451 {
452 QString text( it.currentKey() );
453 QValueList<uint> pos = miniDict[text]->positions;
454 for ( unsigned int i = 1; i < pos.size(); i++ )
455 text += " " + QString::number( pos[i] );
456
457 qDebug( "%s", text.ascii());
458 }
459 */
460
461 QList<uint> first_word_positions;
462
463 for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ )
464 {
465 QStringList phrasewords = phrase_it->split( ' ' );
466 first_word_positions = miniDict[ phrasewords[0] ]->positions;
467
468 for ( int j = 1; j < phrasewords.count(); ++j )
469 {
470 QList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions;
471 QList<uint>::iterator dict_it = first_word_positions.begin();
472
473 while ( dict_it != first_word_positions.end() )
474 {
475 if ( next_word_it.indexOf( *dict_it + 1 ) != -1 )
476 {
477 (*dict_it)++;
478 ++dict_it;
479 }
480 else
481 dict_it = first_word_positions.erase( dict_it );
482 }
483 }
484 }
485
486 if ( first_word_positions.count() )
487 return true;
488
489 return false;
490 }
491
492
493 };
494