1 /*
2  *  Kchmviewer - a CHM and EPUB file viewer with broad language support
3  *  Copyright (C) 2004-2014 George Yunaev, gyunaev@ulduzsoft.com
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include <QApplication>
20 #include <QTextCodec>
21 
22 #include "ebook.h"
23 #include "ebook_search.h"
24 #include "helper_search_index.h"
25 
26 static const int DICT_VERSION = 4;
27 
28 namespace QtAs {
29 
30 // Those characters are splitters (i.e. split the word), but added themselves into dictionary too.
31 // This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both
32 // search for "print" and "->print" will find it.
33 static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";
34 
35 // Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug.
36 static const char WORD_CHARACTERS[] = "$_";
37 
38 
39 struct Term
40 {
TermQtAs::Term41 	Term() : frequency(-1) {}
TermQtAs::Term42 	Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {}
43 	QString term;
44 	int frequency;
45 	QVector<Document>documents;
operator <QtAs::Term46 	bool operator<( const Term &i2 ) const { return frequency < i2.frequency; }
47 };
48 
49 
operator >>(QDataStream & s,Document & l)50 QDataStream &operator>>( QDataStream &s, Document &l )
51 {
52 	s >> l.docNumber;
53 	s >> l.frequency;
54 	return s;
55 }
56 
operator <<(QDataStream & s,const Document & l)57 QDataStream &operator<<( QDataStream &s, const Document &l )
58 {
59 	s << (short)l.docNumber;
60 	s << (short)l.frequency;
61 	return s;
62 }
63 
Index()64 Index::Index()
65 	: QObject( 0 )
66 {
67 	lastWindowClosed = false;
68 	connect( qApp, SIGNAL( lastWindowClosed() ), this, SLOT( setLastWinClosed() ) );
69 }
70 
setLastWinClosed()71 void Index::setLastWinClosed()
72 {
73 	lastWindowClosed = true;
74 }
75 
76 
makeIndex(const QList<QUrl> & docs,EBook * chmFile)77 bool Index::makeIndex(const QList< QUrl >& docs, EBook *chmFile )
78 {
79 	if ( docs.isEmpty() )
80 		return false;
81 
82 	docList = docs;
83 
84 	if ( chmFile->hasFeature( EBook::FEATURE_ENCODING ) )
85 		entityDecoder.changeEncoding( QTextCodec::codecForName( chmFile->currentEncoding().toUtf8() ) );
86 
87 	QList< QUrl >::ConstIterator it = docList.begin();
88 	int steps = docList.count() / 100;
89 
90 	if ( !steps )
91 		steps++;
92 
93 	int prog = 0;
94 
95 	for ( int i = 0; it != docList.end(); ++it, ++i )
96 	{
97 		if ( lastWindowClosed )
98 			return false;
99 
100 		QUrl filename = *it;
101 		QStringList terms;
102 
103 		if ( parseDocumentToStringlist( chmFile, filename, terms ) )
104 		{
105 			for ( QStringList::ConstIterator tit = terms.begin(); tit != terms.end(); ++tit )
106 				insertInDict( *tit, i );
107 		}
108 
109 		if ( i%steps == 0 )
110 		{
111 			prog++;
112 			prog = qMin( prog, 99 );
113 			emit indexingProgress( prog, tr("Processing document %1") .arg( (*it).path() ) );
114 		}
115 	}
116 
117 	emit indexingProgress( 100, tr("Processing completed") );
118 	return true;
119 }
120 
121 
insertInDict(const QString & str,int docNum)122 void Index::insertInDict( const QString &str, int docNum )
123 {
124 	Entry *e = 0;
125 	if ( dict.count() )
126 		e = dict[ str ];
127 
128 	if ( e )
129 	{
130 		if ( e->documents.last().docNumber != docNum )
131 			e->documents.append( Document(docNum, 1 ) );
132 		else
133 			e->documents.last().frequency++;
134 	}
135 	else
136 	{
137 		dict.insert( str, new Entry( docNum ) );
138 	}
139 }
140 
141 
parseDocumentToStringlist(EBook * chmFile,const QUrl & filename,QStringList & tokenlist)142 bool Index::parseDocumentToStringlist(EBook *chmFile, const QUrl& filename, QStringList& tokenlist )
143 {
144 	QString parsedbuf, parseentity, text;
145 
146 	if ( !chmFile->getFileContentAsString( text, filename )
147 	|| text.isEmpty() )
148 	{
149 		qWarning( "Search index generator: could not retrieve the document content for %s", qPrintable( filename.toString() ) );
150 		return false;
151 	}
152 
153 	m_charssplit = SPLIT_CHARACTERS;
154 	m_charsword = WORD_CHARACTERS;
155 
156 	tokenlist.clear();
157 
158 	// State machine states
159 	enum state_t
160 	{
161 		STATE_OUTSIDE_TAGS,		// outside HTML tags; parse text
162 		STATE_IN_HTML_TAG,		// inside HTML tags; wait for end tag
163 		STATE_IN_QUOTES,		// inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
164 		STATE_IN_HTML_ENTITY	// inside HTML entity; parse the entity
165 	};
166 
167 	state_t state = STATE_OUTSIDE_TAGS;
168 	QChar QuoteChar; // used in STATE_IN_QUOTES
169 
170 	for ( int j = 0; j < text.length(); j++ )
171 	{
172 		QChar ch = text[j];
173 
174 		if ( (j % 20000) == 0 )
175 			qApp->processEvents( QEventLoop::ExcludeUserInputEvents );
176 
177 		if ( state == STATE_IN_HTML_TAG )
178 		{
179 			// We are inside HTML tag.
180 			// Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
181 			if ( ch == '"' || ch == '\'' )
182 			{
183 				state = STATE_IN_QUOTES;
184 				QuoteChar = ch;
185 			}
186 			else if ( ch == '>' )
187 				state = STATE_OUTSIDE_TAGS;
188 
189 			continue;
190 		}
191 		else if ( state == STATE_IN_QUOTES )
192 		{
193 			// We are inside quoted text inside HTML tag.
194 			// Ignore everything until we see the quote character again
195 			if ( ch == QuoteChar )
196 				state = STATE_IN_HTML_TAG;
197 
198 			continue;
199 		}
200 		else if ( state == STATE_IN_HTML_ENTITY )
201 		{
202 			// We are inside encoded HTML entity (like &nbsp;).
203 			// Collect to parsedbuf everything until we see ;
204 			if ( ch.isLetterOrNumber() )
205 			{
206 				// get next character of this entity
207 				parseentity.append( ch );
208 				continue;
209 			}
210 
211 			// The entity ended
212 			state = STATE_OUTSIDE_TAGS;
213 
214 			// Some shitty HTML does not terminate entities correctly. Screw it.
215 			if ( ch != ';' && ch != '<' )
216 			{
217 				if ( parseentity.isEmpty() )
218 				{
219 					// straight '&' symbol. Add and continue.
220 					parsedbuf += "&";
221 				}
222 				else
223 					qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", qPrintable( parseentity ), ch.toLatin1() );
224 
225 				j--; // parse this character again, but in different state
226 				continue;
227 			}
228 
229 			// Don't we have a space?
230 			if ( parseentity.toLower() != "nbsp" )
231 			{
232 				QString entity = entityDecoder.decode( parseentity );
233 
234 				if ( entity.isNull() )
235 				{
236 					// decodeEntity() already printed error message
237 					//qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
238 					continue;
239 				}
240 
241 				parsedbuf += entity;
242 				continue;
243 			}
244 			else
245 				ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf
246 		}
247 
248 		//
249 		// Now process STATE_OUTSIDE_TAGS
250 		//
251 
252 		// Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
253 		if ( ch == '<' )
254 		{
255 			state = STATE_IN_HTML_TAG;
256 			goto tokenize_buf;
257 		}
258 
259 		// Check for start of HTML entity
260 		if ( ch == '&' )
261 		{
262 			state = STATE_IN_HTML_ENTITY;
263 			parseentity = QString::null;
264 			continue;
265 		}
266 
267 		// Replace quote by ' - quotes are used in search window to set the phrase
268 		if ( ch == '"' )
269 			ch = '\'';
270 
271 		// Ok, we have a valid character outside HTML tags, and probably some in buffer already.
272 		// If it is char or letter, add it and continue
273 		if ( ch.isLetterOrNumber() || m_charsword.indexOf( ch ) != -1 )
274 		{
275 			parsedbuf.append( ch );
276 			continue;
277 		}
278 
279 		// If it is a split char, add the word to the dictionary, and then add the char itself.
280 		if ( m_charssplit.indexOf( ch ) != -1 )
281 		{
282 			if ( !parsedbuf.isEmpty() )
283 				tokenlist.push_back( parsedbuf.toLower() );
284 
285 			tokenlist.push_back( ch.toLower() );
286 			parsedbuf = QString::null;
287 			continue;
288 		}
289 
290 tokenize_buf:
291 		// Just add the word; it is most likely a space or terminated by tokenizer.
292 		if ( !parsedbuf.isEmpty() )
293 		{
294 			tokenlist.push_back( parsedbuf.toLower() );
295 			parsedbuf = QString::null;
296 		}
297 	}
298 
299 	// Add the last word if still here - for broken htmls.
300 	if ( !parsedbuf.isEmpty() )
301 		tokenlist.push_back( parsedbuf.toLower() );
302 
303 	return true;
304 }
305 
306 
writeDict(QDataStream & stream)307 void Index::writeDict( QDataStream& stream )
308 {
309 	stream << DICT_VERSION;
310 	stream << m_charssplit;
311 	stream << m_charsword;
312 
313 	// Document list
314 	stream << docList;
315 
316 	// Dictionary
317 	for( QHash<QString, Entry *>::ConstIterator it = dict.begin(); it != dict.end(); ++it )
318 	{
319 		stream << it.key();
320 		stream << (int) it.value()->documents.count();
321 		stream << it.value()->documents;
322 	}
323 }
324 
325 
readDict(QDataStream & stream)326 bool Index::readDict( QDataStream& stream )
327 {
328 	dict.clear();
329 	docList.clear();
330 
331 	QString key;
332 	int version, numOfDocs;
333 
334 	stream >> version;
335 
336 	if ( version < 2 )
337 		return false;
338 
339 	stream >> m_charssplit;
340 	stream >> m_charsword;
341 
342 	// Read the document list
343 	stream >> docList;
344 
345 	while ( !stream.atEnd() )
346 	{
347 		stream >> key;
348 		stream >> numOfDocs;
349 
350 		QVector<Document> docs( numOfDocs );
351 
352 		stream >> docs;
353 		dict.insert( key, new Entry( docs ) );
354 	}
355 
356 	return dict.size() > 0;
357 }
358 
359 
query(const QStringList & terms,const QStringList & termSeq,const QStringList & seqWords,EBook * chmFile)360 QList< QUrl > Index::query(const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords, EBook *chmFile )
361 {
362 	QList<Term> termList;
363 
364 	QStringList::ConstIterator it = terms.begin();
365 	for ( it = terms.begin(); it != terms.end(); ++it )
366 	{
367 		Entry *e = 0;
368 
369 		if ( dict[ *it ] )
370 		{
371 			e = dict[ *it ];
372 			termList.append( Term( *it, e->documents.count(), e->documents ) );
373 		}
374 		else
375 		{
376 			return QList< QUrl >();
377 		}
378 	}
379 
380 	if ( !termList.count() )
381 		return QList< QUrl >();
382 
383 	qSort( termList );
384 
385 	QVector<Document> minDocs = termList.takeFirst().documents;
386 	for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
387 		Term *t = &(*it);
388 		QVector<Document> docs = t->documents;
389 		for(QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) {
390 			bool found = false;
391 			for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) {
392 				if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
393 					(*minDoc_it).frequency += (*doc_it).frequency;
394 					found = true;
395 					break;
396 				}
397 			}
398 			if ( !found )
399 				minDoc_it = minDocs.erase( minDoc_it );
400 			else
401 				++minDoc_it;
402 		}
403 	}
404 
405 	QList< QUrl > results;
406 	qSort( minDocs );
407 	if ( termSeq.isEmpty() ) {
408 		for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it)
409 			results << docList.at((int)(*it).docNumber);
410 		return results;
411 	}
412 
413 	QUrl fileName;
414 	for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) {
415 		fileName =  docList[ (int)(*it).docNumber ];
416 		if ( searchForPhrases( termSeq, seqWords, fileName, chmFile ) )
417 			results << fileName;
418 	}
419 
420 	return results;
421 }
422 
423 
searchForPhrases(const QStringList & phrases,const QStringList & words,const QUrl & filename,EBook * chmFile)424 bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QUrl &filename, EBook * chmFile )
425 {
426 	QStringList parsed_document;
427 
428 	if ( !parseDocumentToStringlist( chmFile, filename, parsed_document ) )
429 		return false;
430 
431 	miniDict.clear();
432 
433 	// Initialize the dictionary with the words in phrase(s)
434 	for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt )
435 		miniDict.insert( *cIt, new PosEntry( 0 ) );
436 
437 	// Fill the dictionary with the words from the document
438 	unsigned int word_offset = 3;
439 	for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ )
440 	{
441 		PosEntry * entry = miniDict[ *it ];
442 
443 		if ( entry )
444 			entry->positions.append( word_offset );
445 	}
446 
447 	// Dump it
448 /*
449 	QDictIterator<PosEntry> it( miniDict );
450 	for( ; it.current(); ++it )
451 	{
452 		QString text( it.currentKey() );
453 		QValueList<uint> pos = miniDict[text]->positions;
454 		for ( unsigned int i = 1; i < pos.size(); i++ )
455 			text += " " + QString::number( pos[i] );
456 
457 		qDebug( "%s", text.ascii());
458 	}
459 */
460 
461 	QList<uint> first_word_positions;
462 
463 	for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ )
464 	{
465 		QStringList phrasewords = phrase_it->split( ' ' );
466 		first_word_positions = miniDict[ phrasewords[0] ]->positions;
467 
468 		for ( int j = 1; j < phrasewords.count(); ++j )
469 		{
470 			QList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions;
471 			QList<uint>::iterator dict_it = first_word_positions.begin();
472 
473 			while ( dict_it != first_word_positions.end() )
474 			{
475 				if ( next_word_it.indexOf( *dict_it + 1 ) != -1 )
476 				{
477 					(*dict_it)++;
478 					++dict_it;
479 				}
480 				else
481 					dict_it = first_word_positions.erase( dict_it );
482 			}
483 		}
484 	}
485 
486 	if ( first_word_positions.count() )
487 		return true;
488 
489 	return false;
490 }
491 
492 
493 };
494