1 /****************************************************************************
2 **
3 ** Copyright (C) 2008 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: Qt Software Information (qt-info@nokia.com)
5 **
6 ** This file is part of the Qt Assistant of the Qt Toolkit.
7 **
8 ** Commercial Usage
9 ** Licensees holding valid Qt Commercial licenses may use this file in
10 ** accordance with the Qt Commercial License Agreement provided with the
11 ** Software or, alternatively, in accordance with the terms contained in
12 ** a written agreement between you and Nokia.
13 **
14 **
15 ** GNU General Public License Usage
16 ** Alternatively, this file may be used under the terms of the GNU
17 ** General Public License versions 2.0 or 3.0 as published by the Free
18 ** Software Foundation and appearing in the file LICENSE.GPL included in
19 ** the packaging of this file.  Please review the following information
20 ** to ensure GNU General Public Licensing requirements will be met:
21 ** http://www.fsf.org/licensing/licenses/info/GPLv2.html and
22 ** http://www.gnu.org/copyleft/gpl.html.  In addition, as a special
23 ** exception, Nokia gives you certain additional rights. These rights
24 ** are described in the Nokia Qt GPL Exception version 1.3, included in
25 ** the file GPL_EXCEPTION.txt in this package.
26 **
27 ** Qt for Windows(R) Licensees
28 ** As a special exception, Nokia, as the sole copyright holder for Qt
29 ** Designer, grants users of the Qt/Eclipse Integration plug-in the
30 ** right for the Qt/Eclipse Integration to link to functionality
31 ** provided by Qt Designer and its related libraries.
32 **
33 ** If you are unsure which license is appropriate for your use, please
34 ** contact the sales department at qt-sales@nokia.com.
35 **
36 ****************************************************************************/
37 
#include "HelpIndex.h"

#include <QApplication>
#include <QByteArray>
#include <QDir>
#include <QFile>
#include <QStringList>
#include <QTextCodec>
#include <QTextDocument>
#include <QTextStream>
#include <QTimer>
#include <QUrl>
#include <QtAlgorithms>

#include <algorithm>
#include <cctype>
#include <utility>
54 
55 QT_BEGIN_NAMESPACE
56 
57 struct Term
58 {
59 	Term() = default;
TermTerm60 	Term(QString t, int f, QVector<Document> l)
61 	    : term(std::move(t))
62 	    , frequency(f)
63 	    , documents(std::move(l))
64 	{
65 	}
66 	QString term;
67 	int frequency = -1;
68 	QVector<Document> documents;
operator <Term69 	bool operator<(const Term & i2) const { return frequency < i2.frequency; }
70 };
71 
operator >>(QDataStream & s,Document & l)72 QDataStream & operator>>(QDataStream & s, Document & l)
73 {
74 	s >> l.docNumber;
75 	s >> l.frequency;
76 	return s;
77 }
78 
operator <<(QDataStream & s,const Document & l)79 QDataStream & operator<<(QDataStream & s, const Document & l)
80 {
81 	s << (qint16)l.docNumber;
82 	s << (qint16)l.frequency;
83 	return s;
84 }
85 
HelpIndex(QString dp,const QString &)86 HelpIndex::HelpIndex(QString dp, const QString & /* hp */)
87     : QObject(nullptr)
88     , docPath(std::move(dp))
89 {
90 	alreadyHaveDocList = false;
91 
92 	connect(qApp, SIGNAL(lastWindowClosed()), this, SLOT(setLastWinClosed()));
93 
94 	m_pTimer = new QTimer(this);
95 	m_pTimer->setSingleShot(true);
96 	m_pTimer->setInterval(0);
97 	connect(m_pTimer, SIGNAL(timeout()), this, SLOT(filterNext()));
98 }
99 
HelpIndex(QStringList dl,const QString &)100 HelpIndex::HelpIndex(QStringList dl, const QString & /* hp */)
101     : QObject(nullptr)
102     , docList{ std::move(dl) }
103 {
104 	alreadyHaveDocList = true;
105 
106 	connect(qApp, SIGNAL(lastWindowClosed()), this, SLOT(setLastWinClosed()));
107 }
108 
setLastWinClosed()109 void HelpIndex::setLastWinClosed()
110 {
111 	lastWindowClosed = true;
112 }
113 
setDictionaryFile(const QString & f)114 void HelpIndex::setDictionaryFile(const QString & f)
115 {
116 	dictFile = f;
117 }
118 
setDocListFile(const QString & f)119 void HelpIndex::setDocListFile(const QString & f)
120 {
121 	docListFile = f;
122 }
123 
setDocList(const QStringList & lst)124 void HelpIndex::setDocList(const QStringList & lst)
125 {
126 	docList = lst;
127 }
128 
makeIndex()129 void HelpIndex::makeIndex()
130 {
131 	if(!alreadyHaveDocList)
132 		setupDocumentList();
133 
134 	lastWindowClosed = false;
135 	emit indexingStart(docList.count());
136 	dict.clear();
137 	m_iCurItem = 0;
138 	m_pTimer->start(); //singleshot
139 }
140 
filterNext()141 void HelpIndex::filterNext()
142 {
143 	if(m_iCurItem < docList.count() && !lastWindowClosed)
144 	{
145 		QUrl url(docList.at(m_iCurItem));
146 		parseDocument(url.toLocalFile(), m_iCurItem);
147 		emit indexingProgress(m_iCurItem);
148 		m_iCurItem++;
149 		m_pTimer->start(); //singleshot
150 	}
151 	else
152 	{
153 		emit indexingEnd();
154 	}
155 }
156 
setupDocumentList()157 void HelpIndex::setupDocumentList()
158 {
159 	docList.clear();
160 	titleList.clear();
161 	QDir d(docPath);
162 	QStringList filters{QLatin1String("*.html")};
163 	QStringList lst = d.entryList(filters);
164 	for(auto&& item : lst)
165 	{
166 		QString filename = QLatin1String("file:///") + docPath + QLatin1String("/") + item;
167 		docList.append(filename);
168 		titleList.append(getDocumentTitle(filename));
169 	}
170 }
171 
insertInDict(const QString & str,int docNum)172 void HelpIndex::insertInDict(const QString & str, int docNum)
173 {
174 	if(str == QLatin1String("amp") || str == QLatin1String("nbsp"))
175 		return;
176 	Entry * e = nullptr;
177 	if(dict.count())
178 		e = dict[str];
179 
180 	if(e)
181 	{
182 		if(e->documents.last().docNumber != docNum)
183 			e->documents.append(Document(docNum, 1));
184 		else
185 			e->documents.last().frequency++;
186 	}
187 	else
188 	{
189 		dict.insert(str, new Entry(docNum));
190 	}
191 }
192 
getCharsetForDocument(QFile * file)193 QString HelpIndex::getCharsetForDocument(QFile * file)
194 {
195 	QTextStream s(file);
196 	QString contents = s.readAll();
197 
198 	QString encoding;
199 	int start = contents.indexOf(QLatin1String("<meta"), 0, Qt::CaseInsensitive);
200 	if(start > 0)
201 	{
202 		int end = contents.indexOf(QLatin1String(">"), start);
203 		QString meta = contents.mid(start + 5, end - start);
204 		meta = meta.toLower();
205 		QRegExp r(QLatin1String("charset=([^\"\\s]+)"));
206 		if(r.indexIn(meta) != -1)
207 			encoding = r.cap(1);
208 	}
209 
210 	file->seek(0);
211 	if(encoding.isEmpty())
212 		return QLatin1String("utf-8");
213 	return encoding;
214 }
215 
parseDocument(const QString & filename,int docNum)216 void HelpIndex::parseDocument(const QString & filename, int docNum)
217 {
218 	QFile file(filename);
219 	if(!file.open(QFile::ReadOnly))
220 	{
221 		qWarning("Can't open file %s", qPrintable(filename));
222 		return;
223 	}
224 
225 	QTextStream s(&file);
226 	QString en = getCharsetForDocument(&file);
227 	s.setCodec(QTextCodec::codecForName(en.toLatin1().constData()));
228 
229 	QString text = s.readAll();
230 	if(text.isNull())
231 		return;
232 
233 	bool valid = true;
234 	const QChar * buf = text.unicode();
235 	QChar str[64];
236 	QChar c = buf[0];
237 	int j = 0;
238 	int i = 0;
239 	while(j < text.length())
240 	{
241 		if(c == QLatin1Char('<') || c == QLatin1Char('&'))
242 		{
243 			valid = false;
244 			if(i > 1)
245 				insertInDict(QString(str, i), docNum);
246 			i = 0;
247 			c = buf[++j];
248 			continue;
249 		}
250 		if((c == QLatin1Char('>') || c == QLatin1Char(';')) && !valid)
251 		{
252 			valid = true;
253 			c = buf[++j];
254 			continue;
255 		}
256 		if(!valid)
257 		{
258 			c = buf[++j];
259 			continue;
260 		}
261 		if((c.isLetterOrNumber() || c == QLatin1Char('_')) && i < 63)
262 		{
263 			str[i] = c.toLower();
264 			++i;
265 		}
266 		else
267 		{
268 			if(i > 1)
269 				insertInDict(QString(str, i), docNum);
270 			i = 0;
271 		}
272 		c = buf[++j];
273 	}
274 	if(i > 1)
275 		insertInDict(QString(str, i), docNum);
276 	file.close();
277 }
278 
writeDict()279 void HelpIndex::writeDict()
280 {
281 	QFile f(dictFile);
282 	qDebug("Write dict to %s", f.fileName().toUtf8().data());
283 	if(!f.open(QFile::WriteOnly))
284 		return;
285 	QDataStream s(&f);
286 	for(auto it = dict.cbegin(); it != dict.cend(); ++it)
287 	{
288 		s << it.key();
289 		s << it.value()->documents.count();
290 		s << it.value()->documents;
291 	}
292 	f.close();
293 	writeDocumentList();
294 }
295 
writeDocumentList()296 void HelpIndex::writeDocumentList()
297 {
298 	QFile f(docListFile);
299 	if(!f.open(QFile::WriteOnly))
300 		return;
301 	QDataStream s(&f);
302 	s << docList;
303 
304 	QFile f1(docListFile + ".titles");
305 	if(!f1.open(QFile::WriteOnly))
306 		return;
307 	QDataStream s1(&f1);
308 	s1 << titleList;
309 }
310 
readDict()311 void HelpIndex::readDict()
312 {
313 	QFile f(dictFile);
314 	if(!f.open(QFile::ReadOnly))
315 		return;
316 
317 	dict.clear();
318 	QDataStream s(&f);
319 	QString key;
320 	int numOfDocs;
321 	QVector<Document> docs;
322 	while(!s.atEnd())
323 	{
324 		s >> key;
325 		s >> numOfDocs;
326 		docs.resize(numOfDocs);
327 		s >> docs;
328 		dict.insert(key, new Entry(docs));
329 	}
330 	f.close();
331 	readDocumentList();
332 }
333 
readDocumentList()334 void HelpIndex::readDocumentList()
335 {
336 	QFile f(docListFile);
337 	if(!f.open(QFile::ReadOnly))
338 		return;
339 	QDataStream s(&f);
340 	s >> docList;
341 	QFile f1(docListFile + ".titles");
342 	if(!f1.open(QFile::ReadOnly))
343 		return;
344 	QDataStream s1(&f1);
345 	s1 >> titleList;
346 }
347 
query(const QStringList & terms,const QStringList & termSeq,const QStringList & seqWords)348 QStringList HelpIndex::query(const QStringList & terms, const QStringList & termSeq, const QStringList & seqWords)
349 {
350 	QList<Term> termList;
351 	for(const auto & term : terms)
352 	{
353 		if(term.contains(QLatin1Char('*')))
354 		{
355 			QVector<Document> wcts = setupDummyTerm(getWildcardTerms(term));
356 			termList.append(Term(QLatin1String("dummy"), wcts.count(), wcts));
357 		}
358 		else if(dict[term])
359 		{
360 			auto e = dict[term];
361 			termList.append(Term(term, e->documents.count(), e->documents));
362 		}
363 		else
364 		{
365 			return QStringList();
366 		}
367 	}
368 	if(!termList.count())
369 		return QStringList();
370 	std::sort(termList.begin(), termList.end());
371 
372 	QVector<Document> minDocs = termList.takeFirst().documents;
373 	for(const auto & term : termList)
374 	{
375 		QVector<Document> docs = term.documents;
376 		for(auto minDoc_it = minDocs.begin(); minDoc_it != minDocs.end();)
377 		{
378 			bool found = false;
379 			for(auto&& doc : docs)
380 			{
381 				if(minDoc_it->docNumber == doc.docNumber)
382 				{
383 					minDoc_it->frequency += doc.frequency;
384 					found = true;
385 					break;
386 				}
387 			}
388 			if(!found)
389 				minDoc_it = minDocs.erase(minDoc_it);
390 			else
391 				++minDoc_it;
392 		}
393 	}
394 
395 	QStringList results;
396 	std::sort(minDocs.begin(), minDocs.end());
397 	if(termSeq.isEmpty())
398 	{
399 		for(auto & minDoc : minDocs)
400 			results << docList.at((int)minDoc.docNumber);
401 		return results;
402 	}
403 
404 	for(const auto & minDoc : minDocs)
405 	{
406 		auto fileName = docList[(int)minDoc.docNumber];
407 		if(searchForPattern(termSeq, seqWords, fileName))
408 			results << fileName;
409 	}
410 	return results;
411 }
412 
/**
 * Returns the title of the HTML document at @p fullFileName (a URL).
 *
 * Results are cached per local file name. The title is the text between
 * "<title>" and "</title>" (case-insensitive), converted to plain text when
 * it looks like rich text; a document without such a pair is "Untitled".
 *
 * @return the title, or the local file name when the file cannot be opened
 */
QString HelpIndex::getDocumentTitle(const QString & fullFileName)
{
	QUrl url(fullFileName);
	QString fileName = url.toLocalFile();

	if(documentTitleCache.contains(fileName))
		return documentTitleCache.value(fileName);

	QFile file(fileName);
	if(!file.open(QFile::ReadOnly))
	{
		qWarning("Can't open file %s", qPrintable(fileName));
		return fileName;
	}
	QTextStream s(&file);
	const QString text = s.readAll();

	QString title = tr("Untitled");

	// indexOf() returns -1 when there is no opening tag; without this check
	// the search for the closing tag would start at offset 6 and a stray
	// "</title>" would yield unrelated text as the title.
	const int tagPos = text.indexOf(QLatin1String("<title>"), 0, Qt::CaseInsensitive);
	if(tagPos != -1)
	{
		const int start = tagPos + 7; // length of "<title>"
		const int end = text.indexOf(QLatin1String("</title>"), start, Qt::CaseInsensitive);
		if(end > start)
		{
			title = text.mid(start, end - start);
			if(Qt::mightBeRichText(title))
			{
				QTextDocument doc;
				doc.setHtml(title);
				title = doc.toPlainText();
			}
		}
	}
	documentTitleCache.insert(fileName, title);
	return title;
}
447 
getWildcardTerms(const QString & term)448 QStringList HelpIndex::getWildcardTerms(const QString & term)
449 {
450 	QStringList lst;
451 	QStringList terms = split(term);
452 
453 	for(auto it = dict.begin(); it != dict.end(); ++it)
454 	{
455 		int index = 0;
456 		bool found = false;
457 		QString text(it.key());
458 		for(auto iter = terms.cbegin(); iter != terms.cend(); ++iter)
459 		{
460 			if(*iter == QLatin1String("*"))
461 			{
462 				found = true;
463 				continue;
464 			}
465 			if(iter == terms.cbegin() && (*iter)[0] != text[0])
466 			{
467 				found = false;
468 				break;
469 			}
470 			index = text.indexOf(*iter, index);
471 			if(*iter == terms.last() && index != text.length() - 1)
472 			{
473 				index = text.lastIndexOf(*iter);
474 				if(index != text.length() - iter->length())
475 				{
476 					found = false;
477 					break;
478 				}
479 			}
480 			if(index != -1)
481 			{
482 				found = true;
483 				index += iter->length();
484 				continue;
485 			}
486 			else
487 			{
488 				found = false;
489 				break;
490 			}
491 		}
492 		if(found)
493 			lst << text;
494 	}
495 
496 	return lst;
497 }
498 
split(const QString & str)499 QStringList HelpIndex::split(const QString & str)
500 {
501 	QStringList lst;
502 	int j = 0;
503 	int i = str.indexOf(QLatin1Char('*'), j);
504 
505 	if(str.startsWith(QLatin1String("*")))
506 		lst << QLatin1String("*");
507 
508 	while(i != -1)
509 	{
510 		if(i > j && i <= (int)str.length())
511 		{
512 			lst << str.mid(j, i - j);
513 			lst << QLatin1String("*");
514 		}
515 		j = i + 1;
516 		i = str.indexOf(QLatin1Char('*'), j);
517 	}
518 
519 	int l = str.length() - 1;
520 	if(str.mid(j, l - j + 1).length() > 0)
521 		lst << str.mid(j, l - j + 1);
522 
523 	return lst;
524 }
525 
setupDummyTerm(const QStringList & terms)526 QVector<Document> HelpIndex::setupDummyTerm(const QStringList & terms)
527 {
528 	QList<Term> termList;
529 	for(const auto & term : terms)
530 	{
531 		if(dict[term])
532 		{
533 			auto e = dict[term];
534 			termList.append(Term(term, e->documents.count(), e->documents));
535 		}
536 	}
537 	if(!termList.count())
538 		return QVector<Document>();
539 	std::sort(termList.begin(), termList.end());
540 
541 	auto maxList = termList.takeLast().documents;
542 	for(const auto & term : termList)
543 		for(const auto & doc : term.documents)
544 			if(maxList.indexOf(doc) == -1)
545 				maxList.append(doc);
546 
547 	return maxList;
548 }
549 
buildMiniDict(const QString & str)550 void HelpIndex::buildMiniDict(const QString & str)
551 {
552 	if(miniDict[str])
553 		miniDict[str]->positions.append(wordNum);
554 	++wordNum;
555 }
556 
searchForPattern(const QStringList & patterns,const QStringList & words,const QString & fileName)557 bool HelpIndex::searchForPattern(const QStringList & patterns, const QStringList & words, const QString & fileName)
558 {
559 	QUrl url(fileName);
560 	QString fName = url.toLocalFile();
561 	QFile file(fName);
562 	if(!file.open(QFile::ReadOnly))
563 	{
564 		qWarning("Can't open file %s", qPrintable(fName));
565 		return false;
566 	}
567 
568 	wordNum = 3;
569 	miniDict.clear();
570 	for(auto&& word : words)
571 		miniDict.insert(word, new PosEntry(0));
572 
573 	QTextStream s(&file);
574 	QString text = s.readAll();
575 	bool valid = true;
576 	const QChar * buf = text.unicode();
577 	QChar str[64];
578 	QChar c = buf[0];
579 	int j = 0;
580 	int i = 0;
581 	while(j < text.length())
582 	{
583 		if(c == QLatin1Char('<') || c == QLatin1Char('&'))
584 		{
585 			valid = false;
586 			if(i > 1)
587 				buildMiniDict(QString(str, i));
588 			i = 0;
589 			c = buf[++j];
590 			continue;
591 		}
592 		if((c == QLatin1Char('>') || c == QLatin1Char(';')) && !valid)
593 		{
594 			valid = true;
595 			c = buf[++j];
596 			continue;
597 		}
598 		if(!valid)
599 		{
600 			c = buf[++j];
601 			continue;
602 		}
603 		if((c.isLetterOrNumber() || c == QLatin1Char('_')) && i < 63)
604 		{
605 			str[i] = c.toLower();
606 			++i;
607 		}
608 		else
609 		{
610 			if(i > 1)
611 				buildMiniDict(QString(str, i));
612 			i = 0;
613 		}
614 		c = buf[++j];
615 	}
616 	if(i > 1)
617 		buildMiniDict(QString(str, i));
618 	file.close();
619 
620 	QStringList wordLst;
621 	QList<uint> a;
622 	for(auto&& pattern : patterns)
623 	{
624 		wordLst = pattern.split(QLatin1Char(' '));
625 		a = miniDict[wordLst[0]]->positions;
626 		for(int j = 1; j < (int)wordLst.count(); ++j)
627 		{
628 			auto b = miniDict[wordLst[j]]->positions;
629 			auto aIt = a.begin();
630 			while(aIt != a.end())
631 			{
632 				if(b.contains(*aIt + 1))
633 				{
634 					(*aIt)++;
635 					++aIt;
636 				}
637 				else
638 				{
639 					aIt = a.erase(aIt);
640 				}
641 			}
642 		}
643 	}
644 	if(a.count())
645 		return true;
646 	return false;
647 }
648 
649 QT_END_NAMESPACE
650