1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the Qt Assistant of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file.  Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include "qhelpsearchindexwriter_default_p.h"
43 #include "qhelp_global.h"
44 #include "qhelpenginecore.h"
45 
46 #include <QtCore/QDir>
47 #include <QtCore/QSet>
48 #include <QtCore/QUrl>
49 #include <QtCore/QFile>
50 #include <QtCore/QRegExp>
51 #include <QtCore/QVariant>
52 #include <QtCore/QFileInfo>
53 #include <QtCore/QTextCodec>
54 #include <QtCore/QTextStream>
55 
56 QT_BEGIN_NAMESPACE
57 
58 namespace fulltextsearch {
59 namespace std {
60 
Writer(const QString & path)61 Writer::Writer(const QString &path)
62     : indexPath(path)
63     , indexFile(QString())
64     , documentFile(QString())
65 {
66     // nothing todo
67 }
68 
~Writer()69 Writer::~Writer()
70 {
71     reset();
72 }
73 
reset()74 void Writer::reset()
75 {
76     for(QHash<QString, Entry*>::ConstIterator it =
77         index.begin(); it != index.end(); ++it) {
78             delete it.value();
79     }
80 
81     index.clear();
82     documentList.clear();
83 }
84 
writeIndex() const85 bool Writer::writeIndex() const
86 {
87     bool status;
88     QFile idxFile(indexFile);
89     if (!(status = idxFile.open(QFile::WriteOnly)))
90         return status;
91 
92     QDataStream indexStream(&idxFile);
93     for(QHash<QString, Entry*>::ConstIterator it =
94         index.begin(); it != index.end(); ++it) {
95         indexStream << it.key();
96         indexStream << it.value()->documents.count();
97         indexStream << it.value()->documents;
98     }
99     idxFile.close();
100 
101     QFile docFile(documentFile);
102     if (!(status = docFile.open(QFile::WriteOnly)))
103         return status;
104 
105     QDataStream docStream(&docFile);
106     foreach(const QStringList &list, documentList) {
107         docStream << list.at(0);
108         docStream << list.at(1);
109     }
110     docFile.close();
111 
112     return status;
113 }
114 
removeIndex() const115 void Writer::removeIndex() const
116 {
117     QFile idxFile(indexFile);
118     if (idxFile.exists())
119         idxFile.remove();
120 
121     QFile docFile(documentFile);
122     if (docFile.exists())
123         docFile.remove();
124 }
125 
setIndexFile(const QString & namespaceName,const QString & attributes)126 void Writer::setIndexFile(const QString &namespaceName, const QString &attributes)
127 {
128     QString extension = namespaceName + QLatin1String("@") + attributes;
129     indexFile = indexPath + QLatin1String("/indexdb40.") + extension;
130     documentFile = indexPath + QLatin1String("/indexdoc40.") + extension;
131 }
132 
insertInIndex(const QString & string,int docNum)133 void Writer::insertInIndex(const QString &string, int docNum)
134 {
135     if (string == QLatin1String("amp") || string == QLatin1String("nbsp"))
136         return;
137 
138     Entry *entry = 0;
139     if (index.count())
140         entry = index[string];
141 
142     if (entry) {
143         if (entry->documents.last().docNumber != docNum)
144             entry->documents.append(Document(docNum, 1));
145         else
146             entry->documents.last().frequency++;
147     } else {
148         index.insert(string, new Entry(docNum));
149     }
150 }
151 
insertInDocumentList(const QString & title,const QString & url)152 void Writer::insertInDocumentList(const QString &title, const QString &url)
153 {
154     documentList.append(QStringList(title) << url);
155 }
156 
157 
QHelpSearchIndexWriter()158 QHelpSearchIndexWriter::QHelpSearchIndexWriter()
159     : QThread()
160     , m_cancel(false)
161 {
162     // nothing todo
163 }
164 
~QHelpSearchIndexWriter()165 QHelpSearchIndexWriter::~QHelpSearchIndexWriter()
166 {
167     mutex.lock();
168     this->m_cancel = true;
169     waitCondition.wakeOne();
170     mutex.unlock();
171 
172     wait();
173 }
174 
cancelIndexing()175 void QHelpSearchIndexWriter::cancelIndexing()
176 {
177     mutex.lock();
178     this->m_cancel = true;
179     mutex.unlock();
180 }
181 
updateIndex(const QString & collectionFile,const QString & indexFilesFolder,bool reindex)182 void QHelpSearchIndexWriter::updateIndex(const QString &collectionFile,
183                                          const QString &indexFilesFolder,
184                                          bool reindex)
185 {
186     wait();
187     QMutexLocker lock(&mutex);
188 
189     this->m_cancel = false;
190     this->m_reindex = reindex;
191     this->m_collectionFile = collectionFile;
192     this->m_indexFilesFolder = indexFilesFolder;
193 
194     start(QThread::LowestPriority);
195 }
196 
run()197 void QHelpSearchIndexWriter::run()
198 {
199     mutex.lock();
200 
201     if (m_cancel) {
202         mutex.unlock();
203         return;
204     }
205 
206     const bool reindex(this->m_reindex);
207     const QLatin1String key("DefaultSearchNamespaces");
208     const QString collectionFile(this->m_collectionFile);
209     const QString indexPath = m_indexFilesFolder;
210 
211     mutex.unlock();
212 
213     QHelpEngineCore engine(collectionFile, 0);
214     if (!engine.setupData())
215         return;
216 
217     if (reindex)
218         engine.setCustomValue(key, QLatin1String(""));
219 
220     const QStringList registeredDocs = engine.registeredDocumentations();
221     const QStringList indexedNamespaces = engine.customValue(key).toString().
222         split(QLatin1String("|"), QString::SkipEmptyParts);
223 
224     emit indexingStarted();
225 
226     QStringList namespaces;
227     Writer writer(indexPath);
228     foreach(const QString &namespaceName, registeredDocs) {
229         mutex.lock();
230         if (m_cancel) {
231             mutex.unlock();
232             return;
233         }
234         mutex.unlock();
235 
236         // if indexed, continue
237         namespaces.append(namespaceName);
238         if (indexedNamespaces.contains(namespaceName))
239             continue;
240 
241         const QList<QStringList> attributeSets =
242             engine.filterAttributeSets(namespaceName);
243 
244         foreach (const QStringList &attributes, attributeSets) {
245             // cleanup maybe old or unfinished files
246             writer.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
247             writer.removeIndex();
248 
249             QSet<QString> documentsSet;
250             const QList<QUrl> docFiles = engine.files(namespaceName, attributes);
251             foreach(QUrl url, docFiles) {
252                 if (m_cancel)
253                     return;
254 
255                 // get rid of duplicated files
256                 if (url.hasFragment())
257                     url.setFragment(QString());
258 
259                 QString s = url.toString();
260                 if (s.endsWith(QLatin1String(".html"))
261                     || s.endsWith(QLatin1String(".htm"))
262                     || s.endsWith(QLatin1String(".txt")))
263                     documentsSet.insert(s);
264             }
265 
266             int docNum = 0;
267             const QStringList documentsList(documentsSet.toList());
268             foreach(const QString &url, documentsList) {
269                 if (m_cancel)
270                     return;
271 
272                 QByteArray data(engine.fileData(url));
273                 if (data.isEmpty())
274                     continue;
275 
276                 QTextStream s(data);
277                 QString en = QHelpGlobal::codecFromData(data);
278                 s.setCodec(QTextCodec::codecForName(en.toLatin1().constData()));
279 
280                 QString text = s.readAll();
281                 if (text.isNull())
282                     continue;
283 
284                 QString title = QHelpGlobal::documentTitle(text);
285 
286                 int j = 0;
287                 int i = 0;
288                 bool valid = true;
289                 const QChar *buf = text.unicode();
290                 QChar str[64];
291                 QChar c = buf[0];
292 
293                 while ( j < text.length() ) {
294                     if (m_cancel)
295                         return;
296 
297                     if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
298                         valid = false;
299                         if ( i > 1 )
300                             writer.insertInIndex(QString(str,i), docNum);
301                         i = 0;
302                         c = buf[++j];
303                         continue;
304                     }
305                     if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
306                         valid = true;
307                         c = buf[++j];
308                         continue;
309                     }
310                     if ( !valid ) {
311                         c = buf[++j];
312                         continue;
313                     }
314                     if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
315                         str[i] = c.toLower();
316                         ++i;
317                     } else {
318                         if ( i > 1 )
319                             writer.insertInIndex(QString(str,i), docNum);
320                         i = 0;
321                     }
322                     c = buf[++j];
323                 }
324                 if ( i > 1 )
325                     writer.insertInIndex(QString(str,i), docNum);
326 
327                 docNum++;
328                 writer.insertInDocumentList(title, url);
329             }
330 
331             if (writer.writeIndex()) {
332                 engine.setCustomValue(key, addNamespace(
333                     engine.customValue(key).toString(), namespaceName));
334             }
335 
336             writer.reset();
337         }
338     }
339 
340     QStringListIterator qsli(indexedNamespaces);
341     while (qsli.hasNext()) {
342         const QString namespaceName = qsli.next();
343         if (namespaces.contains(namespaceName))
344             continue;
345 
346         const QList<QStringList> attributeSets =
347             engine.filterAttributeSets(namespaceName);
348 
349         foreach (const QStringList &attributes, attributeSets) {
350             writer.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
351             writer.removeIndex();
352         }
353 
354         engine.setCustomValue(key, removeNamespace(
355             engine.customValue(key).toString(), namespaceName));
356     }
357 
358     emit indexingFinished();
359 }
360 
// Returns \a namespaces, a list of namespace names each terminated by '|',
// with \a namespaceName added if it is not already one of the entries.
//
// Entries are compared as a whole, so a namespace whose name is merely a
// substring of an already listed one (e.g. "org.qt" vs. "org.qt.core") is
// still added. An empty \a namespaceName leaves the list unchanged.
QString QHelpSearchIndexWriter::addNamespace(const QString namespaces,
                                             const QString &namespaceName)
{
    const QLatin1String separator("|");

    QString value = namespaces;
    if (namespaceName.isEmpty())
        return value;

    const QStringList entries = value.split(separator, QString::SkipEmptyParts);
    if (!entries.contains(namespaceName)) {
        // Guard against a list whose last entry lacks its terminator.
        if (!value.isEmpty() && !value.endsWith(separator))
            value.append(separator);
        value.append(namespaceName).append(separator);
    }

    return value;
}
370 
// Returns \a namespaces, a list of namespace names each terminated by '|',
// without the entry \a namespaceName.
//
// Only whole entries are removed; other namespaces that merely end with
// \a namespaceName are left intact. If \a namespaceName is empty or not
// listed, \a namespaces is returned unchanged.
QString QHelpSearchIndexWriter::removeNamespace(const QString namespaces,
                                                const QString &namespaceName)
{
    if (namespaceName.isEmpty())
        return namespaces;

    const QLatin1String separator("|");
    const QStringList entries = namespaces.split(separator, QString::SkipEmptyParts);

    // Rebuild the list from all entries except the one to remove.
    QString value = QLatin1String("");
    bool removed = false;
    foreach (const QString &entry, entries) {
        if (entry == namespaceName) {
            removed = true;
            continue;
        }
        value.append(entry).append(separator);
    }

    return removed ? value : namespaces;
}
380 
381 }   // namespace std
382 }   // namespace fulltextsearch
383 
384 QT_END_NAMESPACE
385