1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of Qt Creator.
7 **
8 ** Commercial License Usage
9 ** Licensees holding valid commercial Qt licenses may use this file in
10 ** accordance with the commercial license agreement provided with the
11 ** Software or, alternatively, in accordance with the terms contained in
12 ** a written agreement between you and The Qt Company. For licensing terms
13 ** and conditions see https://www.qt.io/terms-conditions. For further
14 ** information use the contact form at https://www.qt.io/contact-us.
15 **
16 ** GNU General Public License Usage
17 ** Alternatively, this file may be used under the terms of the GNU
18 ** General Public License version 3 as published by the Free Software
19 ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
20 ** included in the packaging of this file. Please review the following
21 ** information to ensure the GNU General Public License requirements will
22 ** be met: https://www.gnu.org/licenses/gpl-3.0.html.
23 **
24 ****************************************************************************/
25 
26 #include "htmldocextractor.h"
27 
28 #include <QStringList>
29 #include <QRegularExpression>
30 
31 namespace Utils {
32 
33 HtmlDocExtractor::HtmlDocExtractor() = default;
34 
setMode(Mode mode)35 void HtmlDocExtractor::setMode(Mode mode)
36 { m_mode = mode; }
37 
applyFormatting(const bool format)38 void HtmlDocExtractor::applyFormatting(const bool format)
39 { m_formatContents = format; }
40 
// Returns the brief section for mark: the text between the "<mark>-brief" start mark and
// the end mark for mark, as located by getContentsByMarks(). When formatting is enabled the
// "More..." link to the details section is dropped. The result is then run through
// processOutput().
QString HtmlDocExtractor::getClassOrNamespaceBrief(const QString &html, const QString &mark) const
{
    const QString briefMark = mark + QLatin1String("-brief");

    QString brief = getContentsByMarks(html, briefMark, mark);
    if (m_formatContents && !brief.isEmpty())
        brief.remove(QLatin1String("<a href=\"#details\">More...</a>"));

    processOutput(&brief);
    return brief;
}
50 
// Returns the description section for mark. In FirstParagraph mode the brief section is
// returned instead. Otherwise the text between the "<mark>-description" start mark and the
// end mark for mark is extracted, the "Detailed Description" caption is removed when
// formatting is enabled, and the result is run through processOutput().
QString HtmlDocExtractor::getClassOrNamespaceDescription(const QString &html,
                                                         const QString &mark) const
{
    if (m_mode == FirstParagraph)
        return getClassOrNamespaceBrief(html, mark);

    const QString descriptionMark = mark + QLatin1String("-description");

    QString description = getContentsByMarks(html, descriptionMark, mark);
    if (m_formatContents && !description.isEmpty())
        description.remove(QLatin1String("Detailed Description"));

    processOutput(&description);
    return description;
}
64 
// Returns the documentation of an enum. The same mark is used as both start and end mark.
QString HtmlDocExtractor::getEnumDescription(const QString &html, const QString &mark) const
{
    const QString &delimiter = mark;
    return getClassOrNamespaceMemberDescription(html, delimiter, delimiter);
}
69 
// Returns the documentation of a typedef. The same mark is used as both start and end mark.
QString HtmlDocExtractor::getTypedefDescription(const QString &html, const QString &mark) const
{
    const QString &delimiter = mark;
    return getClassOrNamespaceMemberDescription(html, delimiter, delimiter);
}
74 
// Returns the documentation of a macro. The same mark is used as both start and end mark.
QString HtmlDocExtractor::getMacroDescription(const QString &html,
                                              const QString &mark) const
{
    const QString &delimiter = mark;
    return getClassOrNamespaceMemberDescription(html, delimiter, delimiter);
}
80 
// Returns the documentation of the function identified by mark.
//
// If mark contains a '(' the text before it is the plain function name, which is used as
// the end mark. The start mark is that name followed by "[overload1]" when mainOverload is
// true, or otherwise by the remainder of mark with parentheses, commas and spaces removed.
// Without a '(' the mark is used unchanged for both.
//
// When nothing is found this way, the function is looked up as a property accessor (see
// the comment in the body). Returns an empty string if neither lookup succeeds.
QString HtmlDocExtractor::getFunctionDescription(const QString &html,
                                                 const QString &mark,
                                                 const bool mainOverload) const
{
    QString cleanMark = mark;
    QString startMark = mark;
    const int parenthesis = mark.indexOf(QLatin1Char('('));
    if (parenthesis != -1) {
        startMark = mark.left(parenthesis);
        cleanMark = startMark;
        if (mainOverload) {
            startMark.append(QLatin1String("[overload1]"));
        } else {
            QString complement = mark.mid(parenthesis);
            complement.remove(QRegularExpression("[\\(\\), ]"));
            startMark.append(complement);
        }
    }

    QString contents = getClassOrNamespaceMemberDescription(html, startMark, cleanMark);
    if (!contents.isEmpty())
        return contents;

    // Maybe this is a property function, which is documented differently. Besides
    // setX/isX/hasX there are other (not so usual) names for them. A few examples of those:
    //   - toPlainText / Prop. plainText from QPlainTextEdit.
    //   - resize / Prop. size from QWidget.
    //   - move / Prop. pos from QWidget (nothing similar in the names in this case).
    // So try to find the link to this property in the list of properties, extract its
    // anchor and then follow by the name found.
    //
    // The function name is matched literally, so it is escaped first: names such as
    // "operator[]" or "operator+" would otherwise be read as pattern syntax and yield an
    // invalid or unintended regular expression.
    const QString pattern = QString("<a href=\"[a-z\\.]+?#([A-Za-z]+?)-prop\">%1</a>")
                                .arg(QRegularExpression::escape(cleanMark));
    const QRegularExpressionMatch match = QRegularExpression(pattern).match(html);
    if (match.hasMatch()) {
        const QString prop = match.captured(1);
        contents = getClassOrNamespaceMemberDescription(html,
                                                        prop + QLatin1String("-prop"),
                                                        prop);
    }

    return contents;
}
122 
// Returns the documentation of a QML component. This simply forwards to
// getClassOrNamespaceDescription() with the same arguments.
QString HtmlDocExtractor::getQmlComponentDescription(const QString &html, const QString &mark) const
{
    const QString description = getClassOrNamespaceDescription(html, mark);
    return description;
}
127 
// Returns the documentation that follows the anchor of a QML property or signal.
//
// The "<mark>-prop" anchor is looked up first and the "<mark>-signal" anchor second. The
// result starts at the first <div class="qmldoc"><p> found after the anchor and is run
// through processOutput(). Returns an empty string if the anchor or the qmldoc block is
// missing.
QString HtmlDocExtractor::getQmlPropertyDescription(const QString &html, const QString &mark) const
{
    QString anchor = QString::fromLatin1("<a name=\"%1-prop\">").arg(mark);
    int anchorPos = html.indexOf(anchor);
    if (anchorPos == -1) {
        // Not a property; fall back to the signal anchor.
        anchor = QString::fromLatin1("<a name=\"%1-signal\">").arg(mark);
        anchorPos = html.indexOf(anchor);
        if (anchorPos == -1)
            return QString();
    }

    QString contents = html.mid(anchorPos + anchor.size());
    const int docPos = contents.indexOf(QLatin1String("<div class=\"qmldoc\"><p>"));
    if (docPos == -1)
        return QString();

    contents = contents.mid(docPos);
    processOutput(&contents);
    return contents;
}
148 
// Returns the text between the <a name="mark"></a> anchor and the next "<!-- @@@qmake"
// comment, run through processOutput(). Returns an empty string if either delimiter is
// not found.
QString HtmlDocExtractor::getQMakeVariableOrFunctionDescription(const QString &html,
                                                                const QString &mark) const
{
    const QString anchor = QString::fromLatin1("<a name=\"%1\"></a>").arg(mark);
    const int anchorPos = html.indexOf(anchor);
    if (anchorPos == -1)
        return QString();

    QString contents = html.mid(anchorPos + anchor.size());
    const int endPos = contents.indexOf(QLatin1String("<!-- @@@qmake"));
    if (endPos == -1)
        return QString();

    contents = contents.left(endPos);
    processOutput(&contents);
    return contents;
}
166 
// Returns the full anchor name of the first anchor whose name starts with "<mark>-",
// i.e. the text between <a name=" and the following "></a>. Returns an empty string if no
// such anchor exists or it is not terminated by "></a>.
QString HtmlDocExtractor::getQMakeFunctionId(const QString &html,
                                             const QString &mark) const
{
    const QString anchorPrefix = QLatin1String("<a name=\"");
    const QString startMark = anchorPrefix + mark + QLatin1Char('-');
    const int startIndex = html.indexOf(startMark);
    if (startIndex == -1)
        return QString();

    // The id begins right after the anchor prefix. Computing the offset directly (instead
    // of searching for mark again) keeps the result correct when mark is empty or happens
    // to occur inside the literal prefix itself, e.g. "a" or "name".
    const int startKeyIndex = startIndex + anchorPrefix.size();

    const QString endMark = QLatin1String("\"></a>");
    const int endKeyIndex = html.indexOf(endMark, startKeyIndex);
    if (endKeyIndex == -1)
        return QString();

    return html.mid(startKeyIndex, endKeyIndex - startKeyIndex);
}
184 
// Extracts the section delimited by startMark and endMark via getContentsByMarks() and
// runs the result through processOutput().
QString HtmlDocExtractor::getClassOrNamespaceMemberDescription(const QString &html,
                                                               const QString &startMark,
                                                               const QString &endMark) const
{
    QString description = getContentsByMarks(html, startMark, endMark);
    processOutput(&description);
    return description;
}
194 
// Returns the part of html enclosed by two marker comments.
//
// The section starts right after the "-->" that follows the first occurrence of
// "$$$<startMark>" and ends just before the next occurrence of "<!-- @@@<endMark>".
// Returns an empty string if any of the three delimiters cannot be found.
QString HtmlDocExtractor::getContentsByMarks(const QString &html,
                                             QString startMark,
                                             QString endMark) const
{
    startMark.prepend(QLatin1String("$$$"));
    endMark.prepend(QLatin1String("<!-- @@@"));

    const int markPos = html.indexOf(startMark);
    if (markPos == -1)
        return QString();

    const int commentEnd = html.indexOf(QLatin1String("-->"), markPos);
    if (commentEnd == -1)
        return QString();

    const int endPos = html.indexOf(endMark, commentEnd);
    if (endPos == -1)
        return QString();

    const int contentsStart = commentEnd + 3; // skip the "-->" itself
    return html.mid(contentsStart, endPos - contentsStart);
}
216 
processOutput(QString * html) const217 void HtmlDocExtractor::processOutput(QString *html) const
218 {
219     if (html->isEmpty())
220         return;
221 
222     if (m_mode == FirstParagraph) {
223         // Try to get the entire first paragraph, but if one is not found or if its opening
224         // tag is not in the very beginning (using an empirical value as the limit) the html
225         // is cleared to avoid too much content. In case the first paragraph looks like:
226         // <p><i>This is only used on the Maemo platform.</i></p>
227         // or: <p><tt>This is used on Windows only.</tt></p>
228         // or: <p>[Conditional]</p>
229         // include also the next paragraph.
230         int index = html->indexOf(QLatin1String("<p>"));
231         if (index != -1 && index < 400) {
232             if (html->indexOf(QLatin1String("<p><i>")) == index ||
233                     html->indexOf(QLatin1String("<p><tt>")) == index ||
234                     html->indexOf(QLatin1String("<p>[Conditional]</p>")) == index)
235                 index = html->indexOf(QLatin1String("<p>"), index + 6); // skip the first paragraph
236 
237             index = html->indexOf(QLatin1String("</p>"), index + 3);
238             if (index != -1) {
239                 // Most paragraphs end with a period, but there are cases without punctuation
240                 // and cases like this: <p>This is a description. Example:</p>
241                 const int period = html->lastIndexOf(QLatin1Char('.'), index);
242                 if (period != -1) {
243                     html->truncate(period + 1);
244                     html->append(QLatin1String("</p>"));
245                 } else {
246                     html->truncate(index + 4);
247                 }
248             } else {
249                 html->clear();
250             }
251         } else {
252             html->clear();
253         }
254     }
255 
256     if (!html->isEmpty() && m_formatContents) {
257         stripBold(html);
258         replaceNonStyledHeadingsForBold(html);
259         replaceTablesForSimpleLines(html);
260         replaceListsForSimpleLines(html);
261         stripLinks(html);
262         stripHorizontalLines(html);
263         stripDivs(html);
264         stripTagsStyles(html);
265         stripHeadings(html);
266         stripImagens(html);
267         stripEmptyParagraphs(html);
268     }
269 }
270 
stripAllHtml(QString * html)271 void HtmlDocExtractor::stripAllHtml(QString *html)
272 {
273     html->remove(QRegularExpression("<.*?>"));
274 }
275 
stripHeadings(QString * html)276 void HtmlDocExtractor::stripHeadings(QString *html)
277 {
278     html->remove(QRegularExpression("<h\\d{1}.*?>|</h\\d{1}>"));
279 }
280 
stripLinks(QString * html)281 void HtmlDocExtractor::stripLinks(QString *html)
282 {
283     html->remove(QRegularExpression("<a\\s.*?>|</a>"));
284 }
285 
stripHorizontalLines(QString * html)286 void HtmlDocExtractor::stripHorizontalLines(QString *html)
287 {
288     html->remove(QRegularExpression("<hr\\s+/>"));
289 }
290 
stripDivs(QString * html)291 void HtmlDocExtractor::stripDivs(QString *html)
292 {
293     html->remove(QRegularExpression("<div\\s.*?>|</div>|<div\\s.*?/\\s*>"));
294 }
295 
stripTagsStyles(QString * html)296 void HtmlDocExtractor::stripTagsStyles(QString *html)
297 {
298     html->replace(QRegularExpression("<(.*?\\s+)class=\".*?\">"), "<\\1>");
299 }
300 
stripTeletypes(QString * html)301 void HtmlDocExtractor::stripTeletypes(QString *html)
302 {
303     html->remove(QLatin1String("<tt>"));
304     html->remove(QLatin1String("</tt>"));
305 }
306 
stripImagens(QString * html)307 void HtmlDocExtractor::stripImagens(QString *html)
308 {
309     html->remove(QRegularExpression("<img.*?>"));
310 }
311 
stripBold(QString * html)312 void HtmlDocExtractor::stripBold(QString *html)
313 {
314     html->remove(QLatin1String("<b>"));
315     html->remove(QLatin1String("</b>"));
316 }
317 
stripEmptyParagraphs(QString * html)318 void HtmlDocExtractor::stripEmptyParagraphs(QString *html)
319 {
320     html->remove(QLatin1String("<p></p>"));
321 }
322 
replaceNonStyledHeadingsForBold(QString * html)323 void HtmlDocExtractor::replaceNonStyledHeadingsForBold(QString *html)
324 {
325     const QRegularExpression hStart("<h\\d{1}>");
326     const QRegularExpression hEnd("</h\\d{1}>");
327     html->replace(hStart, QLatin1String("<p><b>"));
328     html->replace(hEnd, QLatin1String("</b></p>"));
329 }
330 
replaceTablesForSimpleLines(QString * html)331 void HtmlDocExtractor::replaceTablesForSimpleLines(QString *html)
332 {
333     html->replace(QRegularExpression("(?:<p>)?<table.*?>"), QLatin1String("<p>"));
334     html->replace(QLatin1String("</table>"), QLatin1String("</p>"));
335     html->remove(QRegularExpression("<thead.*?>"));
336     html->remove(QLatin1String("</thead>"));
337     html->remove(QRegularExpression("<tfoot.*?>"));
338     html->remove(QLatin1String("</tfoot>"));
339     html->remove(QRegularExpression("<tr.*?><th.*?>.*?</th></tr>"));
340     html->replace(QLatin1String("</td><td"), QLatin1String("</td>&nbsp;<td"));
341     html->remove(QRegularExpression("<td.*?><p>"));
342     html->remove(QRegularExpression("<td.*?>"));
343     html->remove(QRegularExpression("(?:</p>)?</td>"));
344     html->replace(QRegularExpression("<tr.*?>"), QLatin1String("&nbsp;&nbsp;&nbsp;&nbsp;"));
345     html->replace(QLatin1String("</tr>"), QLatin1String("<br />"));
346 }
347 
replaceListsForSimpleLines(QString * html)348 void HtmlDocExtractor::replaceListsForSimpleLines(QString *html)
349 {
350     html->remove(QRegularExpression("<(?:ul|ol).*?>"));
351     html->remove(QRegularExpression("</(?:ul|ol)>"));
352     html->replace(QLatin1String("<li>"), QLatin1String("&nbsp;&nbsp;&nbsp;&nbsp;"));
353     html->replace(QLatin1String("</li>"), QLatin1String("<br />"));
354 }
355 
356 } // namespace Utils
357