1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of Qt Creator.
7 **
8 ** Commercial License Usage
9 ** Licensees holding valid commercial Qt licenses may use this file in
10 ** accordance with the commercial license agreement provided with the
11 ** Software or, alternatively, in accordance with the terms contained in
12 ** a written agreement between you and The Qt Company. For licensing terms
13 ** and conditions see https://www.qt.io/terms-conditions. For further
14 ** information use the contact form at https://www.qt.io/contact-us.
15 **
16 ** GNU General Public License Usage
17 ** Alternatively, this file may be used under the terms of the GNU
18 ** General Public License version 3 as published by the Free Software
19 ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
20 ** included in the packaging of this file. Please review the following
21 ** information to ensure the GNU General Public License requirements will
22 ** be met: https://www.gnu.org/licenses/gpl-3.0.html.
23 **
24 ****************************************************************************/
25
26 #include "htmldocextractor.h"
27
28 #include <QStringList>
29 #include <QRegularExpression>
30
31 namespace Utils {
32
33 HtmlDocExtractor::HtmlDocExtractor() = default;
34
setMode(Mode mode)35 void HtmlDocExtractor::setMode(Mode mode)
36 { m_mode = mode; }
37
applyFormatting(const bool format)38 void HtmlDocExtractor::applyFormatting(const bool format)
39 { m_formatContents = format; }
40
// Returns the brief description of the class or namespace identified by \a mark.
// The text is taken from the section that starts at the "<mark>-brief" marker and
// ends at the "<mark>" end marker; an empty string is returned if it is not found.
QString HtmlDocExtractor::getClassOrNamespaceBrief(const QString &html, const QString &mark) const
{
    const QString briefMark = mark + QLatin1String("-brief");
    QString brief = getContentsByMarks(html, briefMark, mark);

    // The brief usually ends with a link to the detailed section, which is
    // meaningless outside of the original page.
    if (m_formatContents && !brief.isEmpty())
        brief.remove(QLatin1String("<a href=\"#details\">More...</a>"));

    processOutput(&brief);
    return brief;
}
50
// Returns the description of the class or namespace identified by \a mark.
// In FirstParagraph mode this is the brief; otherwise it is the section between
// the "<mark>-description" marker and the "<mark>" end marker.
QString HtmlDocExtractor::getClassOrNamespaceDescription(const QString &html,
                                                         const QString &mark) const
{
    if (m_mode == FirstParagraph)
        return getClassOrNamespaceBrief(html, mark);

    const QString descriptionMark = mark + QLatin1String("-description");
    QString description = getContentsByMarks(html, descriptionMark, mark);

    // Drop the section title text so that only the description itself remains.
    if (m_formatContents && !description.isEmpty())
        description.remove(QLatin1String("Detailed Description"));

    processOutput(&description);
    return description;
}
64
// Returns the description of the enum identified by \a mark.
QString HtmlDocExtractor::getEnumDescription(const QString &html, const QString &mark) const
{
    // For enums the same mark names both the start and the end marker.
    const QString &startAndEndMark = mark;
    return getClassOrNamespaceMemberDescription(html, startAndEndMark, startAndEndMark);
}
69
// Returns the description of the typedef identified by \a mark.
QString HtmlDocExtractor::getTypedefDescription(const QString &html, const QString &mark) const
{
    // For typedefs the same mark names both the start and the end marker.
    const QString &startAndEndMark = mark;
    return getClassOrNamespaceMemberDescription(html, startAndEndMark, startAndEndMark);
}
74
// Returns the description of the macro identified by \a mark.
QString HtmlDocExtractor::getMacroDescription(const QString &html,
                                              const QString &mark) const
{
    // For macros the same mark names both the start and the end marker.
    const QString &startAndEndMark = mark;
    return getClassOrNamespaceMemberDescription(html, startAndEndMark, startAndEndMark);
}
80
// Returns the description of the function identified by \a mark.
//
// \a mark may carry a parameter list ("name(Type1, Type2)"). In that case the
// start marker is either "name[overload1]" (when \a mainOverload is true) or the
// name followed by the parameter list with parentheses, commas and spaces removed.
// The end marker is always the bare function name.
//
// If nothing is found under the function's own marker, the function is looked up
// as a property accessor (see the comment in the body). Returns an empty string
// when neither lookup succeeds.
QString HtmlDocExtractor::getFunctionDescription(const QString &html,
                                                 const QString &mark,
                                                 const bool mainOverload) const
{
    QString cleanMark = mark;
    QString startMark = mark;
    const int parenthesis = mark.indexOf(QLatin1Char('('));
    if (parenthesis != -1) {
        startMark = mark.left(parenthesis);
        cleanMark = startMark;
        if (mainOverload) {
            startMark.append(QLatin1String("[overload1]"));
        } else {
            // Everything from the opening parenthesis on, reduced to the bare type names.
            QString complement = mark.mid(parenthesis);
            complement.remove(QRegularExpression("[\\(\\), ]"));
            startMark.append(complement);
        }
    }

    QString contents = getClassOrNamespaceMemberDescription(html, startMark, cleanMark);
    if (contents.isEmpty()) {
        // Maybe this is a property function, which is documented differently. Besides
        // setX/isX/hasX there are other (not so usual) names for them. A few examples of those:
        //   - toPlainText / Prop. plainText from QPlainTextEdit.
        //   - resize / Prop. size from QWidget.
        //   - move / Prop. pos from QWidget (nothing similar in the names in this case).
        // So try to find the link to this property in the list of properties, extract its
        // anchor and then follow by the name found.
        //
        // The function name is inserted into a regular expression, so it has to be escaped:
        // names such as "operator+" or "operator[]" would otherwise be interpreted as
        // pattern syntax and produce a wrong or even invalid expression.
        const QString pattern = QString("<a href=\"[a-z\\.]+?#([A-Za-z]+?)-prop\">%1</a>")
                                    .arg(QRegularExpression::escape(cleanMark));
        const QRegularExpressionMatch match = QRegularExpression(pattern).match(html);
        if (match.hasMatch()) {
            const QString prop = match.captured(1);
            contents = getClassOrNamespaceMemberDescription(html,
                                                            prop + QLatin1String("-prop"),
                                                            prop);
        }
    }

    return contents;
}
122
// Returns the description of the QML component identified by \a mark.
// QML components are looked up exactly like classes and namespaces.
QString HtmlDocExtractor::getQmlComponentDescription(const QString &html, const QString &mark) const
{
    const QString description = getClassOrNamespaceDescription(html, mark);
    return description;
}
127
// Returns the description of the QML property or signal identified by \a mark.
//
// The entry is located through its "<mark>-prop" anchor, falling back to the
// "<mark>-signal" anchor. The result starts at the first "qmldoc" div that follows
// the anchor and runs to the end of \a html, subject to whatever trimming
// processOutput() applies. Returns an empty string if the anchor or the div is missing.
QString HtmlDocExtractor::getQmlPropertyDescription(const QString &html, const QString &mark) const
{
    QString anchor = QString::fromLatin1("<a name=\"%1-prop\">").arg(mark);
    int anchorPos = html.indexOf(anchor);
    if (anchorPos == -1) {
        // Not a property; maybe it is a signal.
        anchor = QString::fromLatin1("<a name=\"%1-signal\">").arg(mark);
        anchorPos = html.indexOf(anchor);
        if (anchorPos == -1)
            return QString();
    }

    const QString afterAnchor = html.mid(anchorPos + anchor.size());
    const int docPos = afterAnchor.indexOf(QLatin1String("<div class=\"qmldoc\"><p>"));
    if (docPos == -1)
        return QString();

    QString description = afterAnchor.mid(docPos);
    processOutput(&description);
    return description;
}
148
// Returns the description of the qmake variable or function identified by \a mark.
//
// The description is the text between the entry's "<a name=...></a>" anchor and the
// next "<!-- @@@qmake" comment. Returns an empty string if either is missing.
QString HtmlDocExtractor::getQMakeVariableOrFunctionDescription(const QString &html,
                                                                const QString &mark) const
{
    const QString anchor = QString::fromLatin1("<a name=\"%1\"></a>").arg(mark);
    const int anchorPos = html.indexOf(anchor);
    if (anchorPos == -1)
        return QString();

    const QString afterAnchor = html.mid(anchorPos + anchor.size());
    const int endPos = afterAnchor.indexOf(QLatin1String("<!-- @@@qmake"));
    if (endPos == -1)
        return QString();

    QString description = afterAnchor.left(endPos);
    processOutput(&description);
    return description;
}
166
// Returns the full anchor id of the qmake function identified by \a mark.
//
// The function looks for an anchor of the form <a name="<mark>-..."></a> and returns
// the value of its name attribute, i.e. the text starting at the mark and ending right
// before the closing "\"></a>". Returns an empty string if no such anchor exists.
QString HtmlDocExtractor::getQMakeFunctionId(const QString &html,
                                             const QString &mark) const
{
    const QString anchorPrefix = QLatin1String("<a name=\"");
    const QString startMark = anchorPrefix + mark + QLatin1Char('-');
    const int startIndex = html.indexOf(startMark);
    if (startIndex == -1)
        return QString();

    // The id starts right after the anchor prefix. Compute the offset instead of
    // searching for the mark again: a search would match inside the prefix itself
    // for marks that happen to be a substring of it (for example "name" or "a").
    const int startKeyIndex = startIndex + anchorPrefix.size();

    const QString endMark = QLatin1String("\"></a>");
    const int endKeyIndex = html.indexOf(endMark, startKeyIndex);
    if (endKeyIndex == -1)
        return QString();

    return html.mid(startKeyIndex, endKeyIndex - startKeyIndex);
}
184
// Returns the processed contents of the section delimited by \a startMark and
// \a endMark. This is the common implementation behind the enum, typedef, macro
// and function getters.
QString HtmlDocExtractor::getClassOrNamespaceMemberDescription(const QString &html,
                                                               const QString &startMark,
                                                               const QString &endMark) const
{
    QString description = getContentsByMarks(html, startMark, endMark);
    processOutput(&description);
    return description;
}
194
// Returns the raw text of the section delimited by two marker comments.
//
// The section starts after the "-->" that closes the comment containing
// "$$$<startMark>" and ends right before "<!-- @@@<endMark>". A null string is
// returned if any of the three pieces cannot be found in that order.
QString HtmlDocExtractor::getContentsByMarks(const QString &html,
                                             QString startMark,
                                             QString endMark) const
{
    const QString opening = QLatin1String("$$$") + startMark;
    const QString closing = QLatin1String("<!-- @@@") + endMark;

    const int openingPos = html.indexOf(opening);
    if (openingPos == -1)
        return QString();

    // The opening mark lives inside an HTML comment; the contents begin after it.
    const int commentEnd = html.indexOf(QLatin1String("-->"), openingPos);
    if (commentEnd == -1)
        return QString();

    const int closingPos = html.indexOf(closing, commentEnd);
    if (closingPos == -1)
        return QString();

    const int contentsStart = commentEnd + 3; // skip the "-->" itself
    return html.mid(contentsStart, closingPos - contentsStart);
}
216
processOutput(QString * html) const217 void HtmlDocExtractor::processOutput(QString *html) const
218 {
219 if (html->isEmpty())
220 return;
221
222 if (m_mode == FirstParagraph) {
223 // Try to get the entire first paragraph, but if one is not found or if its opening
224 // tag is not in the very beginning (using an empirical value as the limit) the html
225 // is cleared to avoid too much content. In case the first paragraph looks like:
226 // <p><i>This is only used on the Maemo platform.</i></p>
227 // or: <p><tt>This is used on Windows only.</tt></p>
228 // or: <p>[Conditional]</p>
229 // include also the next paragraph.
230 int index = html->indexOf(QLatin1String("<p>"));
231 if (index != -1 && index < 400) {
232 if (html->indexOf(QLatin1String("<p><i>")) == index ||
233 html->indexOf(QLatin1String("<p><tt>")) == index ||
234 html->indexOf(QLatin1String("<p>[Conditional]</p>")) == index)
235 index = html->indexOf(QLatin1String("<p>"), index + 6); // skip the first paragraph
236
237 index = html->indexOf(QLatin1String("</p>"), index + 3);
238 if (index != -1) {
239 // Most paragraphs end with a period, but there are cases without punctuation
240 // and cases like this: <p>This is a description. Example:</p>
241 const int period = html->lastIndexOf(QLatin1Char('.'), index);
242 if (period != -1) {
243 html->truncate(period + 1);
244 html->append(QLatin1String("</p>"));
245 } else {
246 html->truncate(index + 4);
247 }
248 } else {
249 html->clear();
250 }
251 } else {
252 html->clear();
253 }
254 }
255
256 if (!html->isEmpty() && m_formatContents) {
257 stripBold(html);
258 replaceNonStyledHeadingsForBold(html);
259 replaceTablesForSimpleLines(html);
260 replaceListsForSimpleLines(html);
261 stripLinks(html);
262 stripHorizontalLines(html);
263 stripDivs(html);
264 stripTagsStyles(html);
265 stripHeadings(html);
266 stripImagens(html);
267 stripEmptyParagraphs(html);
268 }
269 }
270
stripAllHtml(QString * html)271 void HtmlDocExtractor::stripAllHtml(QString *html)
272 {
273 html->remove(QRegularExpression("<.*?>"));
274 }
275
stripHeadings(QString * html)276 void HtmlDocExtractor::stripHeadings(QString *html)
277 {
278 html->remove(QRegularExpression("<h\\d{1}.*?>|</h\\d{1}>"));
279 }
280
stripLinks(QString * html)281 void HtmlDocExtractor::stripLinks(QString *html)
282 {
283 html->remove(QRegularExpression("<a\\s.*?>|</a>"));
284 }
285
stripHorizontalLines(QString * html)286 void HtmlDocExtractor::stripHorizontalLines(QString *html)
287 {
288 html->remove(QRegularExpression("<hr\\s+/>"));
289 }
290
stripDivs(QString * html)291 void HtmlDocExtractor::stripDivs(QString *html)
292 {
293 html->remove(QRegularExpression("<div\\s.*?>|</div>|<div\\s.*?/\\s*>"));
294 }
295
stripTagsStyles(QString * html)296 void HtmlDocExtractor::stripTagsStyles(QString *html)
297 {
298 html->replace(QRegularExpression("<(.*?\\s+)class=\".*?\">"), "<\\1>");
299 }
300
stripTeletypes(QString * html)301 void HtmlDocExtractor::stripTeletypes(QString *html)
302 {
303 html->remove(QLatin1String("<tt>"));
304 html->remove(QLatin1String("</tt>"));
305 }
306
stripImagens(QString * html)307 void HtmlDocExtractor::stripImagens(QString *html)
308 {
309 html->remove(QRegularExpression("<img.*?>"));
310 }
311
stripBold(QString * html)312 void HtmlDocExtractor::stripBold(QString *html)
313 {
314 html->remove(QLatin1String("<b>"));
315 html->remove(QLatin1String("</b>"));
316 }
317
stripEmptyParagraphs(QString * html)318 void HtmlDocExtractor::stripEmptyParagraphs(QString *html)
319 {
320 html->remove(QLatin1String("<p></p>"));
321 }
322
replaceNonStyledHeadingsForBold(QString * html)323 void HtmlDocExtractor::replaceNonStyledHeadingsForBold(QString *html)
324 {
325 const QRegularExpression hStart("<h\\d{1}>");
326 const QRegularExpression hEnd("</h\\d{1}>");
327 html->replace(hStart, QLatin1String("<p><b>"));
328 html->replace(hEnd, QLatin1String("</b></p>"));
329 }
330
replaceTablesForSimpleLines(QString * html)331 void HtmlDocExtractor::replaceTablesForSimpleLines(QString *html)
332 {
333 html->replace(QRegularExpression("(?:<p>)?<table.*?>"), QLatin1String("<p>"));
334 html->replace(QLatin1String("</table>"), QLatin1String("</p>"));
335 html->remove(QRegularExpression("<thead.*?>"));
336 html->remove(QLatin1String("</thead>"));
337 html->remove(QRegularExpression("<tfoot.*?>"));
338 html->remove(QLatin1String("</tfoot>"));
339 html->remove(QRegularExpression("<tr.*?><th.*?>.*?</th></tr>"));
340 html->replace(QLatin1String("</td><td"), QLatin1String("</td> <td"));
341 html->remove(QRegularExpression("<td.*?><p>"));
342 html->remove(QRegularExpression("<td.*?>"));
343 html->remove(QRegularExpression("(?:</p>)?</td>"));
344 html->replace(QRegularExpression("<tr.*?>"), QLatin1String(" "));
345 html->replace(QLatin1String("</tr>"), QLatin1String("<br />"));
346 }
347
replaceListsForSimpleLines(QString * html)348 void HtmlDocExtractor::replaceListsForSimpleLines(QString *html)
349 {
350 html->remove(QRegularExpression("<(?:ul|ol).*?>"));
351 html->remove(QRegularExpression("</(?:ul|ol)>"));
352 html->replace(QLatin1String("<li>"), QLatin1String(" "));
353 html->replace(QLatin1String("</li>"), QLatin1String("<br />"));
354 }
355
356 } // namespace Utils
357