1 /*******************************************************************************
2  * Copyright (c) 2000, 2017 IBM Corporation and others.
3  *
4  * This program and the accompanying materials
5  * are made available under the terms of the Eclipse Public License 2.0
6  * which accompanies this distribution, and is available at
7  * https://www.eclipse.org/legal/epl-2.0/
8  *
9  * SPDX-License-Identifier: EPL-2.0
10  *
11  * Contributors:
12  *     IBM Corporation - initial API and implementation
13  *     Mickael Istria (Red Hat Inc.) - [509032] Support additional tags
14  *******************************************************************************/
15 package org.eclipse.jface.internal.text.html;
16 
17 import java.io.IOException;
18 import java.io.PushbackReader;
19 import java.io.Reader;
20 import java.util.HashMap;
21 import java.util.HashSet;
22 import java.util.Map;
23 import java.util.Set;
24 
25 import org.eclipse.swt.SWT;
26 import org.eclipse.swt.custom.StyleRange;
27 
28 import org.eclipse.jface.text.TextPresentation;
29 
30 
31 /**
32  * Reads the text contents from a reader of HTML contents and translates
33  * the tags or cut them out.
34  * <p>
35  * Moved into this package from <code>org.eclipse.jface.internal.text.revisions</code>.</p>
36  */
37 public class HTML2TextReader extends SubstitutionTextReader {
38 
39 	private static final String EMPTY_STRING= ""; //$NON-NLS-1$
40 	private static final Map<String, String> fgEntityLookup;
41 	private static final Set<String> fgTags;
42 
43 	static {
44 
45 		fgTags= new HashSet<>();
46 		fgTags.add("b"); //$NON-NLS-1$
47 		fgTags.add("strong"); //$NON-NLS-1$
48 		fgTags.add("br"); //$NON-NLS-1$
49 		fgTags.add("br/"); //$NON-NLS-1$
50 		fgTags.add("div"); //$NON-NLS-1$
51 		fgTags.add("del"); //$NON-NLS-1$
52 		fgTags.add("strike"); //$NON-NLS-1$
53 		fgTags.add("s"); //$NON-NLS-1$
54 		fgTags.add("em"); //$NON-NLS-1$
55 		fgTags.add("i"); //$NON-NLS-1$
56 		fgTags.add("h1"); //$NON-NLS-1$
57 		fgTags.add("h2"); //$NON-NLS-1$
58 		fgTags.add("h3"); //$NON-NLS-1$
59 		fgTags.add("h4"); //$NON-NLS-1$
60 		fgTags.add("h5"); //$NON-NLS-1$
61 		fgTags.add("p"); //$NON-NLS-1$
62 		fgTags.add("dl"); //$NON-NLS-1$
63 		fgTags.add("dt"); //$NON-NLS-1$
64 		fgTags.add("dd"); //$NON-NLS-1$
65 		fgTags.add("li"); //$NON-NLS-1$
66 		fgTags.add("ul"); //$NON-NLS-1$
67 		fgTags.add("pre"); //$NON-NLS-1$
68 		fgTags.add("head"); //$NON-NLS-1$
69 
70 		fgEntityLookup= new HashMap<>(7);
71 		fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
72 		fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
73 		fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
74 		fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
75 		fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
76 		fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
77 		fgEntityLookup.put("quot", "\"");		 //$NON-NLS-1$ //$NON-NLS-2$
78 	}
79 
80 	private int fCounter= 0;
81 	private TextPresentation fTextPresentation;
82 	private int fBold= 0;
83 	private int fItalic= 0;
84 	private int fStrikeout= 0;
85 	private boolean fInParagraph= false;
86 	private boolean fIgnore= false;
87 	private boolean fHeaderDetected= false;
88 	private StyleRange fCurrentStyleRange;
89 
90 	/**
91 	 * Transforms the HTML text from the reader to formatted text.
92 	 *
93 	 * @param reader the reader
94 	 * @param presentation If not <code>null</code>, formattings will be applied to
95 	 * the presentation.
96 	*/
HTML2TextReader(Reader reader, TextPresentation presentation)97 	public HTML2TextReader(Reader reader, TextPresentation presentation) {
98 		super(new PushbackReader(reader));
99 		fTextPresentation= presentation;
100 		fCurrentStyleRange= new StyleRange();
101 		fCurrentStyleRange.start= 0;
102 	}
103 
104 	@Override
read()105 	public int read() throws IOException {
106 		int c= super.read();
107 		if (c != -1)
108 			++ fCounter;
109 		return c;
110 	}
111 
startBold()112 	protected void startBold() {
113 		if (fBold == 0) {
114 			finishAndReinitializeCurrentStyle(fCounter);
115 			fCurrentStyleRange.fontStyle |= SWT.BOLD;
116 		}
117 		++fBold;
118 	}
119 
startItalic()120 	protected void startItalic() {
121 		if (fItalic == 0) {
122 			finishAndReinitializeCurrentStyle(fCounter);
123 			fCurrentStyleRange.fontStyle |= SWT.ITALIC;
124 		}
125 		++fItalic;
126 	}
127 
stopItalic()128 	protected void stopItalic() {
129 		--fItalic;
130 		if (fItalic == 0) {
131 			finishAndReinitializeCurrentStyle(fCounter);
132 			fCurrentStyleRange.fontStyle ^= SWT.ITALIC;
133 		}
134 	}
135 
stopBold()136 	protected void stopBold() {
137 		--fBold;
138 		if (fBold == 0) {
139 			finishAndReinitializeCurrentStyle(fCounter);
140 			fCurrentStyleRange.fontStyle ^= SWT.BOLD;
141 		}
142 	}
143 
startStrikeout()144 	protected void startStrikeout() {
145 		if (fStrikeout == 0) {
146 			finishAndReinitializeCurrentStyle(fCounter);
147 			fCurrentStyleRange.strikeout= true;
148 		}
149 		++fStrikeout;
150 	}
151 
stopStrikeout()152 	protected void stopStrikeout() {
153 		--fStrikeout;
154 		if (fStrikeout == 0) {
155 			finishAndReinitializeCurrentStyle(fCounter);
156 			fCurrentStyleRange.strikeout= false;
157 		}
158 	}
159 
finishAndReinitializeCurrentStyle(int offset)160 	private void finishAndReinitializeCurrentStyle(int offset) {
161 		if (fTextPresentation != null && offset != fCurrentStyleRange.start && !isDefaultStyleRange(fCurrentStyleRange)) {
162 			fCurrentStyleRange.length= offset - fCurrentStyleRange.start;
163 			fTextPresentation.addStyleRange(fCurrentStyleRange);
164 		}
165 		fCurrentStyleRange= (StyleRange)fCurrentStyleRange.clone();
166 		fCurrentStyleRange.start= offset;
167 		fCurrentStyleRange.length= 0;
168 	}
169 
isDefaultStyleRange(StyleRange styleRange)170 	private static boolean isDefaultStyleRange(StyleRange styleRange) {
171 		return styleRange.equals(new StyleRange(styleRange.start,styleRange.length, null, null));
172 	}
173 
startPreformattedText()174 	protected void startPreformattedText() {
175 		setSkipWhitespace(false);
176 	}
177 
stopPreformattedText()178 	protected void stopPreformattedText() {
179 		setSkipWhitespace(true);
180 	}
181 
182 
183 	@Override
computeSubstitution(int c)184 	protected String computeSubstitution(int c) throws IOException {
185 
186 		if (c == '<')
187 			return  processHTMLTag();
188 		else if (fIgnore)
189 			return EMPTY_STRING;
190 		else if (c == '&')
191 			return processEntity();
192 
193 		return null;
194 	}
195 
html2Text(String html)196 	private String html2Text(String html) {
197 
198 		if (html == null || html.isEmpty())
199 			return EMPTY_STRING;
200 
201 		html= html.toLowerCase();
202 
203 		String tag= html;
204 		if ('/' == tag.charAt(0))
205 			tag= tag.substring(1);
206 
207 		if (!fgTags.contains(tag))
208 			return EMPTY_STRING;
209 
210 
211 		if ("pre".equals(html)) { //$NON-NLS-1$
212 			startPreformattedText();
213 			return EMPTY_STRING;
214 		}
215 
216 		if ("/pre".equals(html)) { //$NON-NLS-1$
217 			stopPreformattedText();
218 			return EMPTY_STRING;
219 		}
220 
221 		if ("i".equals(html) || "em".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
222 			startItalic();
223 			return EMPTY_STRING;
224 		}
225 
226 		if ("/i".equals(html) || "/em".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
227 			stopItalic();
228 			return EMPTY_STRING;
229 		}
230 
231 		if ("b".equals(html) || "strong".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
232 			startBold();
233 			return EMPTY_STRING;
234 		}
235 
236 		if ("del".equals(html) || "s".equals(html) || "strike".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
237 			startStrikeout();
238 			return EMPTY_STRING;
239 		}
240 
241 		if ((html.length() > 1 && html.charAt(0) == 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
242 			startBold();
243 			return EMPTY_STRING;
244 		}
245 
246 		if ("dl".equals(html)) //$NON-NLS-1$
247 			return LINE_DELIM;
248 
249 		if ("dd".equals(html)) //$NON-NLS-1$
250 			return "\t"; //$NON-NLS-1$
251 
252 		if ("li".equals(html)) //$NON-NLS-1$
253 			// FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
254 			return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$
255 
256 		if ("/b".equals(html) || "/strong".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
257 			stopBold();
258 			return EMPTY_STRING;
259 		}
260 
261 		if ("/del".equals(html) || "/s".equals(html) || "/strike".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
262 			stopStrikeout();
263 			return EMPTY_STRING;
264 		}
265 
266 		if ("p".equals(html))  { //$NON-NLS-1$
267 			fInParagraph= true;
268 			return LINE_DELIM;
269 		}
270 
271 		if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
272 			return LINE_DELIM;
273 
274 		if ("/p".equals(html))  { //$NON-NLS-1$
275 			boolean inParagraph= fInParagraph;
276 			fInParagraph= false;
277 			return inParagraph ? EMPTY_STRING : LINE_DELIM;
278 		}
279 
280 		if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
281 			stopBold();
282 			return LINE_DELIM;
283 		}
284 
285 		if ("/dd".equals(html)) //$NON-NLS-1$
286 			return LINE_DELIM;
287 
288 		if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
289 			fHeaderDetected= true;
290 			fIgnore= true;
291 			return EMPTY_STRING;
292 		}
293 
294 		if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
295 			fIgnore= false;
296 			return EMPTY_STRING;
297 		}
298 
299 		return EMPTY_STRING;
300 	}
301 
302 	/*
303 	 * A '<' has been read. Process a html tag
304 	 */
processHTMLTag()305 	private String processHTMLTag() throws IOException {
306 
307 		StringBuilder buf= new StringBuilder();
308 		int ch;
309 		do {
310 
311 			ch= nextChar();
312 
313 			while (ch != -1 && ch != '>') {
314 				buf.append(Character.toLowerCase((char) ch));
315 				ch= nextChar();
316 				if (ch == '"'){
317 					buf.append(Character.toLowerCase((char) ch));
318 					ch= nextChar();
319 					while (ch != -1 && ch != '"'){
320 						buf.append(Character.toLowerCase((char) ch));
321 						ch= nextChar();
322 					}
323 				}
324 				if (ch == '<' && !isInComment(buf)) {
325 					unread(ch);
326 					return '<' + buf.toString();
327 				}
328 			}
329 
330 			if (ch == -1)
331 				return null;
332 
333 			if (!isInComment(buf) || isCommentEnd(buf)) {
334 				break;
335 			}
336 			// unfinished comment
337 			buf.append((char) ch);
338 		} while (true);
339 
340 		return html2Text(buf.toString());
341 	}
342 
isInComment(StringBuilder buf)343 	private static boolean isInComment(StringBuilder buf) {
344 		return buf.length() >= 3 && "!--".equals(buf.substring(0, 3)); //$NON-NLS-1$
345 	}
346 
isCommentEnd(StringBuilder buf)347 	private static boolean isCommentEnd(StringBuilder buf) {
348 		int tagLen= buf.length();
349 		return tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)); //$NON-NLS-1$
350 	}
351 
unread(int ch)352 	private void unread(int ch) throws IOException {
353 		((PushbackReader) getReader()).unread(ch);
354 	}
355 
entity2Text(String symbol)356 	protected String entity2Text(String symbol) {
357 		if (symbol.length() > 1 && symbol.charAt(0) == '#') {
358 			int ch;
359 			try {
360 				if (symbol.charAt(1) == 'x') {
361 					ch= Integer.parseInt(symbol.substring(2), 16);
362 				} else {
363 					ch= Integer.parseInt(symbol.substring(1), 10);
364 				}
365 				return EMPTY_STRING + (char)ch;
366 			} catch (NumberFormatException e) {
367 			}
368 		} else {
369 			String str= fgEntityLookup.get(symbol);
370 			if (str != null) {
371 				return str;
372 			}
373 		}
374 		return "&" + symbol + ";"; // not found //$NON-NLS-1$ //$NON-NLS-2$
375 	}
376 
377 	/*
378 	 * A '&' has been read. Process a entity
379 	 */
processEntity()380 	private String processEntity() throws IOException {
381 		StringBuilder buf= new StringBuilder();
382 		int ch= nextChar();
383 		while (Character.isLetterOrDigit((char)ch) || ch == '#') {
384 			buf.append((char) ch);
385 			ch= nextChar();
386 		}
387 
388 		if (ch == ';')
389 			return entity2Text(buf.toString());
390 
391 		buf.insert(0, '&');
392 		if (ch != -1)
393 			buf.append((char) ch);
394 		return buf.toString();
395 	}
396 }
397