1 /******************************************************************************* 2 * Copyright (c) 2000, 2017 IBM Corporation and others. 3 * 4 * This program and the accompanying materials 5 * are made available under the terms of the Eclipse Public License 2.0 6 * which accompanies this distribution, and is available at 7 * https://www.eclipse.org/legal/epl-2.0/ 8 * 9 * SPDX-License-Identifier: EPL-2.0 10 * 11 * Contributors: 12 * IBM Corporation - initial API and implementation 13 * Mickael Istria (Red Hat Inc.) - [509032] Support additional tags 14 *******************************************************************************/ 15 package org.eclipse.jface.internal.text.html; 16 17 import java.io.IOException; 18 import java.io.PushbackReader; 19 import java.io.Reader; 20 import java.util.HashMap; 21 import java.util.HashSet; 22 import java.util.Map; 23 import java.util.Set; 24 25 import org.eclipse.swt.SWT; 26 import org.eclipse.swt.custom.StyleRange; 27 28 import org.eclipse.jface.text.TextPresentation; 29 30 31 /** 32 * Reads the text contents from a reader of HTML contents and translates 33 * the tags or cut them out. 34 * <p> 35 * Moved into this package from <code>org.eclipse.jface.internal.text.revisions</code>.</p> 36 */ 37 public class HTML2TextReader extends SubstitutionTextReader { 38 39 private static final String EMPTY_STRING= ""; //$NON-NLS-1$ 40 private static final Map<String, String> fgEntityLookup; 41 private static final Set<String> fgTags; 42 43 static { 44 45 fgTags= new HashSet<>(); 46 fgTags.add("b"); //$NON-NLS-1$ 47 fgTags.add("strong"); //$NON-NLS-1$ 48 fgTags.add("br"); //$NON-NLS-1$ 49 fgTags.add("br/"); //$NON-NLS-1$ 50 fgTags.add("div"); //$NON-NLS-1$ 51 fgTags.add("del"); //$NON-NLS-1$ 52 fgTags.add("strike"); //$NON-NLS-1$ 53 fgTags.add("s"); //$NON-NLS-1$ 54 fgTags.add("em"); //$NON-NLS-1$ 55 fgTags.add("i"); //$NON-NLS-1$ 56 fgTags.add("h1"); //$NON-NLS-1$ 57 fgTags.add("h2"); //$NON-NLS-1$ 58 fgTags.add("h3"); //$NON-NLS-1$ 59 fgTags.add("h4"); //$NON-NLS-1$ 60 fgTags.add("h5"); //$NON-NLS-1$ 61 fgTags.add("p"); //$NON-NLS-1$ 62 fgTags.add("dl"); //$NON-NLS-1$ 63 fgTags.add("dt"); //$NON-NLS-1$ 64 fgTags.add("dd"); //$NON-NLS-1$ 65 fgTags.add("li"); //$NON-NLS-1$ 66 fgTags.add("ul"); //$NON-NLS-1$ 67 fgTags.add("pre"); //$NON-NLS-1$ 68 fgTags.add("head"); //$NON-NLS-1$ 69 70 fgEntityLookup= new HashMap<>(7); 71 fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$ 72 fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$ 73 fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$ 74 fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$ 75 fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$ 76 fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$ 77 fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$ 78 } 79 80 private int fCounter= 0; 81 private TextPresentation fTextPresentation; 82 private int fBold= 0; 83 private int fItalic= 0; 84 private int fStrikeout= 0; 85 private boolean fInParagraph= false; 86 private boolean fIgnore= false; 87 private boolean fHeaderDetected= false; 88 private StyleRange fCurrentStyleRange; 89 90 /** 91 * Transforms the HTML text from the reader to formatted text. 92 * 93 * @param reader the reader 94 * @param presentation If not <code>null</code>, formattings will be applied to 95 * the presentation. 96 */ HTML2TextReader(Reader reader, TextPresentation presentation)97 public HTML2TextReader(Reader reader, TextPresentation presentation) { 98 super(new PushbackReader(reader)); 99 fTextPresentation= presentation; 100 fCurrentStyleRange= new StyleRange(); 101 fCurrentStyleRange.start= 0; 102 } 103 104 @Override read()105 public int read() throws IOException { 106 int c= super.read(); 107 if (c != -1) 108 ++ fCounter; 109 return c; 110 } 111 startBold()112 protected void startBold() { 113 if (fBold == 0) { 114 finishAndReinitializeCurrentStyle(fCounter); 115 fCurrentStyleRange.fontStyle |= SWT.BOLD; 116 } 117 ++fBold; 118 } 119 startItalic()120 protected void startItalic() { 121 if (fItalic == 0) { 122 finishAndReinitializeCurrentStyle(fCounter); 123 fCurrentStyleRange.fontStyle |= SWT.ITALIC; 124 } 125 ++fItalic; 126 } 127 stopItalic()128 protected void stopItalic() { 129 --fItalic; 130 if (fItalic == 0) { 131 finishAndReinitializeCurrentStyle(fCounter); 132 fCurrentStyleRange.fontStyle ^= SWT.ITALIC; 133 } 134 } 135 stopBold()136 protected void stopBold() { 137 --fBold; 138 if (fBold == 0) { 139 finishAndReinitializeCurrentStyle(fCounter); 140 fCurrentStyleRange.fontStyle ^= SWT.BOLD; 141 } 142 } 143 startStrikeout()144 protected void startStrikeout() { 145 if (fStrikeout == 0) { 146 finishAndReinitializeCurrentStyle(fCounter); 147 fCurrentStyleRange.strikeout= true; 148 } 149 ++fStrikeout; 150 } 151 stopStrikeout()152 protected void stopStrikeout() { 153 --fStrikeout; 154 if (fStrikeout == 0) { 155 finishAndReinitializeCurrentStyle(fCounter); 156 fCurrentStyleRange.strikeout= false; 157 } 158 } 159 finishAndReinitializeCurrentStyle(int offset)160 private void finishAndReinitializeCurrentStyle(int offset) { 161 if (fTextPresentation != null && offset != fCurrentStyleRange.start && !isDefaultStyleRange(fCurrentStyleRange)) { 162 fCurrentStyleRange.length= offset - fCurrentStyleRange.start; 163 fTextPresentation.addStyleRange(fCurrentStyleRange); 164 } 165 fCurrentStyleRange= (StyleRange)fCurrentStyleRange.clone(); 166 fCurrentStyleRange.start= offset; 167 fCurrentStyleRange.length= 0; 168 } 169 isDefaultStyleRange(StyleRange styleRange)170 private static boolean isDefaultStyleRange(StyleRange styleRange) { 171 return styleRange.equals(new StyleRange(styleRange.start,styleRange.length, null, null)); 172 } 173 startPreformattedText()174 protected void startPreformattedText() { 175 setSkipWhitespace(false); 176 } 177 stopPreformattedText()178 protected void stopPreformattedText() { 179 setSkipWhitespace(true); 180 } 181 182 183 @Override computeSubstitution(int c)184 protected String computeSubstitution(int c) throws IOException { 185 186 if (c == '<') 187 return processHTMLTag(); 188 else if (fIgnore) 189 return EMPTY_STRING; 190 else if (c == '&') 191 return processEntity(); 192 193 return null; 194 } 195 html2Text(String html)196 private String html2Text(String html) { 197 198 if (html == null || html.isEmpty()) 199 return EMPTY_STRING; 200 201 html= html.toLowerCase(); 202 203 String tag= html; 204 if ('/' == tag.charAt(0)) 205 tag= tag.substring(1); 206 207 if (!fgTags.contains(tag)) 208 return EMPTY_STRING; 209 210 211 if ("pre".equals(html)) { //$NON-NLS-1$ 212 startPreformattedText(); 213 return EMPTY_STRING; 214 } 215 216 if ("/pre".equals(html)) { //$NON-NLS-1$ 217 stopPreformattedText(); 218 return EMPTY_STRING; 219 } 220 221 if ("i".equals(html) || "em".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ 222 startItalic(); 223 return EMPTY_STRING; 224 } 225 226 if ("/i".equals(html) || "/em".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ 227 stopItalic(); 228 return EMPTY_STRING; 229 } 230 231 if ("b".equals(html) || "strong".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ 232 startBold(); 233 return EMPTY_STRING; 234 } 235 236 if ("del".equals(html) || "s".equals(html) || "strike".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ 237 startStrikeout(); 238 return EMPTY_STRING; 239 } 240 241 if ((html.length() > 1 && html.charAt(0) == 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$ 242 startBold(); 243 return EMPTY_STRING; 244 } 245 246 if ("dl".equals(html)) //$NON-NLS-1$ 247 return LINE_DELIM; 248 249 if ("dd".equals(html)) //$NON-NLS-1$ 250 return "\t"; //$NON-NLS-1$ 251 252 if ("li".equals(html)) //$NON-NLS-1$ 253 // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682 254 return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$ 255 256 if ("/b".equals(html) || "/strong".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ 257 stopBold(); 258 return EMPTY_STRING; 259 } 260 261 if ("/del".equals(html) || "/s".equals(html) || "/strike".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ 262 stopStrikeout(); 263 return EMPTY_STRING; 264 } 265 266 if ("p".equals(html)) { //$NON-NLS-1$ 267 fInParagraph= true; 268 return LINE_DELIM; 269 } 270 271 if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ 272 return LINE_DELIM; 273 274 if ("/p".equals(html)) { //$NON-NLS-1$ 275 boolean inParagraph= fInParagraph; 276 fInParagraph= false; 277 return inParagraph ? EMPTY_STRING : LINE_DELIM; 278 } 279 280 if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$ 281 stopBold(); 282 return LINE_DELIM; 283 } 284 285 if ("/dd".equals(html)) //$NON-NLS-1$ 286 return LINE_DELIM; 287 288 if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$ 289 fHeaderDetected= true; 290 fIgnore= true; 291 return EMPTY_STRING; 292 } 293 294 if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$ 295 fIgnore= false; 296 return EMPTY_STRING; 297 } 298 299 return EMPTY_STRING; 300 } 301 302 /* 303 * A '<' has been read. Process a html tag 304 */ processHTMLTag()305 private String processHTMLTag() throws IOException { 306 307 StringBuilder buf= new StringBuilder(); 308 int ch; 309 do { 310 311 ch= nextChar(); 312 313 while (ch != -1 && ch != '>') { 314 buf.append(Character.toLowerCase((char) ch)); 315 ch= nextChar(); 316 if (ch == '"'){ 317 buf.append(Character.toLowerCase((char) ch)); 318 ch= nextChar(); 319 while (ch != -1 && ch != '"'){ 320 buf.append(Character.toLowerCase((char) ch)); 321 ch= nextChar(); 322 } 323 } 324 if (ch == '<' && !isInComment(buf)) { 325 unread(ch); 326 return '<' + buf.toString(); 327 } 328 } 329 330 if (ch == -1) 331 return null; 332 333 if (!isInComment(buf) || isCommentEnd(buf)) { 334 break; 335 } 336 // unfinished comment 337 buf.append((char) ch); 338 } while (true); 339 340 return html2Text(buf.toString()); 341 } 342 isInComment(StringBuilder buf)343 private static boolean isInComment(StringBuilder buf) { 344 return buf.length() >= 3 && "!--".equals(buf.substring(0, 3)); //$NON-NLS-1$ 345 } 346 isCommentEnd(StringBuilder buf)347 private static boolean isCommentEnd(StringBuilder buf) { 348 int tagLen= buf.length(); 349 return tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)); //$NON-NLS-1$ 350 } 351 unread(int ch)352 private void unread(int ch) throws IOException { 353 ((PushbackReader) getReader()).unread(ch); 354 } 355 entity2Text(String symbol)356 protected String entity2Text(String symbol) { 357 if (symbol.length() > 1 && symbol.charAt(0) == '#') { 358 int ch; 359 try { 360 if (symbol.charAt(1) == 'x') { 361 ch= Integer.parseInt(symbol.substring(2), 16); 362 } else { 363 ch= Integer.parseInt(symbol.substring(1), 10); 364 } 365 return EMPTY_STRING + (char)ch; 366 } catch (NumberFormatException e) { 367 } 368 } else { 369 String str= fgEntityLookup.get(symbol); 370 if (str != null) { 371 return str; 372 } 373 } 374 return "&" + symbol + ";"; // not found //$NON-NLS-1$ //$NON-NLS-2$ 375 } 376 377 /* 378 * A '&' has been read. Process a entity 379 */ processEntity()380 private String processEntity() throws IOException { 381 StringBuilder buf= new StringBuilder(); 382 int ch= nextChar(); 383 while (Character.isLetterOrDigit((char)ch) || ch == '#') { 384 buf.append((char) ch); 385 ch= nextChar(); 386 } 387 388 if (ch == ';') 389 return entity2Text(buf.toString()); 390 391 buf.insert(0, '&'); 392 if (ch != -1) 393 buf.append((char) ch); 394 return buf.toString(); 395 } 396 } 397