// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";

var EXPORTED_SYMBOLS = ["ReaderMode"];

// Eagerly imported modules; everything else below is loaded lazily.
const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm");
const { XPCOMUtils } = ChromeUtils.import(
  "resource://gre/modules/XPCOMUtils.jsm"
);

// Buckets reported to the READER_MODE_DOWNLOAD_RESULT histogram.
const DOWNLOAD_SUCCESS = 0;
const DOWNLOAD_ERROR_XHR = 1;
const DOWNLOAD_ERROR_NO_DOC = 2;

// Buckets reported to the READER_MODE_PARSE_RESULT histogram.
const PARSE_SUCCESS = 0;
const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
const PARSE_ERROR_WORKER = 2;
const PARSE_ERROR_NO_ARTICLE = 3;

// Class names that survive readerization, so that the rules in
// aboutReader.css which target them keep matching.
const CLASSES_TO_PRESERVE = [
  "caption",
  "emoji",
  "hidden",
  "invisible",
  "sr-only",
  "visually-hidden",
  "visuallyhidden",
  "wp-caption",
  "wp-caption-text",
  "wp-smiley",
];

XPCOMUtils.defineLazyGlobalGetters(this, ["XMLHttpRequest", "XMLSerializer"]);

// Modules that are only loaded on first use.
XPCOMUtils.defineLazyModuleGetters(this, {
  CommonUtils: "resource://services-common/utils.js",
  EventDispatcher: "resource://gre/modules/Messaging.jsm",
  OS: "resource://gre/modules/osfile.jsm",
  ReaderWorker: "resource://gre/modules/reader/ReaderWorker.jsm",
  LanguageDetector: "resource:///modules/translation/LanguageDetector.jsm",
  Readerable: "resource://gre/modules/Readerable.jsm",
});

// True when running inside desktop Firefox (matched by application ID).
const gIsFirefoxDesktop =
  Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";

Services.telemetry.setEventRecordingEnabled("readermode", true);

var ReaderMode = {
  // Version of the cache schema.
  CACHE_VERSION: 1,

  // Set to a truthy value to have log() write to stdout via dump().
  DEBUG: 0,

  // Timestamps (ms since epoch) used for the "time spent" telemetry.
  enterTime: undefined,
  leaveTime: undefined,

  /**
   * Enter the reader mode by going forward one step in history if applicable,
   * if not, append the about:reader page in the history instead.
   *
   * @param docShell The docshell of the page being readerized.
   * @param win      The content window of the page being readerized.
   */
  enterReaderMode(docShell, win) {
    this.enterTime = Date.now();

    Services.telemetry.recordEvent("readermode", "view", "on", null, {
      subcategory: "feature",
    });

    let url = win.document.location.href;
    let readerURL = "about:reader?url=" + encodeURIComponent(url);

    if (!Services.appinfo.sessionHistoryInParent) {
      let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
      let sh = webNav.sessionHistory;
      if (webNav.canGoForward) {
        let forwardEntry = sh.legacySHistory.getEntryAtIndex(sh.index + 1);
        let forwardURL = forwardEntry.URI.spec;
        // readerURL is always a non-empty string, so a plain comparison is
        // enough to decide whether the next history entry is already the
        // reader view of this page.
        if (forwardURL == readerURL) {
          webNav.goForward();
          return;
        }
      }
    }

    // This could possibly move to the parent. See bug 1664982.
    win.document.location = readerURL;
  },

  /**
   * Exit the reader mode by going back one step in history if applicable,
   * if not, append the original page in the history instead.
   *
   * @param docShell The docshell currently showing about:reader.
   * @param win      The about:reader content window.
   */
  leaveReaderMode(docShell, win) {
    this.leaveTime = Date.now();

    // Measured in seconds (whole number)
    let timeSpentInReaderMode = Math.floor(
      (this.leaveTime - this.enterTime) / 1000
    );

    // Measured as percentage (whole number)
    let scrollPosition = Math.floor(
      ((win.scrollY + win.innerHeight) / win.document.body.clientHeight) * 100
    );

    Services.telemetry.recordEvent("readermode", "view", "off", null, {
      subcategory: "feature",
      reader_time: `${timeSpentInReaderMode}`,
      scroll_position: `${scrollPosition}`,
    });

    let url = win.document.location.href;
    let originalURL = ReaderMode.getOriginalUrl(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);

    if (!Services.appinfo.sessionHistoryInParent) {
      let sh = webNav.sessionHistory;
      if (webNav.canGoBack) {
        let prevEntry = sh.legacySHistory.getEntryAtIndex(sh.index - 1);
        let prevURL = prevEntry.URI.spec;
        // Go back when the previous entry is the original page, or when no
        // original URL could be extracted from the about:reader URL.
        if (prevURL && (prevURL == originalURL || !originalURL)) {
          webNav.goBack();
          return;
        }
      }
    }

    let referrerURI, principal;
    try {
      referrerURI = Services.io.newURI(url);
      principal = Services.scriptSecurityManager.createContentPrincipal(
        referrerURI,
        win.document.nodePrincipal.originAttributes
      );
    } catch (e) {
      Cu.reportError(e);
      return;
    }
    let loadFlags = webNav.LOAD_FLAGS_DISALLOW_INHERIT_PRINCIPAL;
    let ReferrerInfo = Components.Constructor(
      "@mozilla.org/referrer-info;1",
      "nsIReferrerInfo",
      "init"
    );
    let loadURIOptions = {
      triggeringPrincipal: principal,
      loadFlags,
      referrerInfo: new ReferrerInfo(
        Ci.nsIReferrerInfo.EMPTY,
        true,
        referrerURI
      ),
    };
    // This could possibly move to the parent. See bug 1664982.
    webNav.loadURI(originalURL, loadURIOptions);
  },

  /**
   * Returns original URL from an about:reader URL.
   *
   * @param url An about:reader URL.
   * @return The original URL for the article, or null if we did not find
   *         a properly formatted about:reader URL.
   */
  getOriginalUrl(url) {
    if (!url.startsWith("about:reader?")) {
      return null;
    }

    // A fragment on the about:reader URL itself ("outer" hash) is carried
    // over to the original URL below.
    let outerHash = "";
    try {
      let uriObj = Services.io.newURI(url);
      url = uriObj.specIgnoringRef;
      outerHash = uriObj.ref;
    } catch (ex) {
      /* ignore, use the raw string */
    }

    let searchParams = new URLSearchParams(
      url.substring("about:reader?".length)
    );
    if (!searchParams.has("url")) {
      return null;
    }
    let originalUrl = searchParams.get("url");
    if (outerHash) {
      try {
        let uriObj = Services.io.newURI(originalUrl);
        uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
        originalUrl = uriObj.spec;
      } catch (ex) {
        /* ignore, return the original URL without the outer hash */
      }
    }
    return originalUrl;
  },

  /**
   * Returns an exposable URI object for the original URL embedded in an
   * about:reader URL.
   *
   * @param url An about:reader URL.
   * @return The URI returned by Services.io.createExposableURI, or null if
   *         the original URL is missing or cannot be fixed up / exposed.
   */
  getOriginalUrlObjectForDisplay(url) {
    let originalUrl = ReaderMode.getOriginalUrl(url);
    if (!originalUrl) {
      return null;
    }
    try {
      let uriObj = Services.uriFixup.getFixupURIInfo(originalUrl).preferredURI;
      return Services.io.createExposableURI(uriObj);
    } catch (ex) {
      return null;
    }
  },

  /**
   * Gets an article from a loaded browser's document. This method will not attempt
   * to parse certain URIs (e.g. about: URIs).
   *
   * Note that when the document's URI is not eligible this returns null
   * synchronously rather than a promise.
   *
   * @param doc A document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  parseDocument(doc) {
    if (
      !Readerable.shouldCheckUri(doc.documentURIObject) ||
      !Readerable.shouldCheckUri(doc.baseURIObject, true)
    ) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return this._readerParse(doc);
  },

  /**
   * Downloads and parses a document from a URL.
   *
   * @param url URL to download and parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects With whatever _downloadDocument rejects with (an error string,
   *          or an object of the form { newURL } when a redirect was hit).
   */
  async downloadAndParseDocument(url) {
    let doc = await this._downloadDocument(url);
    if (!doc) {
      return null;
    }
    if (
      !Readerable.shouldCheckUri(doc.documentURIObject) ||
      !Readerable.shouldCheckUri(doc.baseURIObject, true)
    ) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return this._readerParse(doc);
  },

  /**
   * Downloads a URL as a document via XMLHttpRequest.
   *
   * @param url URL to download.
   * @return null (synchronously) if the URL is invalid or not eligible for
   *         reader mode, otherwise a {Promise}.
   * @resolves The downloaded document.
   * @rejects An error string on failure, or { newURL } when the page
   *          redirected (via HTTP or meta refresh) to a different URL.
   */
  _downloadDocument(url) {
    try {
      if (!Readerable.shouldCheckUri(Services.io.newURI(url))) {
        return null;
      }
    } catch (ex) {
      Cu.reportError(
        new Error(`Couldn't create URI from ${url} to download: ${ex}`)
      );
      return null;
    }
    let histogram = Services.telemetry.getHistogramById(
      "READER_MODE_DOWNLOAD_RESULT"
    );
    return new Promise((resolve, reject) => {
      let xhr = new XMLHttpRequest();
      xhr.open("GET", url, true);
      xhr.onerror = evt => reject(evt.error);
      xhr.responseType = "document";
      xhr.onload = evt => {
        if (xhr.status !== 200) {
          reject("Reader mode XHR failed with status: " + xhr.status);
          histogram.add(DOWNLOAD_ERROR_XHR);
          return;
        }

        let doc = xhr.responseXML;
        if (!doc) {
          reject("Reader mode XHR didn't return a document");
          histogram.add(DOWNLOAD_ERROR_NO_DOC);
          return;
        }

        // Manually follow a meta refresh tag if one exists.
        let meta = doc.querySelector("meta[http-equiv=refresh]");
        let content = meta ? meta.getAttribute("content") : null;
        let urlIndex = content ? content.toUpperCase().indexOf("URL=") : -1;
        if (urlIndex > -1) {
          // `url` was already validated at the top of this function.
          let baseURI = Services.io.newURI(url);
          let newURI = null;
          try {
            newURI = Services.io.newURI(
              content.substring(urlIndex + 4),
              null,
              baseURI
            );
          } catch (ex) {
            // A malformed refresh target must not throw out of this event
            // handler, or the promise would never settle. Ignore the tag
            // and carry on with the document we have.
            this.log("Ignoring malformed meta refresh target: " + ex);
          }
          if (newURI) {
            let newURL = newURI.spec;
            let ssm = Services.scriptSecurityManager;
            let flags =
              ssm.LOAD_IS_AUTOMATIC_DOCUMENT_REPLACEMENT |
              ssm.DISALLOW_INHERIT_PRINCIPAL;
            try {
              ssm.checkLoadURIStrWithPrincipal(
                doc.nodePrincipal,
                newURL,
                flags
              );
            } catch (ex) {
              let errorMsg =
                "Reader mode disallowed meta refresh (reason: " + ex + ").";

              if (Services.prefs.getBoolPref("reader.errors.includeURLs")) {
                errorMsg += " Refresh target URI: '" + newURL + "'.";
              }
              reject(errorMsg);
              return;
            }
            // Otherwise, pass an object indicating our new URL:
            if (!baseURI.equalsExceptRef(newURI)) {
              reject({ newURL });
              return;
            }
          }
        }

        let responseURL = xhr.responseURL;
        let givenURL = url;
        // Convert these to real URIs to make sure the escaping (or lack
        // thereof) is identical:
        try {
          responseURL = Services.io.newURI(responseURL).specIgnoringRef;
        } catch (ex) {
          /* Ignore errors - we'll use what we had before */
        }
        try {
          givenURL = Services.io.newURI(givenURL).specIgnoringRef;
        } catch (ex) {
          /* Ignore errors - we'll use what we had before */
        }

        if (responseURL != givenURL) {
          // We were redirected without a meta refresh tag.
          // Force redirect to the correct place:
          reject({ newURL: xhr.responseURL });
          return;
        }
        resolve(doc);
        histogram.add(DOWNLOAD_SUCCESS);
      };
      xhr.send();
    });
  },

  /**
   * Retrieves an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects OS.File.Error
   */
  async getArticleFromCache(url) {
    let path = this._toHashedPath(url);
    try {
      let array = await OS.File.read(path);
      return JSON.parse(new TextDecoder().decode(array));
    } catch (e) {
      // A missing cache file just means "not cached"; anything else is a
      // genuine failure and is propagated to the caller.
      if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile) {
        throw e;
      }
      return null;
    }
  },

  /**
   * Stores an article in the cache.
   *
   * @param article JS object representing article.
   * @return {Promise}
   * @resolves When the article is stored.
   * @rejects OS.File.Error
   */
  async storeArticleInCache(article) {
    let array = new TextEncoder().encode(JSON.stringify(article));
    let path = this._toHashedPath(article.url);
    await this._ensureCacheDir();
    await OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" });

    // Announce the new cache entry. This is deliberately not awaited (the
    // article is already stored at this point), but failures are reported
    // instead of being left as an unhandled rejection.
    OS.File.stat(path)
      .then(info =>
        EventDispatcher.instance.sendRequest({
          type: "Reader:AddedToCache",
          url: article.url,
          size: info.size,
          path,
        })
      )
      .catch(Cu.reportError);
  },

  /**
   * Removes an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves When the article is removed.
   * @rejects OS.File.Error
   */
  async removeArticleFromCache(url) {
    let path = this._toHashedPath(url);
    await OS.File.remove(path);
  },

  /**
   * Writes a debug message to stdout when DEBUG is enabled.
   *
   * @param msg The message to log.
   */
  log(msg) {
    if (this.DEBUG) {
      dump("Reader: " + msg);
    }
  },

  /**
   * Attempts to parse a document into an article. Heavy lifting happens
   * in readerWorker.js.
   *
   * @param doc The document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  async _readerParse(doc) {
    let histogram = Services.telemetry.getHistogramById(
      "READER_MODE_PARSE_RESULT"
    );

    // The "reader.parse-node-limit" pref is exposed on this object as
    // `maxElemsToParse` (see the lazy preference getter below the object).
    // `parseNodeLimit` is still honoured in case a consumer sets it directly.
    let nodeLimit = this.parseNodeLimit || this.maxElemsToParse;
    if (nodeLimit) {
      let numTags = doc.getElementsByTagName("*").length;
      if (numTags > nodeLimit) {
        this.log(
          "Aborting parse for " +
            doc.baseURIObject.spec +
            "; " +
            numTags +
            " elements found"
        );
        histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
        return null;
      }
    }

    // Fetch this here before we send `doc` off to the worker thread, as later on the
    // document might be nuked but we will still want the URI.
    let { documentURI } = doc;

    let uriParam = {
      spec: doc.baseURIObject.spec,
      host: doc.baseURIObject.host,
      prePath: doc.baseURIObject.prePath,
      scheme: doc.baseURIObject.scheme,
      pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec,
    };

    // convert text/plain document, if any, to XHTML format
    if (this._isDocumentPlainText(doc)) {
      doc = this._convertPlainTextDocument(doc);
    }

    let serializer = new XMLSerializer();
    let serializedDoc = serializer.serializeToString(doc);
    // Explicitly null out doc to make it clear it might not be available from this
    // point on.
    doc = null;

    let options = {
      classesToPreserve: CLASSES_TO_PRESERVE,
    };

    let article = null;
    try {
      article = await ReaderWorker.post("parseDocument", [
        uriParam,
        serializedDoc,
        options,
      ]);
    } catch (e) {
      Cu.reportError("Error in ReaderWorker: " + e);
      histogram.add(PARSE_ERROR_WORKER);
      // Bail out here so a worker failure is recorded exactly once, rather
      // than also being counted as PARSE_ERROR_NO_ARTICLE below.
      return null;
    }

    if (!article) {
      this.log("Worker did not return an article");
      histogram.add(PARSE_ERROR_NO_ARTICLE);
      return null;
    }

    // Readability returns a URI object based on the baseURI, but we only care
    // about the original document's URL from now on. This also avoids spoofing
    // attempts where the baseURI doesn't match the domain of the documentURI
    article.url = documentURI;
    delete article.uri;

    let flags =
      Ci.nsIDocumentEncoder.OutputSelectionOnly |
      Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
    article.title = Cc["@mozilla.org/parserutils;1"]
      .getService(Ci.nsIParserUtils)
      .convertToPlainText(article.title, flags, 0);
    if (gIsFirefoxDesktop) {
      await this._assignLanguage(article);
      this._maybeAssignTextDirection(article);
    }

    this._assignReadTime(article);

    histogram.add(PARSE_SUCCESS);
    return article;
  },

  // Lazily created nsICryptoHash instance. The getter replaces itself with
  // the instance on first access.
  get _cryptoHash() {
    delete this._cryptoHash;
    return (this._cryptoHash = Cc[
      "@mozilla.org/security/hash;1"
    ].createInstance(Ci.nsICryptoHash));
  },

  // Lazily created nsIScriptableUnicodeConverter with its charset set to
  // "utf8". The getter replaces itself with the instance on first access.
  get _unicodeConverter() {
    delete this._unicodeConverter;
    this._unicodeConverter = Cc[
      "@mozilla.org/intl/scriptableunicodeconverter"
    ].createInstance(Ci.nsIScriptableUnicodeConverter);
    this._unicodeConverter.charset = "utf8";
    return this._unicodeConverter;
  },

  /**
   * Calculate the hashed path for a stripped article URL.
   *
   * @param url The article URL. This should have referrers removed.
   * @return The file path to the cached article.
   */
  _toHashedPath(url) {
    let value = this._unicodeConverter.convertToByteArray(url);
    this._cryptoHash.init(this._cryptoHash.MD5);
    this._cryptoHash.update(value, value.length);

    // The file name is the base32 form of the MD5 digest, cut at the first
    // "=" padding character.
    let hash = CommonUtils.encodeBase32(this._cryptoHash.finish(false));
    let fileName = hash.substring(0, hash.indexOf("=")) + ".json";
    return OS.Path.join(OS.Constants.Path.profileDir, "readercache", fileName);
  },

  /**
   * Ensures the cache directory exists.
   *
   * @return Promise
   * @resolves When the cache directory exists.
   * @rejects OS.File.Error
   */
  _ensureCacheDir() {
    let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache");
    return OS.File.exists(dir).then(exists => {
      if (!exists) {
        return OS.File.makeDir(dir);
      }
      return undefined;
    });
  },

  /**
   * Sets the article's language string value if the result is confident,
   * and null otherwise.
   *
   * @param article The article object; `language` is written on it.
   * @return Promise
   * @resolves when the language is detected
   */
  _assignLanguage(article) {
    return LanguageDetector.detectLanguage(article.textContent).then(result => {
      article.language = result.confident ? result.language : null;
    });
  },

  /**
   * Marks the article as right-to-left when it has no direction of its own
   * and its detected language is in the hardcoded list below.
   *
   * @param article The article object; `dir` may be written on it.
   */
  _maybeAssignTextDirection(article) {
    // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
    if (
      !article.dir &&
      ["ar", "fa", "he", "ug", "ur"].includes(article.language)
    ) {
      article.dir = "rtl";
    }
  },

  /**
   * Assigns the estimated reading time range of the article to the article object.
   *
   * @param article the article object to assign the reading time estimate to.
   */
  _assignReadTime(article) {
    let lang = article.language || "en";
    const readingSpeed = this._getReadingSpeedForLanguage(lang);
    const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
    const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
    const length = article.length;

    article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
    article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh);
  },

  /**
   * Returns the reading speed of a selection of languages with likely variance.
   *
   * Reading speed estimated from a study done on reading speeds in various languages.
   * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
   *
   * @param lang Language code to look up.
   * @return object with characters per minute and variance. Defaults to English
   *         if no suitable language is found in the collection.
   */
  _getReadingSpeedForLanguage(lang) {
    const readingSpeed = new Map([
      ["en", { cpm: 987, variance: 118 }],
      ["ar", { cpm: 612, variance: 88 }],
      ["de", { cpm: 920, variance: 86 }],
      ["es", { cpm: 1025, variance: 127 }],
      ["fi", { cpm: 1078, variance: 121 }],
      ["fr", { cpm: 998, variance: 126 }],
      ["he", { cpm: 833, variance: 130 }],
      ["it", { cpm: 950, variance: 140 }],
      ["jw", { cpm: 357, variance: 56 }],
      ["nl", { cpm: 978, variance: 143 }],
      ["pl", { cpm: 916, variance: 126 }],
      ["pt", { cpm: 913, variance: 145 }],
      ["ru", { cpm: 986, variance: 175 }],
      ["sk", { cpm: 885, variance: 145 }],
      ["sv", { cpm: 917, variance: 156 }],
      ["tr", { cpm: 1054, variance: 156 }],
      ["zh", { cpm: 255, variance: 29 }],
    ]);

    return readingSpeed.get(lang) || readingSpeed.get("en");
  },

  /**
   * Check if the document to be parsed is text document.
   *
   * @param doc the doc object to be parsed.
   * @return boolean
   */
  _isDocumentPlainText(doc) {
    return doc.contentType == "text/plain";
  },

  /**
   * The document to be parsed is text document and is converted to HTML format.
   *
   * Blank-line separated chunks of the <pre> text become <p> elements and
   * every line inside a chunk is followed by a <br>.
   *
   * @param doc the doc object to be parsed.
   * @return A converted deep clone of the document element, or `doc` itself
   *         when it contains no <pre> element to convert.
   */
  _convertPlainTextDocument(doc) {
    let preTag = doc.querySelector("pre");
    if (!preTag) {
      // Nothing to convert; hand the document back untouched.
      return doc;
    }

    let docFrag = doc.createDocumentFragment();
    let content = preTag.textContent;
    let paragraphs = content.split(/\r?\n\r?\n/);
    for (let para of paragraphs) {
      let pElem = doc.createElement("p");
      // Accept both LF and CRLF line endings, matching the paragraph split
      // above, so no stray "\r" ends up in the text.
      let lines = para.split(/\r?\n/);
      for (let line of lines) {
        pElem.append(line);
        let brElem = doc.createElement("br");
        pElem.append(brElem);
      }
      docFrag.append(pElem);
    }
    // Clone the document to avoid the original document being affected
    // (which shows up when exiting reader mode again).
    let clone = doc.documentElement.cloneNode(true);
    clone.querySelector("pre").replaceWith(docFrag);
    return clone;
  },
};

// Exposes the "reader.parse-node-limit" pref (default 0) as
// ReaderMode.maxElemsToParse.
XPCOMUtils.defineLazyPreferenceGetter(
  ReaderMode,
  "maxElemsToParse",
  "reader.parse-node-limit",
  0
);