1// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
2/* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
4 * You can obtain one at http://mozilla.org/MPL/2.0/. */
5"use strict";
6
// The only symbol this module exports.
var EXPORTED_SYMBOLS = ["ReaderMode"];

// Constants for telemetry.
// Values added to the READER_MODE_DOWNLOAD_RESULT histogram by
// ReaderMode._downloadDocument().
const DOWNLOAD_SUCCESS = 0;
const DOWNLOAD_ERROR_XHR = 1;
const DOWNLOAD_ERROR_NO_DOC = 2;

// Values added to the READER_MODE_PARSE_RESULT histogram by
// ReaderMode._readerParse().
const PARSE_SUCCESS = 0;
const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
const PARSE_ERROR_WORKER = 2;
const PARSE_ERROR_NO_ARTICLE = 3;

// Class names to preserve in the readerized output. We preserve these class
// names so that rules in aboutReader.css can match them.
// The list is handed to the reader worker as the `classesToPreserve` option.
const CLASSES_TO_PRESERVE = [
  "caption",
  "emoji",
  "hidden",
  "invisible",
  "sr-only",
  "visually-hidden",
  "visuallyhidden",
  "wp-caption",
  "wp-caption-text",
  "wp-smiley",
];
33
34const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm");
35const { XPCOMUtils } = ChromeUtils.import(
36  "resource://gre/modules/XPCOMUtils.jsm"
37);
38
39XPCOMUtils.defineLazyGlobalGetters(this, ["XMLHttpRequest", "XMLSerializer"]);
40
41ChromeUtils.defineModuleGetter(
42  this,
43  "CommonUtils",
44  "resource://services-common/utils.js"
45);
46ChromeUtils.defineModuleGetter(
47  this,
48  "EventDispatcher",
49  "resource://gre/modules/Messaging.jsm"
50);
51ChromeUtils.defineModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
52ChromeUtils.defineModuleGetter(
53  this,
54  "ReaderWorker",
55  "resource://gre/modules/reader/ReaderWorker.jsm"
56);
57ChromeUtils.defineModuleGetter(
58  this,
59  "LanguageDetector",
60  "resource:///modules/translation/LanguageDetector.jsm"
61);
62ChromeUtils.defineModuleGetter(
63  this,
64  "Readerable",
65  "resource://gre/modules/Readerable.jsm"
66);
67
// True when the running application's ID equals the GUID below (per the
// constant's name, the Firefox desktop application ID). _readerParse() only
// runs language detection and text-direction assignment when this is true.
const gIsFirefoxDesktop =
  Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";

// Turn on recording for the "readermode" telemetry event category as soon as
// this module is loaded; enterReaderMode()/leaveReaderMode() record into it.
Services.telemetry.setEventRecordingEnabled("readermode", true);
72
var ReaderMode = {
  // Version of the cache schema.
  CACHE_VERSION: 1,

  // When truthy, log() dumps its messages; 0 keeps logging off.
  DEBUG: 0,

  // Date.now() timestamps captured by enterReaderMode() and
  // leaveReaderMode(); used for the time-spent telemetry.
  enterTime: undefined,
  leaveTime: undefined,

  /**
   * Enter the reader mode by going forward one step in history if applicable,
   * if not, append the about:reader page in the history instead.
   *
   * Also records the "view on" telemetry event and remembers the entry time.
   *
   * @param docShell The docshell to navigate (queried for nsIWebNavigation).
   * @param win The window whose document should be shown in reader mode.
   */
  enterReaderMode(docShell, win) {
    this.enterTime = Date.now();

    Services.telemetry.recordEvent("readermode", "view", "on", null, {
      subcategory: "feature",
    });

    let url = win.document.location.href;
    let readerURL = "about:reader?url=" + encodeURIComponent(url);

    if (!Services.appinfo.sessionHistoryInParent) {
      let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
      let sh = webNav.sessionHistory;
      if (webNav.canGoForward) {
        let forwardEntry = sh.legacySHistory.getEntryAtIndex(sh.index + 1);
        let forwardURL = forwardEntry.URI.spec;
        // `readerURL` is always a non-empty string here, so a plain equality
        // check is all that is needed to recognise the reader page.
        if (forwardURL == readerURL) {
          webNav.goForward();
          return;
        }
      }
    }

    // This could possibly move to the parent. See bug 1664982.
    win.document.location = readerURL;
  },

  /**
   * Exit the reader mode by going back one step in history if applicable,
   * if not, append the original page in the history instead.
   *
   * Also records the "view off" telemetry event, including the time spent in
   * reader mode and how far down the page the user had scrolled.
   *
   * @param docShell The docshell to navigate (queried for nsIWebNavigation).
   * @param win The window currently showing the about:reader page.
   */
  leaveReaderMode(docShell, win) {
    this.leaveTime = Date.now();

    // Measured in seconds (whole number)
    let timeSpentInReaderMode = Math.floor(
      (this.leaveTime - this.enterTime) / 1000
    );

    // Measured as percentage (whole number)
    let scrollPosition = Math.floor(
      ((win.scrollY + win.innerHeight) / win.document.body.clientHeight) * 100
    );

    Services.telemetry.recordEvent("readermode", "view", "off", null, {
      subcategory: "feature",
      reader_time: `${timeSpentInReaderMode}`,
      scroll_position: `${scrollPosition}`,
    });

    let url = win.document.location.href;
    let originalURL = ReaderMode.getOriginalUrl(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);

    if (!Services.appinfo.sessionHistoryInParent) {
      let sh = webNav.sessionHistory;
      if (webNav.canGoBack) {
        let prevEntry = sh.legacySHistory.getEntryAtIndex(sh.index - 1);
        let prevURL = prevEntry.URI.spec;
        // Go back when the previous entry is the original page, or when no
        // original URL could be extracted from the about:reader URL at all.
        if (prevURL && (prevURL == originalURL || !originalURL)) {
          webNav.goBack();
          return;
        }
      }
    }

    // Otherwise load the original page as a new entry, using a content
    // principal and referrer built from the current (about:reader) URL.
    let referrerURI, principal;
    try {
      referrerURI = Services.io.newURI(url);
      principal = Services.scriptSecurityManager.createContentPrincipal(
        referrerURI,
        win.document.nodePrincipal.originAttributes
      );
    } catch (e) {
      Cu.reportError(e);
      return;
    }
    let loadFlags = webNav.LOAD_FLAGS_DISALLOW_INHERIT_PRINCIPAL;
    let ReferrerInfo = Components.Constructor(
      "@mozilla.org/referrer-info;1",
      "nsIReferrerInfo",
      "init"
    );
    let loadURIOptions = {
      triggeringPrincipal: principal,
      loadFlags,
      referrerInfo: new ReferrerInfo(
        Ci.nsIReferrerInfo.EMPTY,
        true,
        referrerURI
      ),
    };
    // This could possibly move to the parent. See bug 1664982.
    webNav.loadURI(originalURL, loadURIOptions);
  },

  /**
   * Returns original URL from an about:reader URL.
   *
   * A fragment on the about:reader URL itself (the "outer hash") is moved
   * onto the returned article URL.
   *
   * @param url An about:reader URL.
   * @return The original URL for the article, or null if we did not find
   *         a properly formatted about:reader URL.
   */
  getOriginalUrl(url) {
    if (!url.startsWith("about:reader?")) {
      return null;
    }

    let outerHash = "";
    try {
      let uriObj = Services.io.newURI(url);
      url = uriObj.specIgnoringRef;
      outerHash = uriObj.ref;
    } catch (ex) {
      /* ignore, use the raw string */
    }

    let searchParams = new URLSearchParams(
      url.substring("about:reader?".length)
    );
    if (!searchParams.has("url")) {
      return null;
    }
    let originalUrl = searchParams.get("url");
    if (outerHash) {
      try {
        let uriObj = Services.io.newURI(originalUrl);
        uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
        originalUrl = uriObj.spec;
      } catch (ex) {
        // Best effort: if the article URL cannot be parsed, return it
        // without the outer hash applied.
      }
    }
    return originalUrl;
  },

  /**
   * Returns a URI object for the article behind an about:reader URL that is
   * suitable for showing to the user.
   *
   * @param url An about:reader URL.
   * @return The exposable nsIURI for the original article, or null if the URL
   *         is not an about:reader URL or the article URL cannot be fixed up
   *         or made exposable.
   */
  getOriginalUrlObjectForDisplay(url) {
    let originalUrl = ReaderMode.getOriginalUrl(url);
    if (originalUrl) {
      let uriObj;
      try {
        uriObj = Services.uriFixup.getFixupURIInfo(originalUrl).preferredURI;
      } catch (ex) {
        return null;
      }
      try {
        return Services.io.createExposableURI(uriObj);
      } catch (ex) {
        return null;
      }
    }
    return null;
  },

  /**
   * Gets an article from a loaded browser's document. This method will not attempt
   * to parse certain URIs (e.g. about: URIs).
   *
   * Note: when the document or base URI is rejected by Readerable, this
   * returns null directly rather than a promise.
   *
   * @param doc A document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  parseDocument(doc) {
    if (
      !Readerable.shouldCheckUri(doc.documentURIObject) ||
      !Readerable.shouldCheckUri(doc.baseURIObject, true)
    ) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return this._readerParse(doc);
  },

  /**
   * Downloads and parses a document from a URL.
   *
   * @param url URL to download and parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects With whatever _downloadDocument() rejects with (see there).
   */
  async downloadAndParseDocument(url) {
    let doc = await this._downloadDocument(url);
    if (!doc) {
      return null;
    }
    if (
      !Readerable.shouldCheckUri(doc.documentURIObject) ||
      !Readerable.shouldCheckUri(doc.baseURIObject, true)
    ) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return this._readerParse(doc);
  },

  /**
   * Downloads a URL as a document via XMLHttpRequest.
   *
   * Returns null directly (not a promise) when the URL cannot be turned into
   * a URI or Readerable says it should not be checked.
   *
   * @param url URL to download.
   * @return {Promise}
   * @resolves The downloaded document.
   * @rejects With `evt.error` on an XHR error event; with a message string
   *          when the status is not 200, no document was returned, or a meta
   *          refresh was disallowed; or with `{ newURL }` when the caller
   *          should retry at a different URL (meta refresh or redirect).
   */
  _downloadDocument(url) {
    try {
      if (!Readerable.shouldCheckUri(Services.io.newURI(url))) {
        return null;
      }
    } catch (ex) {
      Cu.reportError(
        new Error(`Couldn't create URI from ${url} to download: ${ex}`)
      );
      return null;
    }
    let histogram = Services.telemetry.getHistogramById(
      "READER_MODE_DOWNLOAD_RESULT"
    );
    return new Promise((resolve, reject) => {
      let xhr = new XMLHttpRequest();
      xhr.open("GET", url, true);
      xhr.onerror = evt => reject(evt.error);
      xhr.responseType = "document";
      xhr.onload = evt => {
        if (xhr.status !== 200) {
          reject("Reader mode XHR failed with status: " + xhr.status);
          histogram.add(DOWNLOAD_ERROR_XHR);
          return;
        }

        let doc = xhr.responseXML;
        if (!doc) {
          reject("Reader mode XHR didn't return a document");
          histogram.add(DOWNLOAD_ERROR_NO_DOC);
          return;
        }

        // Manually follow a meta refresh tag if one exists.
        let meta = doc.querySelector("meta[http-equiv=refresh]");
        if (meta) {
          let content = meta.getAttribute("content");
          if (content) {
            let urlIndex = content.toUpperCase().indexOf("URL=");
            if (urlIndex > -1) {
              let baseURI = Services.io.newURI(url);
              // Skip the 4 characters of "URL=" and resolve the target
              // against the URL we were asked to download.
              let newURI = Services.io.newURI(
                content.substring(urlIndex + 4),
                null,
                baseURI
              );
              let newURL = newURI.spec;
              let ssm = Services.scriptSecurityManager;
              let flags =
                ssm.LOAD_IS_AUTOMATIC_DOCUMENT_REPLACEMENT |
                ssm.DISALLOW_INHERIT_PRINCIPAL;
              try {
                ssm.checkLoadURIStrWithPrincipal(
                  doc.nodePrincipal,
                  newURL,
                  flags
                );
              } catch (ex) {
                let errorMsg =
                  "Reader mode disallowed meta refresh (reason: " + ex + ").";

                if (Services.prefs.getBoolPref("reader.errors.includeURLs")) {
                  errorMsg += " Refresh target URI: '" + newURL + "'.";
                }
                reject(errorMsg);
                return;
              }
              // Otherwise, pass an object indicating our new URL:
              if (!baseURI.equalsExceptRef(newURI)) {
                reject({ newURL });
                return;
              }
            }
          }
        }
        let responseURL = xhr.responseURL;
        let givenURL = url;
        // Convert these to real URIs to make sure the escaping (or lack
        // thereof) is identical:
        try {
          responseURL = Services.io.newURI(responseURL).specIgnoringRef;
        } catch (ex) {
          /* Ignore errors - we'll use what we had before */
        }
        try {
          givenURL = Services.io.newURI(givenURL).specIgnoringRef;
        } catch (ex) {
          /* Ignore errors - we'll use what we had before */
        }

        if (responseURL != givenURL) {
          // We were redirected without a meta refresh tag.
          // Force redirect to the correct place:
          reject({ newURL: xhr.responseURL });
          return;
        }
        resolve(doc);
        histogram.add(DOWNLOAD_SUCCESS);
      };
      xhr.send();
    });
  },

  /**
   * Retrieves an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects OS.File.Error
   */
  async getArticleFromCache(url) {
    let path = this._toHashedPath(url);
    try {
      let array = await OS.File.read(path);
      return JSON.parse(new TextDecoder().decode(array));
    } catch (e) {
      // Only a missing cache file means "not cached"; everything else
      // (including JSON parse errors) is passed on to the caller.
      if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile) {
        throw e;
      }
      return null;
    }
  },

  /**
   * Stores an article in the cache.
   *
   * @param article JS object representing article.
   * @return {Promise}
   * @resolves When the article is stored.
   * @rejects OS.File.Error
   */
  async storeArticleInCache(article) {
    let array = new TextEncoder().encode(JSON.stringify(article));
    let path = this._toHashedPath(article.url);
    await this._ensureCacheDir();
    await OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" });

    // Not awaited: the promise returned to the caller settles as soon as the
    // file is written, regardless of how the notification below fares.
    OS.File.stat(path).then(info => {
      return EventDispatcher.instance.sendRequest({
        type: "Reader:AddedToCache",
        url: article.url,
        size: info.size,
        path,
      });
    });
  },

  /**
   * Removes an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves When the article is removed.
   * @rejects OS.File.Error
   */
  async removeArticleFromCache(url) {
    let path = this._toHashedPath(url);
    await OS.File.remove(path);
  },

  /**
   * Dumps a message prefixed with "Reader: " when DEBUG is truthy.
   *
   * @param msg The message to log.
   */
  log(msg) {
    if (this.DEBUG) {
      dump("Reader: " + msg);
    }
  },

  /**
   * Attempts to parse a document into an article. Heavy lifting happens
   * in readerWorker.js.
   *
   * Exactly one sample is added to the READER_MODE_PARSE_RESULT histogram
   * per call.
   *
   * @param doc The document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  async _readerParse(doc) {
    let histogram = Services.telemetry.getHistogramById(
      "READER_MODE_PARSE_RESULT"
    );
    // The "reader.parse-node-limit" pref is installed on this object under
    // the name `maxElemsToParse`. The previously-read `parseNodeLimit` is
    // never defined by this module; it is still honoured as a fallback in
    // case an outside caller assigned it.
    let nodeLimit = this.maxElemsToParse || this.parseNodeLimit;
    if (nodeLimit) {
      let numTags = doc.getElementsByTagName("*").length;
      if (numTags > nodeLimit) {
        this.log(
          "Aborting parse for " +
            doc.baseURIObject.spec +
            "; " +
            numTags +
            " elements found"
        );
        histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
        return null;
      }
    }

    // Fetch this here before we send `doc` off to the worker thread, as later on the
    // document might be nuked but we will still want the URI.
    let { documentURI } = doc;

    let uriParam = {
      spec: doc.baseURIObject.spec,
      host: doc.baseURIObject.host,
      prePath: doc.baseURIObject.prePath,
      scheme: doc.baseURIObject.scheme,
      pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec,
    };

    // convert text/plain document, if any, to XHTML format
    if (this._isDocumentPlainText(doc)) {
      doc = this._convertPlainTextDocument(doc);
    }

    let serializer = new XMLSerializer();
    let serializedDoc = serializer.serializeToString(doc);
    // Explicitly null out doc to make it clear it might not be available from this
    // point on.
    doc = null;

    let options = {
      classesToPreserve: CLASSES_TO_PRESERVE,
    };

    let article = null;
    try {
      article = await ReaderWorker.post("parseDocument", [
        uriParam,
        serializedDoc,
        options,
      ]);
    } catch (e) {
      Cu.reportError("Error in ReaderWorker: " + e);
      histogram.add(PARSE_ERROR_WORKER);
      // Stop here so a worker failure is not additionally counted as
      // PARSE_ERROR_NO_ARTICLE below.
      return null;
    }

    if (!article) {
      this.log("Worker did not return an article");
      histogram.add(PARSE_ERROR_NO_ARTICLE);
      return null;
    }

    // Readability returns a URI object based on the baseURI, but we only care
    // about the original document's URL from now on. This also avoids spoofing
    // attempts where the baseURI doesn't match the domain of the documentURI
    article.url = documentURI;
    delete article.uri;

    // Reduce the title to plain text.
    let flags =
      Ci.nsIDocumentEncoder.OutputSelectionOnly |
      Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
    article.title = Cc["@mozilla.org/parserutils;1"]
      .getService(Ci.nsIParserUtils)
      .convertToPlainText(article.title, flags, 0);
    if (gIsFirefoxDesktop) {
      await this._assignLanguage(article);
      this._maybeAssignTextDirection(article);
    }

    this._assignReadTime(article);

    histogram.add(PARSE_SUCCESS);
    return article;
  },

  // Lazily-created nsICryptoHash instance; the getter replaces itself with
  // the created instance on first access.
  get _cryptoHash() {
    delete this._cryptoHash;
    return (this._cryptoHash = Cc[
      "@mozilla.org/security/hash;1"
    ].createInstance(Ci.nsICryptoHash));
  },

  // Lazily-created nsIScriptableUnicodeConverter with its charset set to
  // "utf8"; the getter replaces itself with the instance on first access.
  get _unicodeConverter() {
    delete this._unicodeConverter;
    this._unicodeConverter = Cc[
      "@mozilla.org/intl/scriptableunicodeconverter"
    ].createInstance(Ci.nsIScriptableUnicodeConverter);
    this._unicodeConverter.charset = "utf8";
    return this._unicodeConverter;
  },

  /**
   * Calculate the hashed path for a stripped article URL.
   *
   * The file name is the base32-encoded MD5 digest of the URL (with the "="
   * padding cut off) plus ".json", inside the profile's "readercache"
   * directory.
   *
   * @param url The article URL. This should have referrers removed.
   * @return The file path to the cached article.
   */
  _toHashedPath(url) {
    let value = this._unicodeConverter.convertToByteArray(url);
    this._cryptoHash.init(this._cryptoHash.MD5);
    this._cryptoHash.update(value, value.length);

    let hash = CommonUtils.encodeBase32(this._cryptoHash.finish(false));
    // Keep only the part before the first "=" padding character.
    let fileName = hash.substring(0, hash.indexOf("=")) + ".json";
    return OS.Path.join(OS.Constants.Path.profileDir, "readercache", fileName);
  },

  /**
   * Ensures the cache directory exists.
   *
   * @return Promise
   * @resolves When the cache directory exists.
   * @rejects OS.File.Error
   */
  _ensureCacheDir() {
    let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache");
    return OS.File.exists(dir).then(exists => {
      if (!exists) {
        return OS.File.makeDir(dir);
      }
      return undefined;
    });
  },

  /**
   * Sets `article.language` to the detected language of the article text if
   * the detection result is confident, and to null otherwise.
   *
   * @param article The article object; its `textContent` is analysed.
   * @return Promise
   * @resolves when the language is detected
   */
  _assignLanguage(article) {
    return LanguageDetector.detectLanguage(article.textContent).then(result => {
      article.language = result.confident ? result.language : null;
    });
  },

  /**
   * Marks the article as right-to-left when it has no direction yet and its
   * language is one of the hardcoded RTL language codes.
   *
   * @param article The article object; `dir` may be set to "rtl".
   */
  _maybeAssignTextDirection(article) {
    // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
    if (
      !article.dir &&
      ["ar", "fa", "he", "ug", "ur"].includes(article.language)
    ) {
      article.dir = "rtl";
    }
  },

  /**
   * Assigns the estimated reading time range of the article to the article object.
   *
   * Sets `readingTimeMinsSlow` and `readingTimeMinsFast` (whole minutes,
   * rounded up) from `article.length` and the reading speed for
   * `article.language`, treating a missing language as "en".
   *
   * @param article the article object to assign the reading time estimate to.
   */
  _assignReadTime(article) {
    let lang = article.language || "en";
    const readingSpeed = this._getReadingSpeedForLanguage(lang);
    const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
    const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
    const length = article.length;

    article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
    article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh);
  },

  /**
   * Returns the reading speed of a selection of languages with likely variance.
   *
   * Reading speed estimated from a study done on reading speeds in various languages.
   * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
   *
   * @param lang The language code to look up.
   * @return object with characters per minute and variance. Defaults to English
   *         if no suitable language is found in the collection.
   */
  _getReadingSpeedForLanguage(lang) {
    const readingSpeed = new Map([
      ["en", { cpm: 987, variance: 118 }],
      ["ar", { cpm: 612, variance: 88 }],
      ["de", { cpm: 920, variance: 86 }],
      ["es", { cpm: 1025, variance: 127 }],
      ["fi", { cpm: 1078, variance: 121 }],
      ["fr", { cpm: 998, variance: 126 }],
      ["he", { cpm: 833, variance: 130 }],
      ["it", { cpm: 950, variance: 140 }],
      ["jw", { cpm: 357, variance: 56 }],
      ["nl", { cpm: 978, variance: 143 }],
      ["pl", { cpm: 916, variance: 126 }],
      ["pt", { cpm: 913, variance: 145 }],
      ["ru", { cpm: 986, variance: 175 }],
      ["sk", { cpm: 885, variance: 145 }],
      ["sv", { cpm: 917, variance: 156 }],
      ["tr", { cpm: 1054, variance: 156 }],
      ["zh", { cpm: 255, variance: 29 }],
    ]);

    return readingSpeed.get(lang) || readingSpeed.get("en");
  },

  /**
   * Check if the document to be parsed is text document.
   *
   * @param doc the doc object to be parsed.
   * @return boolean True when the document's content type is "text/plain".
   */
  _isDocumentPlainText(doc) {
    return doc.contentType == "text/plain";
  },

  /**
   * The document to be parsed is text document and is converted to HTML format.
   *
   * The text of the document's first <pre> element is split into paragraphs
   * at blank lines; each paragraph becomes a <p> whose lines are each
   * followed by a <br>.
   *
   * @param doc the doc object to be parsed.
   * @return A clone of the document's root element with the <pre> replaced
   *         by the generated paragraphs; the original document is untouched.
   */
  _convertPlainTextDocument(doc) {
    let preTag = doc.querySelector("pre");
    let docFrag = doc.createDocumentFragment();
    let content = preTag.textContent;
    let paragraphs = content.split(/\r?\n\r?\n/);
    for (let para of paragraphs) {
      let pElem = doc.createElement("p");
      let lines = para.split(/\n/);
      for (let line of lines) {
        pElem.append(line);
        let brElem = doc.createElement("br");
        pElem.append(brElem);
      }
      docFrag.append(pElem);
    }
    // Clone the document to avoid the original document being affected
    // (which shows up when exiting reader mode again).
    let clone = doc.documentElement.cloneNode(true);
    clone.querySelector("pre").replaceWith(docFrag);
    return clone;
  },
};
704
// Install a lazy getter named `maxElemsToParse` on ReaderMode that is backed
// by the "reader.parse-node-limit" preference, with 0 passed as the default
// value.
XPCOMUtils.defineLazyPreferenceGetter(
  ReaderMode,
  "maxElemsToParse",
  "reader.parse-node-limit",
  0
);
711