1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
3 * You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5"use strict";
6
// Names this module makes available to importers; only LanguageDetector
// is listed.
var EXPORTED_SYMBOLS = ["LanguageDetector"];
8
9const { clearTimeout, setTimeout } = ChromeUtils.import(
10  "resource://gre/modules/Timer.jsm"
11);
12
// Since Emscripten can handle heap growth, but not heap shrinkage, we
// need to refresh the worker after we've processed a particularly large
// string in order to prevent unnecessary resident memory growth.
//
// These values define the cut-off string length and the idle timeout
// (in milliseconds) before destroying a worker. Once a string of at
// least the cut-off length has been processed, the worker is marked for
// destruction, and is terminated as soon as it has been idle for the
// given timeout.

// Cut-off length, compared against the `length` of the submitted text.
// 1.5MB. This is the approximate string length that forces heap growth
// for a 2MB heap.
var LARGE_STRING = 1.5 * 1024 * 1024;

// Idle time, in milliseconds, after which a worker that is marked for
// destruction gets terminated (10 seconds).
var IDLE_TIMEOUT = 10 * 1000;

// URL of the script that the detection worker is created from.
const WORKER_URL = "resource:///modules/translation/cld-worker.js";
29
// Owns the language-detection worker: creates it lazily, pairs each
// posted request with the reply it produces, and terminates the worker
// once it has been idle for IDLE_TIMEOUT after handling a large string.
var workerManager = {
  // FIFO list of `{ resolve }` records, one per request that has been
  // posted to the worker and not answered yet. Every non-"ready" message
  // from the worker settles the oldest record, so replies are assumed to
  // arrive in the order the requests were posted.
  detectionQueue: [],

  /**
   * Send a detection request to the worker, creating the worker first if
   * there is none.
   *
   * @param {Object} aParams
   *        The request, posted to the worker as-is. `aParams.text.length`
   *        is compared with LARGE_STRING once the reply has arrived.
   * @returns {Promise}
   * @resolves With the `data` of the worker's reply message.
   * @rejects If the request could not be posted to the worker.
   */
  detectLanguage(aParams) {
    return this.workerReady
      .then(worker => {
        return new Promise(resolve => {
          // Post first, queue second. If postMessage throws (for example
          // because aParams cannot be cloned), this promise rejects and
          // no record is left behind. A stale record would consume the
          // next reply, leave a later request unanswered, and keep the
          // queue non-empty so that _flushWorker could never terminate
          // the worker. The reply is delivered through onmessage, which
          // cannot run before this executor has finished.
          worker.postMessage(aParams);
          this.detectionQueue.push({ resolve });
        });
      })
      .then(result => {
        // We have our asynchronous result from the worker.
        //
        // Determine if our input was large enough to trigger heap growth,
        // or if we're already waiting to destroy the worker when it's
        // idle. If so, (re)schedule termination after the idle timeout.
        if (aParams.text.length >= LARGE_STRING || this._idleTimeout != null) {
          this.flushWorker();
        }

        return result;
      });
  },

  // The current worker, or null when none exists.
  _worker: null,
  // Promise for the current worker becoming ready, or null when a new
  // worker has to be created on the next request.
  _workerReadyPromise: null,

  /**
   * Promise that resolves with the worker once it has announced itself
   * with a "ready" message. The worker is created on first access and
   * reused until _flushWorker discards it.
   *
   * NOTE(review): no error handler is installed on the worker, so if it
   * never posts "ready" this promise stays pending.
   */
  get workerReady() {
    if (!this._workerReadyPromise) {
      this._workerReadyPromise = new Promise(resolve => {
        let worker = new Worker(WORKER_URL);
        worker.onmessage = aMsg => {
          if (aMsg.data == "ready") {
            resolve(worker);
          } else {
            // Any other message is the reply to the oldest pending
            // request.
            this.detectionQueue.shift().resolve(aMsg.data);
          }
        };
        this._worker = worker;
      });
    }

    return this._workerReadyPromise;
  },

  // Holds the ID of the current pending idle cleanup setTimeout, or null
  // when no cleanup is scheduled.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout,
  // restarting the countdown if one is already running.
  flushWorker() {
    if (this._idleTimeout != null) {
      clearTimeout(this._idleTimeout);
    }

    this._idleTimeout = setTimeout(this._flushWorker.bind(this), IDLE_TIMEOUT);
  },

  // Immediately terminate the worker, as long as there are no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length) {
      this.flushWorker();
    } else {
      if (this._worker) {
        this._worker.terminate();
      }

      // Forget the worker so that the next request creates a fresh one.
      this._worker = null;
      this._workerReadyPromise = null;
      this._idleTimeout = null;
    }
  },
};
105
var LanguageDetector = {
  /**
   * Detect the language of a piece of text.
   *
   * Accepts either the text itself as a string, or a request object
   * with these properties:
   *
   *  - 'text' The text to analyze. Must be a UTF-16 JavaScript string,
   *      whatever the value of 'encoding'.
   *
   *  - 'isHTML' (optional) Boolean; when true the text is analyzed as
   *      HTML rather than as plain text.
   *
   *  - 'language' (optional) The expected language, as a string. For
   *      text extracted from HTTP documents this is expected to come
   *      from the Content-Language header.
   *
   *  - 'tld' (optional) The top-level domain of the document the text
   *      was extracted from, as a string.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *      document the text was extracted from.
   *
   * A bare string is treated as `{ text: <the string> }`; an object is
   * handed on unchanged.
   *
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with an object containing
   * these fields:
   *  - 'language' (string) A language code.
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) Up to three entries for the most prevalent
   *      languages detected. Each entry has a 'languageCode' property
   *      with the ISO language code, and a 'percent' property with the
   *      approximate percentage of the input that is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the language code 'un', giving the percentage of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    const request = typeof aParams == "string" ? { text: aParams } : aParams;

    return workerManager.detectLanguage(request);
  },
};
153