1 // Copyright 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_
6 #define UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_
7 
8 #include <memory>
9 #include <string>
10 #include <unordered_map>
11 #include <unordered_set>
12 #include <utility>
13 #include <vector>
14 
15 #include "base/macros.h"
16 #include "third_party/cld_3/src/src/nnet_language_identifier.h"
17 #include "ui/accessibility/ax_enums.mojom-forward.h"
18 #include "ui/accessibility/ax_export.h"
19 #include "ui/accessibility/ax_tree_observer.h"
20 
21 namespace ui {
22 
23 class AXNode;
24 class AXTree;
25 
26 // This module implements language detection enabling Chrome to automatically
27 // detect the language for runs of text within the page.
28 //
29 // Node-level language detection runs once per page after the load complete
30 // event. This involves two passes:
31 //   *Detect* walks the tree from the given root using cld3 to detect up to 3
32 //            potential languages per node. A ranked list is created enumerating
33 //            all potential languages on a page.
34 //   *Label* re-walks the tree, assigning a language to each node considering
35 //           the potential languages from the detect phase, page level
36 //           statistics, and the assigned languages of ancestor nodes.
37 //
38 // Optionally an embedder may run *sub-node* language detection which attempts
39 // to assign languages for runs of text within a node, potentially down to the
40 // individual character level. This is useful in cases where a single paragraph
41 // involves switching between multiple languages, and where the speech engine
42 // doesn't automatically switch voices to handle different character sets.
43 // Due to the potentially small lengths of text runs involved this tends to be
44 // lower in accuracy, and works best when a node is composed of multiple
45 // languages with easily distinguishable scripts.
46 
47 // AXLanguageInfo represents the local language detection data for all text
48 // within an AXNode. Stored on AXNode.
struct AX_EXPORT AXLanguageInfo {
  // Constructor and destructor are only declared here; their definitions are
  // not part of this header.
  AXLanguageInfo();
  ~AXLanguageInfo();

  // The final language assigned to this node during the 'label' step. It is
  // the result of merging:
  //  a) The detected language for this node
  //  b) The declared lang attribute on this node
  //  c) The (recursive) language of the parent (detected or declared).
  //
  // This will be the empty string if no language was assigned during the
  // label phase.
  //
  // IETF BCP 47 Language code (rfc5646).
  // examples:
  //  'de'
  //  'de-DE'
  //  'en'
  //  'en-US'
  //  'es-ES'
  //
  // This should not be read directly by clients of AXNode, instead clients
  // should call AXNode::GetLanguage().
  // TODO(chrishall): consider renaming this to `assigned_language`.
  std::string language;

  // Detected languages for this node, in the order returned by
  // FindTopNMostFreqLangs (decreasing order of probability), filtered to
  // remove any unreliable results.
  std::vector<std::string> detected_languages;
};
80 
81 // Each AXLanguageSpan contains a language, a probability, and start and end
82 // indices. The indices are used to specify the substring that contains the
83 // associated language. The string which the indices are relative to is not
84 // included in this structure.
// Also, the indices are relative to a UTF-8 string.
86 // See documentation on GetLanguageAnnotationForStringAttribute for details
87 // on how to associate this object with a string.
struct AX_EXPORT AXLanguageSpan {
  // Every scalar member has a default initializer so that a
  // default-constructed span never carries indeterminate values. The struct
  // remains an aggregate, so positional brace-initialization still works.

  // Index into the associated UTF-8 string at which this span starts.
  int start_index = 0;

  // Index into the associated UTF-8 string at which this span ends.
  int end_index = 0;

  // Language associated with the substring delimited by the indices.
  std::string language;

  // Probability associated with `language` for this span.
  float probability = 0.0f;
};
94 
95 // A single AXLanguageInfoStats instance is stored on each AXTree and contains
96 // statistics on detected languages for all the AXNodes in that tree.
97 //
98 // We rely on these tree-level statistics when labelling individual nodes, to
99 // provide extra signals to increase our confidence in assigning a detected
100 // language.
101 //
102 // These tree level statistics are also used to send reports on the language
103 // detection feature to enable tuning.
104 //
105 // The Label step will only assign a detected language to a node if that
106 // language is one of the most frequent languages on the page.
107 //
108 // For example, if a single node has detected_languages (in order of probability
109 // assigned by cld_3): da-DK, en-AU, fr-FR, but the page statistics overall
110 // indicate that the page is generally in en-AU and ja-JP, it is more likely to
111 // be a mis-recognition of Danish than an accurate assignment, so we assign
112 // en-AU instead of da-DK.
class AX_EXPORT AXLanguageInfoStats {
 public:
  AXLanguageInfoStats();
  ~AXLanguageInfoStats();

  // Each AXLanguageInfoStats is tied to a specific AXTree, copying is safe but
  // logically doesn't make sense, so copy construction and copy assignment are
  // deleted.
  AXLanguageInfoStats(const AXLanguageInfoStats&) = delete;
  AXLanguageInfoStats& operator=(const AXLanguageInfoStats&) = delete;

  // Adjust our statistics to add provided detected languages.
  void Add(const std::vector<std::string>& languages);

  // Fetch the score for a given language.
  int GetScore(const std::string& lang) const;

  // Check if a given language is within the top results.
  // Not const: see `top_results_valid_`, the cached top results may need to be
  // recalculated when they are fetched.
  bool CheckLanguageWithinTop(const std::string& lang);

  // Record statistics based on how we labelled a node.
  // We consider the language we labelled the node with, the language the author
  // assigned, and whether or not we assigned our highest confidence detection
  // result.
  void RecordLabelStatistics(const std::string& labelled_lang,
                             const std::string& author_lang,
                             bool labelled_with_first_result);

  // Update metrics to reflect we attempted to detect language for a node.
  void RecordDetectionAttempt();

  // Report metrics to UMA.
  // Reports statistics gathered since the last run; intended to be run once
  // per detect & label iteration.
  // If successful, will reset statistics.
  void ReportMetrics();

 private:
  // Allow access from a fixture only used in testing.
  friend class AXLanguageDetectionTestFixture;

  // Store a count of the occurrences of a given language.
  std::unordered_map<std::string, int> lang_counts_;

  // Cache of last calculated top language results.
  // A vector of pairs of (score, language) sorted by descending score.
  std::vector<std::pair<int, std::string>> top_results_;

  // Boolean recording that we have not mutated the statistics since last
  // calculating top results, setting this to false will cause recalculation
  // when the results are next fetched.
  bool top_results_valid_;

  // Invalidate the top results cache.
  void InvalidateTopResults();

  // Compute the top results and store them in cache.
  void GenerateTopResults();

  // TODO(chrishall): Do we want this for testing? or is it better to only test
  //  the generated metrics by inspecting the histogram?
  // Boolean used for testing metrics only, disables clearing of metrics.
  bool disable_metric_clearing_;

  // NOTE(review): presumably resets the statistics recorded for metric
  // reporting below, unless `disable_metric_clearing_` is set - confirm
  // against the implementation.
  void ClearMetrics();

  // *** Statistics recorded for metric reporting. ***
  // All statistics represent a single iteration of language detection and are
  // reset after each successful call of ReportMetrics.

  // The number of nodes we attempted detection on.
  int count_detection_attempted_;

  // The number of nodes we got detection results for.
  int count_detection_results_;

  // The number of nodes we assigned a label to.
  int count_labelled_;

  // The number of nodes we assigned a label to which was the highest
  // confidence detected language.
  int count_labelled_with_top_result_;

  // The number of times we labelled a language which disagreed with the node's
  // author provided language annotation.
  //
  // If we have
  //  <div lang='en'><span>...</span><span>...</span></div>
  // and we detect and label both spans as having language 'fr', then we count
  // this as `2` overrides.
  int count_overridden_;

  // Set of top language detected for every node, used to generate the unique
  // number of detected languages metric (LangsPerPage).
  std::unordered_set<std::string> unique_top_lang_detected_;
};
206 
207 // AXLanguageDetectionObserver is registered as a change observer on an AXTree
208 // and will run language detection after each update to the tree.
209 //
// We have kept this observer separate from the AXLanguageDetectionManager as we
// are aiming to launch language detection in two phases and wanted to try to
// keep the code paths somewhat separate.
213 //
214 // TODO(chrishall): After both features have launched we could consider merging
215 // AXLanguageDetectionObserver into AXLanguageDetectionManager.
216 //
217 // TODO(chrishall): Investigate the cost of using AXTreeObserver, given that it
218 // has many empty virtual methods which are called for every AXTree change and
219 // we are only currently interested in OnAtomicUpdateFinished.
220 class AX_EXPORT AXLanguageDetectionObserver : public ui::AXTreeObserver {
221  public:
222   // Observer constructor will register itself with the provided AXTree.
223   AXLanguageDetectionObserver(AXTree* tree);
224 
225   // Observer destructor will remove itself as an observer from the AXTree.
226   ~AXLanguageDetectionObserver() override;
227 
228   // AXLanguageDetectionObserver contains a pointer so copying is non-trivial.
229   AXLanguageDetectionObserver(const AXLanguageDetectionObserver&) = delete;
230   AXLanguageDetectionObserver& operator=(const AXLanguageDetectionObserver&) =
231       delete;
232 
233  private:
234   void OnAtomicUpdateFinished(ui::AXTree* tree,
235                               bool root_changed,
236                               const std::vector<Change>& changes) override;
237 
238   // Non-owning pointer to AXTree, used to de-register observer on destruction.
239   AXTree* const tree_;
240 };
241 
242 // AXLanguageDetectionManager manages all of the context needed for language
243 // detection within an AXTree.
class AX_EXPORT AXLanguageDetectionManager {
 public:
  // Construct an AXLanguageDetectionManager for the specified tree.
  explicit AXLanguageDetectionManager(AXTree* tree);
  ~AXLanguageDetectionManager();

  // AXLanguageDetectionManager contains pointers so copying is non-trivial.
  AXLanguageDetectionManager(const AXLanguageDetectionManager&) = delete;
  AXLanguageDetectionManager& operator=(const AXLanguageDetectionManager&) =
      delete;

  // Detect languages for each node in the tree managed by this manager.
  // This is the first pass in detection and labelling.
  // This only detects the language, it does not label it, for that see
  //  LabelLanguages.
  void DetectLanguages();

  // Label languages for each node in the tree managed by this manager.
  // This is the second pass in detection and labelling.
  // This will label the language, but relies on the earlier detection phase
  // having already completed.
  void LabelLanguages();

  // Sub-node language detection for a given string attribute.
  // For example, if a node has name: "My name is Fred", then calling
  // GetLanguageAnnotationForStringAttribute(*node, ax::mojom::StringAttribute::
  // kName) would return language detection information about "My name is Fred".
  // The indices in the returned AXLanguageSpans are relative to that string.
  std::vector<AXLanguageSpan> GetLanguageAnnotationForStringAttribute(
      const AXNode& node,
      ax::mojom::StringAttribute attr);

  // Construct and register a dynamic content change observer for this manager.
  void RegisterLanguageDetectionObserver();

 private:
  // The observer is a friend so it can reach the private helpers below.
  friend class AXLanguageDetectionObserver;

  // Allow access from a fixture only used in testing.
  friend class AXLanguageDetectionTestFixture;

  // Helper methods to test if language detection features are enabled.
  static bool IsStaticLanguageDetectionEnabled();
  static bool IsDynamicLanguageDetectionEnabled();

  // Perform detection for subtree rooted at subtree_root.
  void DetectLanguagesForSubtree(AXNode* subtree_root);
  // Perform detection for node. Will not descend into children.
  void DetectLanguagesForNode(AXNode* node);
  // Perform labelling for subtree rooted at subtree_root.
  void LabelLanguagesForSubtree(AXNode* subtree_root);
  // Perform labelling for node. Will not descend into children.
  void LabelLanguagesForNode(AXNode* node);

  // This language identifier is constructed with a default minimum byte length
  // of chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider and is
  // used for detecting page-level languages.
  chrome_lang_id::NNetLanguageIdentifier language_identifier_;

  // This language identifier is constructed with a minimum byte length of
  // kShortTextIdentifierMinByteLength so it can be used for detecting languages
  // of shorter text (e.g. one character).
  chrome_lang_id::NNetLanguageIdentifier short_text_language_identifier_;

  // The observer to support dynamic content language detection.
  // Owned by this manager.
  std::unique_ptr<AXLanguageDetectionObserver> language_detection_observer_;

  // Non-owning back pointer to the tree which owns this manager.
  AXTree* tree_;

  // Statistics on the languages detected for the nodes of `tree_`.
  AXLanguageInfoStats lang_info_stats_;
};
315 
316 }  // namespace ui
317 
318 #endif  // UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_
319