1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5"use strict";
6
7var EXPORTED_SYMBOLS = ["WebsiteMetadata"];
8
9ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
10
11ChromeUtils.defineModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm");
12ChromeUtils.defineModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
13
var WebsiteMetadata = {
  /**
   * Extract metadata from the document and report it to the embedder.
   *
   * Only the image_url, provider and description_length rules are evaluated.
   * When at least one of them yields a value, a 'Website:Metadata' message
   * carrying the JSON-serialized metadata is sent through EventDispatcher.
   * When nothing was extracted, no message is sent.
   *
   * @param doc The document to inspect. It is handed to getMetadata() and its
   *            location.href is used as the page location in the message.
   */
  parseAsynchronously: function(doc) {
    Task.spawn(function() {
      const location = doc.location.href;
      const metadata = getMetadata(doc, location, {
        image_url: metadataRules.image_url,
        provider: metadataRules.provider,
        description_length: metadataRules.description_length
      });

      // getMetadata() creates one key per requested rule even when the rule
      // matched nothing (the value is then undefined), so counting keys can
      // never detect "nothing extracted". Inspect the values instead.
      const hasMetadata = Object.values(metadata).some(value => value !== undefined);
      if (!hasMetadata) {
        // No metadata was extracted, so don't bother sending it.
        return;
      }

      const msg = {
        type: "Website:Metadata",
        location,
        // Always a real boolean, never undefined or an empty string.
        hasImage: Boolean(metadata.image_url),
        metadata: JSON.stringify(metadata),
      };

      EventDispatcher.instance.sendRequest(msg);
    });
  }
};
43
44// #################################################################################################
45// # Modified version of makeUrlAbsolute() to not import url parser library (and dependencies)
46// #################################################################################################
47
/**
 * Resolve a possibly relative URL by round-tripping it through the href
 * property of a freshly created <a> element of the context's document.
 *
 * @param context Object whose `doc` property provides createElement().
 * @param relative The URL string to resolve.
 * @return Whatever the anchor reports as its href after the assignment.
 */
function makeUrlAbsolute(context, relative) {
    const {doc} = context;
    const anchor = doc.createElement("a");
    anchor.href = relative;
    return anchor.href;
}
53
54// #################################################################################################
55// # page-metadata-parser
56// # https://github.com/mozilla/page-metadata-parser/
57// # 61c58cbd0f0bf2153df832a388a79c66b288b98c
58// #################################################################################################
59
/**
 * Build an extractor for one metadata field.
 *
 * @param name Flavor name under which matches are recorded; also the name of
 *             the metadata field.
 * @param rules Array of [selector, handler] pairs, highest priority first.
 *              Each handler receives the matched node and returns the raw
 *              value (stored as the fact's notes).
 * @param processors Optional array of functions (value, context) => value
 *                   applied in order to the raw value.
 * @return A function (doc, context) that returns the processed value of the
 *         best-scoring match, trimmed if it has a trim() method, or undefined
 *         when there is no match or the value is missing or falsy.
 */
function buildRuleset(name, rules, processors) {
  // Reverse so that a rule's index can double as its score: the first rule in
  // `rules` ends up with the highest index and therefore wins in kb.max().
  const reversedRules = Array.from(rules).reverse();
  const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
    dom(query),
    node => [{
      score: order,
      flavor: name,
      notes: handler(node),
    }]
  )));

  return (doc, context) => {
    const kb = builtRuleset.score(doc);
    const maxNode = kb.max(name);
    if (!maxNode) {
      return undefined;
    }

    let value = maxNode.flavors.get(name);

    // The handler yields null when the matched element lacks the attribute
    // (e.g. a <meta> without "content"). Running processors on that would
    // either throw (description.length, keywords.split) or fabricate a value
    // (resolving null into a ".../null" URL), so treat it as "no value".
    if (value === null || value === undefined) {
      return undefined;
    }

    if (processors) {
      for (const processor of processors) {
        value = processor(value, context);
      }
    }

    if (!value) {
      return undefined;
    }
    return value.trim ? value.trim() : value;
  };
}
93
// [selector, handler] pairs for the page description, highest priority first.
// Every handler reads the "content" attribute of the matched element.
const descriptionRules = [
  'meta[property="og:description"]',
  'meta[name="description"]',
].map(selector => [selector, node => node.element.getAttribute("content")]);
98
// Handlers shared by the rules below: read one attribute off the matched element.
const readContentAttribute = node => node.element.getAttribute("content");
const readHrefAttribute = node => node.element.getAttribute("href");

// Processor shared by the URL-valued fields.
const resolveAgainstDocument = (url, context) => makeUrlAbsolute(context, url);

// Extraction rules per metadata field. Each entry lists [selector, handler]
// pairs, highest priority first, plus optional processors that are applied to
// the extracted value.
const metadataRules = {
  description: {
    rules: descriptionRules,
  },

  description_length: {
    rules: descriptionRules,
    processors: [
      description => description.length,
    ],
  },

  icon_url: {
    rules: [
      ['link[rel="apple-touch-icon"]', readHrefAttribute],
      ['link[rel="apple-touch-icon-precomposed"]', readHrefAttribute],
      ['link[rel="icon"]', readHrefAttribute],
      ['link[rel="fluid-icon"]', readHrefAttribute],
      ['link[rel="shortcut icon"]', readHrefAttribute],
      ['link[rel="Shortcut Icon"]', readHrefAttribute],
      ['link[rel="mask-icon"]', readHrefAttribute],
    ],
    processors: [resolveAgainstDocument],
  },

  image_url: {
    rules: [
      ['meta[property="og:image:secure_url"]', readContentAttribute],
      ['meta[property="og:image:url"]', readContentAttribute],
      ['meta[property="og:image"]', readContentAttribute],
      ['meta[property="twitter:image"]', readContentAttribute],
      ['meta[name="thumbnail"]', readContentAttribute],
    ],
    processors: [resolveAgainstDocument],
  },

  keywords: {
    rules: [
      ['meta[name="keywords"]', readContentAttribute],
    ],
    processors: [
      // Split the comma-separated list and strip whitespace around each entry.
      keywords => keywords.split(",").map(keyword => keyword.trim()),
    ],
  },

  title: {
    rules: [
      ['meta[property="og:title"]', readContentAttribute],
      ['meta[property="twitter:title"]', readContentAttribute],
      ['meta[name="hdl"]', readContentAttribute],
      ["title", node => node.element.text],
    ],
  },

  type: {
    rules: [
      ['meta[property="og:type"]', readContentAttribute],
    ],
  },

  url: {
    rules: [
      ['meta[property="og:url"]', readContentAttribute],
      ['link[rel="canonical"]', readHrefAttribute],
    ],
  },

  provider: {
    rules: [
      ['meta[property="og:site_name"]', readContentAttribute],
    ],
  },
};
176
/**
 * Evaluate a set of metadata rules against a document.
 *
 * @param doc The document handed to every built rule.
 * @param url The page URL; exposed to processors as context.url.
 * @param rules Optional rule set; falls back to metadataRules when falsy.
 *              Entries whose `rules` property is not an array are treated as
 *              nested rule sets and evaluated recursively.
 * @return Object with one property per key of the rule set.
 */
function getMetadata(doc, url, rules) {
  const context = {url, doc};
  const ruleSet = rules || metadataRules;
  const metadata = {};

  for (const [metadataKey, metadataRule] of Object.entries(ruleSet)) {
    if (!Array.isArray(metadataRule.rules)) {
      // Not a leaf rule: recurse into the nested group.
      metadata[metadataKey] = getMetadata(doc, url, metadataRule);
      continue;
    }
    const extract = buildRuleset(metadataKey, metadataRule.rules, metadataRule.processors);
    metadata[metadataKey] = extract(doc, context);
  }

  return metadata;
}
195
196// #################################################################################################
197// # Fathom dependencies resolved
198// #################################################################################################
199
200// const {forEach} = require('wu');
// Call fn once for every value the iterable produces, in iteration order.
// Note the argument order: callback first, iterable second.
function forEach(fn, obj) {
    for (const value of obj) {
        fn(value);
    }
}
206
// Return the item of the iterable whose key, as computed by by(item), beats
// every key seen before it according to isBetter(candidateKey, currentKey).
// The first item always becomes the initial winner; on ties the earlier item
// is kept. Throws if the iterable produces no items.
function best(iterable, by, isBetter) {
    let winner;
    let winnerKey;
    let sawAny = false;

    for (const candidate of iterable) {
        const candidateKey = by(candidate);
        // isBetter is consulted for every item, including the first one
        // (where winnerKey is still undefined).
        const beatsWinner = isBetter(candidateKey, winnerKey);
        if (beatsWinner || !sawAny) {
            winner = candidate;
            winnerKey = candidateKey;
            sawAny = true;
        }
    }

    if (!sawAny) {
        throw new Error("Tried to call best() on empty iterable");
    }
    return winner;
}
225
226// const {max} = require('./utils');
// Return the item of the iterable with the largest key, where the key is
// by(item). Without `by`, items are compared by their own value. On ties the
// earliest item wins; an empty iterable makes best() throw.
//
// The default used to be `identity`, which is never defined in this file, so
// calling max() without `by` raised a ReferenceError. An inline identity
// function keeps the block self-contained.
function max(iterable, by = item => item) {
    return best(iterable, by, (a, b) => a > b);
}
230
231// #################################################################################################
232// # Fathom
233// # https://github.com/mozilla/fathom
234// # cac59e470816f17fc1efd4a34437b585e3e451cd
235// #################################################################################################
236
// Look up key in map. If the key is absent, call defaultMaker(), store its
// result under the key, and return that freshly stored value. defaultMaker is
// not called when the key is already present.
function getDefault(map, key, defaultMaker) {
    if (!map.has(key)) {
        const freshValue = defaultMaker();
        map.set(key, freshValue);
        return freshValue;
    }
    return map.get(key);
}
246
247
// Build a filtration network out of the given rules. The returned object has
// a single method, score(tree), which runs the rules over a DOM tree or
// subtree and returns the resulting knowledgebase.
function ruleset(...rules) {
    // Map{inputFlavor -> [rule, ...]}: each rule is filed under the flavor of
    // fact it consumes.
    const rulesByInputFlavor = new Map();
    for (const filedRule of rules) {
        getDefault(rulesByInputFlavor, filedRule.source.inputFlavor, () => []).push(filedRule);
    }

    return {
        // The "rank" half of rank-and-yank: walk the tree, accumulating
        // scores and notes for interesting elements in a knowledgebase, and
        // return that knowledgebase.
        score(tree) {
            const kb = knowledgebase();

            // Work list of [node, flavor] facts that still need rules run
            // over them. It is seeded with the whole tree, wrapped in a
            // special node, under the flavor "dom".
            const pending = [[{tree}, "dom"]];

            // Keep digesting until no rule produces anything new. Rules run
            // in no particular guaranteed order.
            while (pending.length > 0) {
                const [inNode, inFlavor] = pending.pop();
                const applicableRules = getDefault(rulesByInputFlavor, inFlavor, () => []);

                for (const applicableRule of applicableRules) {
                    for (const fact of resultsOf(applicableRule, inNode, inFlavor, kb)) {
                        const outNode = kb.nodeForElement(fact.element);

                        // The score is multiplied in every time, whether or
                        // not this flavor was emitted for the node before.
                        // That lets additional rules refine a node's score
                        // without rewiring the chain of flavors. There is
                        // exactly one score per node; anything fancier can be
                        // kept in notes. Rankers can only add to the
                        // knowledgebase, never remove from it, so the order
                        // in which they run does not matter.
                        outNode.score *= fact.score;

                        // Notes of a flavor are immutable once written: if
                        // the node already carries this flavor, the new notes
                        // are dropped, so that the execution order of
                        // same-flavored rules cannot matter. Nothing is
                        // queued in that case either, because re-running the
                        // rules on an unchanged fact would only repeat work
                        // (assuming deterministic rankers).
                        if (outNode.flavors.has(fact.flavor)) {
                            continue;
                        }
                        outNode.flavors.set(fact.flavor, fact.notes);
                        // TODO: better encapsulation rather than indexing explicitly
                        kb.indexNodeByFlavor(outNode, fact.flavor);
                        pending.push([outNode, fact.flavor]);
                    }
                }
            }
            return kb;
        },
    };
}
329
330
// Construct a container for storing and querying facts, where a fact has a
// flavor (used to dispatch further rules upon), a corresponding DOM element, a
// score, and some other arbitrary notes opaque to fathom.
function knowledgebase() {
    // Map{'texty' -> [NodeA],
    //     'spiffy' -> [NodeA, NodeB]}
    // NodeA = {element: <someElement>,
    //
    //          // Global nodewide score. Add custom ones with notes if you
    //          // want.
    //          score: 8,
    //
    //          // Flavors is a map of flavor names to notes:
    //          flavors: Map{'texty' -> {ownText: 'blah',
    //                                   someOtherNote: 'foo',
    //                                   someCustomScore: 10},
    //                       // This is an empty note:
    //                       'fluffy' -> undefined}}
    const nodesByFlavor = new Map();
    const nodesByElement = new Map();

    return {
        // Return the "node" (our own data structure that we control) that
        // corresponds to a given DOM element, creating one if necessary. New
        // nodes start with score 1 and no flavors.
        nodeForElement: function(element) {
            return getDefault(nodesByElement,
                              element,
                              () => ({element,
                                      score: 1,
                                      flavors: new Map()}));
        },

        // Return the highest-scored node of the given flavor, undefined if
        // there is none.
        max: function(flavor) {
            const nodes = nodesByFlavor.get(flavor);
            // nodesOfFlavor() files an empty list under any flavor it is
            // asked about, so "no nodes" can show up either as a missing
            // entry or as an empty array. Both must yield undefined; handing
            // an empty list to max() would throw.
            if (nodes === undefined || nodes.length === 0) {
                return undefined;
            }
            return max(nodes, node => node.score);
        },

        // Let the KB know that a new flavor has been added to an element.
        indexNodeByFlavor: function(node, flavor) {
            getDefault(nodesByFlavor, flavor, () => []).push(node);
        },

        // Return the list of nodes indexed under the given flavor. An empty
        // list is created and stored if the flavor has not been seen yet.
        nodesOfFlavor: function(flavor) {
            return getDefault(nodesByFlavor, flavor, () => []);
        }
    };
}
380
381
// Run a rule (as built by rule()) against one fact and return the facts it
// produces. Rules whose source is a flavor condition receive the node and its
// flavor; every other rule is treated as a dom() rule and receives the node
// and the knowledgebase.
function resultsOf(rule, node, flavor, kb) {
    // If more types of rule pop up someday, do fancier dispatching here.
    if (rule.source.flavor === "flavor") {
        return resultsOfFlavorRule(rule, node, flavor);
    }
    return resultsOfDomRule(rule, node, kb);
}
388
389
// Evaluate a dom() rule: query the tree stored on the special root "dom" node
// with the rule's selector, run the rule's ranker on the knowledgebase node
// of every match, and yield the resulting facts. Facts that do not name an
// element are attributed to the matched element; facts without a flavor are
// an error.
function* resultsOfDomRule(rule, specialDomNode, kb) {
    // Use the special "tree" property of the special starting node:
    const matches = specialDomNode.tree.querySelectorAll(rule.source.selector);

    // Copy the matches into a real array so they can be walked with for...of
    // regardless of whether the returned list is itself iterable.
    for (const element of Array.from(matches)) {
        const rankerOutput = rule.ranker(kb.nodeForElement(element));
        for (const fact of explicitFacts(rankerOutput)) {
            if (fact.element === undefined) {
                fact.element = element;
            }
            if (fact.flavor === undefined) {
                throw new Error("Rankers of dom() rules must return a flavor in each fact. Otherwise, there is no way for that fact to be used later.");
            }
            yield fact;
        }
    }
}
410
411
// Evaluate a flavor() rule: run the rule's ranker on the given node and yield
// the resulting facts, filling in whatever the ranker left out.
function* resultsOfFlavorRule(rule, node, flavor) {
    const rankerOutput = rule.ranker(node);

    for (const fact of explicitFacts(rankerOutput)) {
        // A fact that names no element is about the node we passed in, and a
        // fact that names no flavor keeps the flavor that triggered the rule.
        if (fact.element === undefined) {
            fact.element = node.element;
        }
        if (fact.flavor === undefined) {
            fact.flavor = flavor;
        }
        yield fact;
    }
}
427
428
// Normalize what a ranker returned into a sequence of facts that all carry a
// score.
//
// A ranker may return undefined ("no facts"), a single fact, or an array of
// facts. Facts without a score get score 1, which leaves a node's score
// unchanged when multiplied in. The fact objects are modified in place.
function* explicitFacts(rankerResult) {
    if (rankerResult === undefined) {
        return;
    }

    const facts = Array.isArray(rankerResult) ? rankerResult : [rankerResult];
    for (const fact of facts) {
        if (fact.score === undefined) {
            fact.score = 1;
        }
        yield fact;
    }
}
443
444
445// TODO: For the moment, a lot of responsibility is on the rankers to return a
446// pretty big data structure of up to 4 properties. This is a bit verbose for
447// an arrow function (as I hope we can use most of the time) and the usual case
448// will probably be returning just a score multiplier. Make that case more
449// concise.
450
451// TODO: It is likely that rankers should receive the notes of their input type
452// as a 2nd arg, for brevity.
453
454
// Build a condition that matches elements of the original DOM tree by
// selector.
//
// For consistency, Nodes will still be delivered to the transformers, but
// they'll have empty flavors and score = 1.
//
// Like flavor(), this constructor does not run any query itself. It only
// returns a plain, introspectable description for the query engine to read,
// so that a query planner can look at all conditions together and decide
// which indices to use (perhaps a heap by each dimension's score and a hash
// by flavor name, for starters). Someday that may make things like
// rule(and(tag('p'), klass('snork')), ...) possible.
function dom(selector) {
    const kind = "dom";
    return {
        flavor: kind,
        inputFlavor: kind,
        selector: selector
    };
}
475
476
// Build a condition that selects knowledgebase nodes carrying the given
// flavor. The result is a plain description object; rules built from it are
// filed under inputFlavor.
function flavor(inputFlavor) {
    const condition = {flavor: "flavor", inputFlavor: inputFlavor};
    return condition;
}
484
485
// Pair a condition (as built by dom() or flavor()) with the ranker function
// to run on whatever the condition matches.
function rule(source, ranker) {
    const pairing = {source: source, ranker: ranker};
    return pairing;
}
492