/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

var EXPORTED_SYMBOLS = ["WebsiteMetadata"];

ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");

ChromeUtils.defineModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm");
ChromeUtils.defineModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");

var WebsiteMetadata = {
  /**
   * Asynchronously parse the document and extract metadata. A
   * 'Website:Metadata' event with the metadata will be sent, unless nothing
   * at all could be extracted.
   *
   * Only the image_url, provider and description_length rules are evaluated.
   *
   * @param {Document} doc - The document to extract metadata from.
   */
  parseAsynchronously: function(doc) {
    Task.spawn(function() {
      const location = doc.location.href;
      const metadata = getMetadata(doc, location, {
        image_url: metadataRules.image_url,
        provider: metadataRules.provider,
        description_length: metadataRules.description_length,
      });

      // No metadata was extracted, so don't bother sending it. getMetadata()
      // stores a key for every rule even when the rule produced nothing, so
      // the values have to be inspected rather than the number of keys.
      if (Object.values(metadata).every(value => value === undefined)) {
        return;
      }

      const msg = {
        type: "Website:Metadata",
        location,
        // Always a real boolean, never undefined or "".
        hasImage: Boolean(metadata.image_url),
        metadata: JSON.stringify(metadata),
      };

      EventDispatcher.instance.sendRequest(msg);
    });
  }
};

// #################################################################################################
// # Modified version of makeUrlAbsolute() to not import url parser library (and dependencies)
// #################################################################################################

/**
 * Resolve a URL by assigning it to the href of a scratch <a> element created
 * from context.doc and reading the href property back.
 *
 * @param {{doc: Document}} context - Extraction context; only `doc` is used.
 * @param {string} relative - The URL to resolve.
 * @returns {string} The value of the anchor's href property.
 */
function makeUrlAbsolute(context, relative) {
  const anchor = context.doc.createElement("a");
  anchor.href = relative;
  return anchor.href;
}

// #################################################################################################
// # page-metadata-parser
// # https://github.com/mozilla/page-metadata-parser/
// # 61c58cbd0f0bf2153df832a388a79c66b288b98c
// #################################################################################################

/**
 * Build an extractor function for a single metadata field.
 *
 * @param {string} name - Flavor name under which extracted values are stored.
 * @param {Array} rules - [selector, handler] pairs, most preferred first.
 *   Each handler receives the knowledgebase node and returns the raw value.
 * @param {Array<Function>} [processors] - Functions (value, context) => value
 *   applied in order to the raw value of the winning node.
 * @returns {Function} (doc, context) => extracted value, or undefined when
 *   nothing usable was found.
 */
function buildRuleset(name, rules, processors) {
  // The index in the reversed list is used as the score, so rules listed
  // earlier in `rules` get the larger score.
  const reversedRules = Array.from(rules).reverse();
  const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
    dom(query),
    node => [{
      score: order,
      flavor: name,
      notes: handler(node),
    }]
  )));

  return (doc, context) => {
    const kb = builtRuleset.score(doc);
    const maxNode = kb.max(name);

    if (!maxNode) {
      return undefined;
    }

    let value = maxNode.flavors.get(name);

    // The matched element may lack the attribute being read (getAttribute()
    // returns null) or carry an empty one. Processors assume a non-empty
    // string: ".length" / ".split" would throw on null, and resolving null
    // or "" as a URL would yield a bogus address. Treat both as "no value".
    if (value === null || value === undefined || value === "") {
      return undefined;
    }

    if (processors) {
      for (const processor of processors) {
        value = processor(value, context);
      }
    }

    if (!value) {
      return undefined;
    }

    return value.trim ? value.trim() : value;
  };
}

const descriptionRules = [
  ['meta[property="og:description"]', node => node.element.getAttribute("content")],
  ['meta[name="description"]', node => node.element.getAttribute("content")],
];

// Rule definitions per metadata field. `rules` are [selector, handler] pairs
// in order of preference; `processors` post-process the winning raw value.
const metadataRules = {
  description: {
    rules: descriptionRules
  },

  description_length: {
    rules: descriptionRules,
    processors: [
      (description) => description.length
    ]
  },

  icon_url: {
    rules: [
      ['link[rel="apple-touch-icon"]', node => node.element.getAttribute("href")],
      ['link[rel="apple-touch-icon-precomposed"]', node => node.element.getAttribute("href")],
      ['link[rel="icon"]', node => node.element.getAttribute("href")],
      ['link[rel="fluid-icon"]', node => node.element.getAttribute("href")],
      ['link[rel="shortcut icon"]', node => node.element.getAttribute("href")],
      ['link[rel="Shortcut Icon"]', node => node.element.getAttribute("href")],
      ['link[rel="mask-icon"]', node => node.element.getAttribute("href")],
    ],
    processors: [
      (icon_url, context) => makeUrlAbsolute(context, icon_url)
    ]
  },

  image_url: {
    rules: [
      ['meta[property="og:image:secure_url"]', node => node.element.getAttribute("content")],
      ['meta[property="og:image:url"]', node => node.element.getAttribute("content")],
      ['meta[property="og:image"]', node => node.element.getAttribute("content")],
      ['meta[property="twitter:image"]', node => node.element.getAttribute("content")],
      ['meta[name="thumbnail"]', node => node.element.getAttribute("content")],
    ],
    processors: [
      (image_url, context) => makeUrlAbsolute(context, image_url)
    ],
  },

  keywords: {
    rules: [
      ['meta[name="keywords"]', node => node.element.getAttribute("content")],
    ],
    processors: [
      (keywords) => keywords.split(",").map((keyword) => keyword.trim()),
    ]
  },

  title: {
    rules: [
      ['meta[property="og:title"]', node => node.element.getAttribute("content")],
      ['meta[property="twitter:title"]', node => node.element.getAttribute("content")],
      ['meta[name="hdl"]', node => node.element.getAttribute("content")],
      ["title", node => node.element.text],
    ],
  },

  type: {
    rules: [
      ['meta[property="og:type"]', node => node.element.getAttribute("content")],
    ],
  },

  url: {
    rules: [
      ['meta[property="og:url"]', node => node.element.getAttribute("content")],
      ['link[rel="canonical"]', node => node.element.getAttribute("href")],
    ],
  },

  provider: {
    rules: [
      ['meta[property="og:site_name"]', node => node.element.getAttribute("content")]
    ]
  },
};

/**
 * Evaluate a set of metadata rules against a document.
 *
 * @param {Document} doc - The document to query.
 * @param {string} url - URL of the document; passed to processors through
 *   the context object.
 * @param {Object} [rules] - Map of field name to rule definition. Defaults
 *   to metadataRules. An entry whose `rules` property is not an array is
 *   treated as a nested rule map and evaluated recursively.
 * @returns {Object} One key per entry in the rule map; the value is
 *   undefined when the rule produced nothing.
 */
function getMetadata(doc, url, rules) {
  const metadata = {};
  const context = {url, doc};
  const ruleSet = rules || metadataRules;

  for (const metadataKey of Object.keys(ruleSet)) {
    const metadataRule = ruleSet[metadataKey];

    if (Array.isArray(metadataRule.rules)) {
      const builtRule = buildRuleset(metadataKey, metadataRule.rules, metadataRule.processors);
      metadata[metadataKey] = builtRule(doc, context);
    } else {
      metadata[metadataKey] = getMetadata(doc, url, metadataRule);
    }
  }

  return metadata;
}

// #################################################################################################
// # Fathom dependencies resolved
// #################################################################################################

// const {forEach} = require('wu');
/**
 * Call fn once for every item yielded by the iterable obj.
 */
function forEach(fn, obj) {
  for (const x of obj) {
    fn(x);
  }
}

/**
 * Return the item of an iterable whose key is best.
 *
 * @param {Iterable} iterable - Items to choose from.
 * @param {Function} by - Maps an item to its comparison key.
 * @param {Function} isBetter - (key, bestKeySoFar) => true if key wins.
 * @returns {*} The first item whose key was not beaten by a later one.
 * @throws {Error} If the iterable yields no items.
 */
function best(iterable, by, isBetter) {
  let bestSoFar, bestKeySoFar;
  let isFirst = true;
  forEach(
    function(item) {
      const key = by(item);
      if (isFirst || isBetter(key, bestKeySoFar)) {
        bestSoFar = item;
        bestKeySoFar = key;
        isFirst = false;
      }
    },
    iterable);
  if (isFirst) {
    throw new Error("Tried to call best() on empty iterable");
  }
  return bestSoFar;
}

/**
 * Return the argument unchanged. Default key function for max().
 */
function identity(x) {
  return x;
}

// const {max} = require('./utils');
/**
 * Return the item with the greatest key; ties go to the earliest item.
 *
 * @param {Iterable} iterable - Items to choose from.
 * @param {Function} [by] - Maps an item to its key; defaults to the item
 *   itself.
 * @throws {Error} If the iterable yields no items.
 */
function max(iterable, by = identity) {
  return best(iterable, by, (a, b) => a > b);
}

// #################################################################################################
// # Fathom
// # https://github.com/mozilla/fathom
// # cac59e470816f17fc1efd4a34437b585e3e451cd
// #################################################################################################

// Get a key of a map, first setting it to a default value if it's missing.
function getDefault(map, key, defaultMaker) {
  if (map.has(key)) {
    return map.get(key);
  }
  const defaultValue = defaultMaker();
  map.set(key, defaultValue);
  return defaultValue;
}


// Construct a filtration network of rules.
function ruleset(...rules) {
  const rulesByInputFlavor = new Map(); // [someInputFlavor: [rule, ...]]

  // File each rule under its input flavor:
  forEach(rule => getDefault(rulesByInputFlavor, rule.source.inputFlavor, () => []).push(rule),
          rules);

  return {
    // Iterate over a DOM tree or subtree, building up a knowledgebase, a
    // data structure holding scores and annotations for interesting
    // elements. Return the knowledgebase.
    //
    // This is the "rank" portion of the rank-and-yank algorithm.
    score: function(tree) {
      const kb = knowledgebase();

      // Introduce the whole DOM into the KB as flavor 'dom' to get
      // things started:
      const nonterminals = [[{tree}, "dom"]]; // [[node, flavor], [node, flavor], ...]

      // While there are new facts, run the applicable rules over them to
      // generate even newer facts. Repeat until everything's fully
      // digested. Rules run in no particular guaranteed order.
      while (nonterminals.length) {
        const [inNode, inFlavor] = nonterminals.pop();
        for (const rule of getDefault(rulesByInputFlavor, inFlavor, () => [])) {
          const outFacts = resultsOf(rule, inNode, inFlavor, kb);
          for (const fact of outFacts) {
            const outNode = kb.nodeForElement(fact.element);

            // No matter whether or not this flavor has been
            // emitted before for this node, we multiply the score.
            // We want to be able to add rules that refine the
            // scoring of a node, without having to rewire the path
            // of flavors that winds through the ruleset.
            //
            // 1 score per Node is plenty. That simplifies our
            // data, our rankers, our flavor system (since we don't
            // need to represent score axes), and our engine. If
            // somebody wants more score axes, they can fake it
            // themselves with notes, thus paying only for what
            // they eat. (We can even provide functions that help
            // with that.) Most rulesets will probably be concerned
            // with scoring only 1 thing at a time anyway. So,
            // rankers return a score multiplier + 0 or more new
            // flavors with optional notes. Facts can never be
            // deleted from the KB by rankers (or order would start
            // to matter); after all, they're *facts*.
            outNode.score *= fact.score;

            // Add a new annotation to a node--but only if there
            // wasn't already one of the given flavor already
            // there; otherwise there's no point.
            //
            // You might argue that we might want to modify an
            // existing note here, but that would be a bad
            // idea. Notes of a given flavor should be
            // considered immutable once laid down. Otherwise, the
            // order of execution of same-flavored rules could
            // matter, hurting pluggability. Emit a new flavor and
            // a new note if you want to do that.
            //
            // Also, choosing not to add a new fact to nonterminals
            // when we're not adding a new flavor saves the work of
            // running the rules against it, which would be
            // entirely redundant and perform no new work (unless
            // the rankers were nondeterministic, but don't do
            // that).
            if (!outNode.flavors.has(fact.flavor)) {
              outNode.flavors.set(fact.flavor, fact.notes);
              kb.indexNodeByFlavor(outNode, fact.flavor); // TODO: better encapsulation rather than indexing explicitly
              nonterminals.push([outNode, fact.flavor]);
            }
          }
        }
      }
      return kb;
    }
  };
}


// Construct a container for storing and querying facts, where a fact has a
// flavor (used to dispatch further rules upon), a corresponding DOM element, a
// score, and some other arbitrary notes opaque to fathom.
function knowledgebase() {
  const nodesByFlavor = new Map(); // Map{'texty' -> [NodeA],
                                   //     'spiffy' -> [NodeA, NodeB]}
                                   // NodeA = {element: <someElement>,
                                   //
                                   //          // Global nodewide score. Add
                                   //          // custom ones with notes if
                                   //          // you want.
                                   //          score: 8,
                                   //
                                   //          // Flavors is a map of flavor names to notes:
                                   //          flavors: Map{'texty' -> {ownText: 'blah',
                                   //                                   someOtherNote: 'foo',
                                   //                                   someCustomScore: 10},
                                   //                       // This is an empty note:
                                   //                       'fluffy' -> undefined}}
  const nodesByElement = new Map();

  return {
    // Return the "node" (our own data structure that we control) that
    // corresponds to a given DOM element, creating one if necessary.
    nodeForElement: function(element) {
      return getDefault(nodesByElement,
                        element,
                        () => ({element,
                                score: 1,
                                flavors: new Map()}));
    },

    // Return the highest-scored node of the given flavor, undefined if
    // there is none.
    max: function(flavor) {
      const nodes = nodesByFlavor.get(flavor);
      // nodesOfFlavor() can leave an empty list behind for a flavor that
      // was only queried; max() over an empty list would throw.
      if (nodes === undefined || nodes.length === 0) {
        return undefined;
      }
      return max(nodes, node => node.score);
    },

    // Let the KB know that a new flavor has been added to an element.
    indexNodeByFlavor: function(node, flavor) {
      getDefault(nodesByFlavor, flavor, () => []).push(node);
    },

    // Return the list of nodes carrying the given flavor (empty if none).
    nodesOfFlavor: function(flavor) {
      return getDefault(nodesByFlavor, flavor, () => []);
    }
  };
}


// Apply a rule (as returned by a call to rule()) to a fact, and return the
// new facts that result.
function resultsOf(rule, node, flavor, kb) {
  // If more types of rule pop up someday, do fancier dispatching here.
  return rule.source.flavor === "flavor"
    ? resultsOfFlavorRule(rule, node, flavor)
    : resultsOfDomRule(rule, node, kb);
}


// Pull the DOM tree off the special property of the root "dom" fact, and query
// against it.
function* resultsOfDomRule(rule, specialDomNode, kb) {
  // Use the special "tree" property of the special starting node:
  const matches = specialDomNode.tree.querySelectorAll(rule.source.selector);

  for (let i = 0; i < matches.length; i++) { // matches is a NodeList, which doesn't conform to iterator protocol
    const element = matches[i];
    const newFacts = explicitFacts(rule.ranker(kb.nodeForElement(element)));
    for (const fact of newFacts) {
      if (fact.element === undefined) {
        fact.element = element;
      }
      if (fact.flavor === undefined) {
        throw new Error("Rankers of dom() rules must return a flavor in each fact. Otherwise, there is no way for that fact to be used later.");
      }
      yield fact;
    }
  }
}


// Run a flavor rule's ranker against a knowledgebase node and yield the
// resulting facts, filling in the element and flavor where omitted.
function* resultsOfFlavorRule(rule, node, flavor) {
  const newFacts = explicitFacts(rule.ranker(node));

  for (const fact of newFacts) {
    // If the ranker didn't specify a different element, assume it's
    // talking about the one we passed in:
    if (fact.element === undefined) {
      fact.element = node.element;
    }
    if (fact.flavor === undefined) {
      fact.flavor = flavor;
    }
    yield fact;
  }
}


// Take the possibly abbreviated output of a ranker function, and make it
// explicitly an iterable with a defined score.
//
// Rankers can return undefined, which means "no facts", a single fact, or an
// array of facts.
function* explicitFacts(rankerResult) {
  let facts;
  if (rankerResult === undefined) {
    facts = [];
  } else if (Array.isArray(rankerResult)) {
    facts = rankerResult;
  } else {
    facts = [rankerResult];
  }

  for (const fact of facts) {
    if (fact.score === undefined) {
      fact.score = 1;
    }
    yield fact;
  }
}


// TODO: For the moment, a lot of responsibility is on the rankers to return a
// pretty big data structure of up to 4 properties. This is a bit verbose for
// an arrow function (as I hope we can use most of the time) and the usual case
// will probably be returning just a score multiplier. Make that case more
// concise.

// TODO: It is likely that rankers should receive the notes of their input type
// as a 2nd arg, for brevity.


// Return a condition that uses a DOM selector to find its matches from the
// original DOM tree.
//
// For consistency, Nodes will still be delivered to the transformers, but
// they'll have empty flavors and score = 1.
//
// Condition constructors like dom() and flavor() build stupid, introspectable
// objects that the query engine can read. They don't actually do the query
// themselves. That way, the query planner can be smarter than them, figuring
// out which indices to use based on all of them. (We'll probably keep a heap
// by each dimension's score and a hash by flavor name, for starters.) Someday,
// fancy things like this may be possible: rule(and(tag('p'), klass('snork')),
// ...)
function dom(selector) {
  return {
    flavor: "dom",
    inputFlavor: "dom",
    selector
  };
}


// Return a condition that discriminates on nodes of the knowledgebase by flavor.
function flavor(inputFlavor) {
  return {
    flavor: "flavor",
    inputFlavor
  };
}


// Pair a condition (as built by dom() or flavor()) with the ranker function
// to run on each match.
function rule(source, ranker) {
  return {
    source,
    ranker
  };
}