#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file.

"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
representation of an unordered word list (dictionary).

https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton

This python program converts a list of strings to a byte array in C++. It
fetches strings and return values from a gperf file and generates a C++ file
with a byte array representing a graph that can be used as a memory-efficient
replacement for the perfect hash table.

The input strings must consist of printable 7-bit ASCII characters or UTF-8
multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one-digit integers.

In this program a DAFSA is a diamond-shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
a label and each word is represented by the labels in one path from
the source node to the sink node.

The following python representation is used for nodes:

  Source node: [ children ]
  Internal node: (label, [ children ])
  Sink node: None

The graph is first compressed by prefixes like a trie. In the next step
suffixes are compressed so that the graph gets diamond-shaped. Finally
one-to-one linked nodes are replaced by nodes with the labels joined.

The order of the operations is crucial since lookups will be performed
starting from the source with no backtracking. Thus a node must have at
most one child with a label starting with the same character. The output
is also arranged so that all jumps are to increasing addresses, thus forward
in memory.

The generated output has suffix-free decoding, so that the leading bits of a
link (a reference to a child node) indicate whether it has a size of one, two
or three bytes and whether it is the last outgoing link of the current node.
A node label is terminated by a byte with the leading bit set. For example,
the label "ab" followed by an offset list is stored as the bytes 0x61 0xE2.

The generated byte array can be described by the following BNF:

<byte> ::= < 8-bit value in range [0x00-0xFF] >

<char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >

<offset1> ::= < byte in range [0x00-0x3F] >
<offset2> ::= < byte in range [0x40-0x5F] >
<offset3> ::= < byte in range [0x60-0x7F] >

<end_offset1> ::= < byte in range [0x80-0xBF] >
<end_offset2> ::= < byte in range [0xC0-0xDF] >
<end_offset3> ::= < byte in range [0xE0-0xFF] >

<prefix> ::= <char>

<label> ::= <end_char>
          | <char> <label>

<end_label> ::= <return_value>
              | <char> <end_label>

<offset> ::= <offset1>
           | <offset2> <byte>
           | <offset3> <byte> <byte>

<end_offset> ::= <end_offset1>
               | <end_offset2> <byte>
               | <end_offset3> <byte> <byte>

<offsets> ::= <end_offset>
            | <offset> <offsets>

<source> ::= <offsets>

<node> ::= <label> <offsets>
         | <prefix> <node>
         | <end_label>

<graph> ::= <source>
          | <graph> <node>

<version> ::= <empty>            # The DAFSA was generated in ASCII mode.
            | < byte value 0x01 >  # The DAFSA was generated in UTF-8 mode.

<dafsa> ::= <graph> <version>

Decoding:

<char> -> character
<end_char> & 0x7F -> character
<return_value> & 0x0F -> integer
<offset1> & 0x3F -> integer
((<offset2> & 0x1F) << 8) + <byte> -> integer
((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer

end_offset1, end_offset2 and end_offset3 are decoded in the same way as
offset1, offset2 and offset3 respectively.

The first offset in a list of offsets is the distance in bytes between the
offset itself and the first child node. Subsequent offsets are the distance
between the previous child node and the next child node. Thus each offset
links a node to a child node. The distance is always counted between start
addresses, i.e. the first byte of the decoded offset or the first byte of the
child node.
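
As an illustration, a decoder could walk such an offset list with a helper
along these lines (a hypothetical sketch, not part of this program):

  def children(dafsa, pos):
    """Yields the start position of every child node linked from the
    offset list starting at position pos."""
    base = pos                  # the first distance is relative to the offset itself
    while True:
      byte = dafsa[pos]
      if byte & 0x40 == 0:      # <offset1> / <end_offset1>
        distance, width = byte & 0x3F, 1
      elif byte & 0x20 == 0:    # <offset2> / <end_offset2>
        distance, width = ((byte & 0x1F) << 8) | dafsa[pos + 1], 2
      else:                     # <offset3> / <end_offset3>
        distance = ((byte & 0x1F) << 16) | (dafsa[pos + 1] << 8) | dafsa[pos + 2]
        width = 3
      base += distance          # later distances chain from the previous child
      yield base
      if byte & 0x80:           # the high bit marks the last link of this node
        return
      pos += width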

Transcoding of UTF-8 multibyte sequences:

The original DAFSA format was limited to 7-bit printable ASCII characters in
the range [0x20-0x7F], but has been extended to allow UTF-8 multibyte
sequences. By transcoding such characters the new format preserves
compatibility with old parsers, so that a DAFSA in the extended format can be
used by an old parser without false positives, although strings containing
transcoded characters will never match. Since the format is extended rather
than changed, a parser supporting the new format will automatically support
data generated in the old format.

Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.

2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn

3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn

4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
                00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
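
The transcoding amounts to XOR'ing the UTF-8 leading byte with 0x80 and each
continuation byte with 0xC0, as in this minimal sketch (a hypothetical helper,
not part of this program):

  def transcode(utf8_bytes):
    out = bytearray([0x1F, utf8_bytes[0] ^ 0x80])  # start byte + shifted leading byte
    out.extend(b ^ 0xC0 for b in utf8_bytes[1:])   # shifted continuation bytes
    return bytes(out)

  # transcode(u'\u00e9'.encode('utf-8')) == b'\x1fCi' (0xC3 0xA9 -> 0x1F 0x43 0x69)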

Example 1:

%%
aa, 1
a, 2
%%

The input is first parsed to a list of words:
["aa1", "a2"]

A fully expanded graph is created from the words:
source = [node1, node4]
node1 = ("a", [node2])
node2 = ("a", [node3])
node3 = ("\x01", [sink])
node4 = ("a", [node5])
node5 = ("\x02", [sink])
sink = None

Compression results in the following graph:
source = [node1]
node1 = ("a", [node2, node3])
node2 = ("\x02", [sink])
node3 = ("a\x01", [sink])
sink = None

A C++ representation of the compressed graph is generated:

const unsigned char dafsa[7] = {
  0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
};

The bytes in the generated array have the following meaning:

 0: 0x81 <end_offset1>  child at position 0 + (0x81 & 0x3F) -> jump to 1

 1: 0xE1 <end_char>     label character (0xE1 & 0x7F) -> match "a"
 2: 0x02 <offset1>      child at position 2 + (0x02 & 0x3F) -> jump to 4

 3: 0x81 <end_offset1>  child at position 4 + (0x81 & 0x3F) -> jump to 5
 4: 0x82 <return_value> 0x82 & 0x0F -> return 2

 5: 0x61 <char>         label character 0x61 -> match "a"
 6: 0x81 <return_value> 0x81 & 0x0F -> return 1

Example 2:

%%
aa, 1
bbb, 2
baa, 1
%%

The input is first parsed to a list of words:
["aa1", "bbb2", "baa1"]

Compression results in the following graph:
source = [node1, node2]
node1 = ("b", [node2, node3])
node2 = ("aa\x01", [sink])
node3 = ("bb\x02", [sink])
sink = None

A C++ representation of the compressed graph is generated:

const unsigned char dafsa[11] = {
  0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
};

The bytes in the generated array have the following meaning:

 0: 0x02 <offset1>      child at position 0 + (0x02 & 0x3F) -> jump to 2
 1: 0x83 <end_offset1>  child at position 2 + (0x83 & 0x3F) -> jump to 5

 2: 0xE2 <end_char>     label character (0xE2 & 0x7F) -> match "b"
 3: 0x02 <offset1>      child at position 3 + (0x02 & 0x3F) -> jump to 5
 4: 0x83 <end_offset1>  child at position 5 + (0x83 & 0x3F) -> jump to 8

 5: 0x61 <char>         label character 0x61 -> match "a"
 6: 0x61 <char>         label character 0x61 -> match "a"
 7: 0x81 <return_value> 0x81 & 0x0F -> return 1

 8: 0x62 <char>         label character 0x62 -> match "b"
 9: 0x62 <char>         label character 0x62 -> match "b"
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
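
For illustration, a complete lookup in such an array could be sketched as
follows, using the children() helper from the offsets section above (again a
hypothetical example, not part of this program; it ignores the trailing
version byte and UTF-8 transcoding):

  def lookup(dafsa, word):
    """Returns the value stored for word (a bytes object), or None."""
    pos = 0                             # the source node is just an offset list
    while True:
      for child in children(dafsa, pos):
        p, rest, matched = child, word, True
        while True:
          byte = dafsa[p]
          if 0x80 <= byte <= 0x8F:      # <return_value> ends a stored word
            if not rest:
              return byte & 0x0F
            matched = False             # stored word is shorter than the input
            break
          if not rest or rest[0] != byte & 0x7F:
            matched = False             # label character mismatch
            break
          rest = rest[1:]
          p += 1
          if byte & 0x80:               # <end_char>: an offset list follows
            break
        if matched:
          pos, word = p, rest           # descend into the matched child
          break
      else:
        return None                     # no child matched

With the Example 1 array, lookup(dafsa, b'a') returns 2 and lookup(dafsa, b'aa')
returns 1; with the Example 2 array, lookup(dafsa, b'baa') returns 1 and
lookup(dafsa, b'ab') returns None.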
"""

import sys
import os.path
import time
import hashlib
import json

class InputError(Exception):
  """Exception raised for errors in the input file."""

# Length in bytes of a character starting at a given byte value. A value of 0
# marks bytes that cannot start a character (control characters, UTF-8
# continuation bytes and invalid leading bytes).
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x0F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x10-0x1F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x20-0x2F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x30-0x3F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x40-0x4F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x50-0x5F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x60-0x6F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x70-0x7F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x80-0x8F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x90-0x9F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xA0-0xAF
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xB0-0xBF
                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xC0-0xCF
                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xD0-0xDF
                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF

def to_bytes(n):
  """Converts an integer value to a bytes object."""
  return bytes(bytearray((n,)))

def to_dafsa(words, utf_mode):
  """Generates a DAFSA from a word list and returns the source node.

  Each word is split into characters so that each character is represented by
  a unique node. It is assumed the word list is not empty.
  """
  if not words:
    raise InputError('The domain list must not be empty')
  def to_nodes(word, multibyte_length):
    """Split words into characters"""
    byte = ord(word[:1])
    if multibyte_length:
      # Consume next byte in multibyte sequence.
      if byte & 0xC0 != 0x80:
        raise InputError('Invalid UTF-8 multibyte sequence')
      return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
    char_length = char_length_table[byte]
    if char_length == 1:
      # 7-bit printable ASCII.
      if len(word) == 1:
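        # The single remaining character is the word's return value (one hex
        # digit); keep its numeric value so it is encoded as a <return_value>.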
        return to_bytes(int(word[:1], 16) & 0x0F), [None]
      return word[:1], [to_nodes(word[1:], 0)]
    elif char_length > 1:
      # Leading byte in multibyte sequence.
      if not utf_mode:
        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
      if len(word) <= char_length:
        raise InputError('Unterminated UTF-8 multibyte sequence')
      return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
    # Unexpected character.
    raise InputError('Domain names must be printable ASCII or UTF-8')

  return [to_nodes(word, 0) for word in words]

def to_words(node):
  """Generates a word list from all paths starting from an internal node."""
  if not node:
    return [b'']
  return [(node[0] + word) for child in node[1] for word in to_words(child)]


def reverse(dafsa):
  """Generates a new DAFSA that is reversed, so that the old sink node becomes
  the new source node.
  """
  sink = []
  nodemap = {}

  def dfs(node, parent):
    """Creates reverse nodes.

    A new reverse node will be created for each old node. The new node will
    get a reversed label and the parents of the old node as children.
    """
    if not node:
      sink.append(parent)
    elif id(node) not in nodemap:
      nodemap[id(node)] = (node[0][::-1], [parent])
      for child in node[1]:
        dfs(child, nodemap[id(node)])
    else:
      nodemap[id(node)][1].append(parent)

  for node in dafsa:
    dfs(node, None)
  return sink


def join_labels(dafsa):
  """Generates a new DAFSA where internal nodes are merged if there is a
  one-to-one connection.
  """
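  # Give the sink (None) an artificial parent count of 2 so that a node whose
  # only child is the sink is never merged into it.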
  parentcount = {id(None): 2}
  nodemap = {id(None): None}

  def count_parents(node):
    """Count incoming references"""
    if id(node) in parentcount:
      parentcount[id(node)] += 1
    else:
      parentcount[id(node)] = 1
      for child in node[1]:
        count_parents(child)

  def join(node):
    """Create new nodes"""
    if id(node) not in nodemap:
      children = [join(child) for child in node[1]]
      if len(children) == 1 and parentcount[id(node[1][0])] == 1:
        child = children[0]
        nodemap[id(node)] = (node[0] + child[0], child[1])
      else:
        nodemap[id(node)] = (node[0], children)
    return nodemap[id(node)]

  for node in dafsa:
    count_parents(node)
  return [join(node) for node in dafsa]


def join_suffixes(dafsa):
  """Generates a new DAFSA where nodes that represent the same word lists
  towards the sink are merged.
  """
  nodemap = {frozenset((b'',)): None}

  def join(node):
    """Returns a matching node. A new node is created if no matching node
    exists. The graph is accessed in DFS order.
    """
    suffixes = frozenset(to_words(node))
    if suffixes not in nodemap:
      nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
    return nodemap[suffixes]

  return [join(node) for node in dafsa]


def top_sort(dafsa):
  """Generates list of nodes in topological sort order."""
  incoming = {}

  def count_incoming(node):
    """Counts incoming references."""
    if node:
      if id(node) not in incoming:
        incoming[id(node)] = 1
        for child in node[1]:
          count_incoming(child)
      else:
        incoming[id(node)] += 1

  for node in dafsa:
    count_incoming(node)

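  # Every top-level node was counted once for its reference from the dafsa
  # list itself; remove that reference so that nodes only reachable from the
  # source start out with an incoming count of zero.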
  for node in dafsa:
    incoming[id(node)] -= 1

  waiting = [node for node in dafsa if incoming[id(node)] == 0]
  nodes = []

  while waiting:
    node = waiting.pop()
    assert incoming[id(node)] == 0
    nodes.append(node)
    for child in node[1]:
      if child:
        incoming[id(child)] -= 1
        if incoming[id(child)] == 0:
          waiting.append(child)
  return nodes


def encode_links(children, offsets, current):
  """Encodes a list of children as one, two or three byte offsets."""
  if not children[0]:
    # This is an <end_label> node and no links follow such nodes
    assert len(children) == 1
    return []
  guess = 3 * len(children)
  assert children
  children = sorted(children, key=lambda x: -offsets[id(x)])
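  # The encoded size of the links affects the distances, which in turn affect
  # the encoded size. Start from the worst case (three bytes per link) and
  # re-encode until the guessed size matches the actual size.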
  while True:
    offset = current + guess
    buf = []
    for child in children:
      last = len(buf)
      distance = offset - offsets[id(child)]
      assert distance > 0 and distance < (1 << 21)

      if distance < (1 << 6):
        # A 6-bit offset: "s0xxxxxx"
        buf.append(distance)
      elif distance < (1 << 13):
        # A 13-bit offset: "s10xxxxxxxxxxxxx"
        buf.append(0x40 | (distance >> 8))
        buf.append(distance & 0xFF)
      else:
        # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
        buf.append(0x60 | (distance >> 16))
        buf.append((distance >> 8) & 0xFF)
        buf.append(distance & 0xFF)
      # The distance in the first link is relative to the following record.
      # Distances in the other links are relative to the previous link.
      offset -= distance
    if len(buf) == guess:
      break
    guess = len(buf)
  # Set most significant bit to mark end of links in this node.
  buf[last] |= (1 << 7)
  buf.reverse()
  return buf


def encode_prefix(label):
  """Encodes a node label as a list of bytes without a trailing high byte.

  This method encodes a node if there is exactly one child and the
  child follows immediately after so that no jump is needed. This label
  will then be a prefix to the label in the child node.
  """
  assert label
  return [c for c in bytearray(reversed(label))]


def encode_label(label):
  """Encodes a node label as a list of bytes with a trailing high byte >= 0x80.
  """
  buf = encode_prefix(label)
  # Set most significant bit to mark end of label in this node.
  buf[0] |= (1 << 7)
  return buf


def encode(dafsa, utf_mode):
  """Encodes a DAFSA to a list of bytes"""
  output = []
  offsets = {}

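  # Nodes are emitted children-first (reverse topological order) so that every
  # child's offset is known when its parents are encoded; the buffer is
  # reversed at the end so that all links point forward in memory. A node
  # whose single child was emitted immediately before it needs no link: its
  # label simply becomes a <prefix> of the child.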
  for node in reversed(top_sort(dafsa)):
    if (len(node[1]) == 1 and node[1][0] and
        (offsets[id(node[1][0])] == len(output))):
      output.extend(encode_prefix(node[0]))
    else:
      output.extend(encode_links(node[1], offsets, len(output)))
      output.extend(encode_label(node[0]))
    offsets[id(node)] = len(output)

  output.extend(encode_links(dafsa, offsets, len(output)))
  output.reverse()
  if utf_mode:
    output.append(0x01)
  return output


def to_cxx(data, codecs):
  """Generates C++ code from a list of encoded bytes."""
  text = b'/* This file has been generated by hsts-make-dafsa. DO NOT EDIT!\n\n'
  text += b'The byte array encodes HSTS domains and flags. See hsts-make-dafsa source for'
  text += b' documentation.'
  text += b'*/\n\n'
  text += b'static const unsigned char kDafsa['
  text += bytes(str(len(data)), **codecs)
  text += b'] = {\n'
  for i in range(0, len(data), 12):
    text += b'  '
    text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
    text += b',\n'
  text += b'};\n'
  return text

def sha1_file(name):
  """Returns the SHA-1 checksum of the named file as a hex string."""
  sha1 = hashlib.sha1()
  with open(name, 'rb') as f:
    while True:
      data = f.read(65536)
      if not data:
        break
      sha1.update(data)
  return sha1.hexdigest()

def to_cxx_plus(data, codecs):
  """Generates C++ code from encoded DAFSA bytes plus some variable assignments as needed by libhsts"""
  text = to_cxx(data, codecs)
  text += b'static time_t _hsts_file_time = %d;\n' % os.stat(hsts_input_file).st_mtime
  text += b'static int _hsts_ndomains = %d;\n' % hsts_ndomains
  text += b'static const char _hsts_sha1_checksum[] = "%s";\n' % bytes(sha1_file(hsts_input_file), **codecs)
  text += b'static const char _hsts_filename[] = "%s";\n' % bytes(hsts_input_file, **codecs)
  return text

def words_to_whatever(words, converter, utf_mode, codecs):
  """Builds a DAFSA from a word list and passes the encoded bytes to converter"""
  dafsa = to_dafsa(words, utf_mode)
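  # reverse + join_suffixes merges common prefixes (they are suffixes of the
  # reversed graph); the second join_suffixes pass merges common suffixes;
  # join_labels finally concatenates chains of one-to-one linked nodes.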
  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    dafsa = fun(dafsa)
  return converter(encode(dafsa, utf_mode), codecs)


def words_to_cxx(words, utf_mode, codecs):
  """Generates C++ code from a word list"""
  return words_to_whatever(words, to_cxx, utf_mode, codecs)

def words_to_cxx_plus(words, utf_mode, codecs):
  """Generates C++ code from a word list plus some variable assignments as needed by libhsts"""
  return words_to_whatever(words, to_cxx_plus, utf_mode, codecs)

def words_to_binary(words, utf_mode, codecs):
  """Generates binary DAFSA data from a word list"""
  return b'.DAFSA@HSTS_0  \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)


def parse_hsts(infile, utf_mode, codecs):
  """Parses the HSTS preload file and extracts domains and their flag values"""
  HSTS_FLAG_INCLUDE_SUBDIRS = (1<<0)

  global hsts_ndomains

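  # The input is expected to be the Chromium HSTS preload list in JSON form,
  # e.g. { "entries": [ { "name": "example.com", "include_subdomains": true,
  # ... }, ... ] }; only "name" and "include_subdomains" are used here.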
  data = json.load(infile)["entries"]

  hsts = {}
  hsts_ndomains = 0

  for entry in data:
    flags = 0

    if "include_subdomains" in entry:
      if entry['include_subdomains'] == True:
        flags = HSTS_FLAG_INCLUDE_SUBDIRS

    domain = bytes(entry['name'].strip(), **codecs)
#    utf8 = bytes(domain.decode('idna').encode('utf-8'))

#    if utf8 in hsts:
#      print('Found %s/%X (now %X)' % utf8, hsts[utf8], flags)
#      continue

#    if utf_mode and domain != utf8:
#      print('%s %s %X' % (domain, utf8, flags))
#      hsts[utf8] = flags

    hsts[domain] = flags
    hsts_ndomains += 1

  return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(hsts.items())]


def usage():
  """Prints the usage"""
  print('usage: %s [options] infile outfile' % sys.argv[0])
  print('  --output-format=cxx     Write DAFSA as C/C++ code (default)')
  print('  --output-format=cxx+    Write DAFSA as C/C++ code plus statistical assignments')
  print('  --output-format=binary  Write DAFSA binary data')
  print('  --encoding=ascii        7-bit ASCII mode')
  print('  --encoding=utf-8        UTF-8 mode (default)')
  sys.exit(1)


def main():
  """Convert HSTS file into C or binary DAFSA file"""
  if len(sys.argv) < 3:
    usage()

  converter = words_to_cxx
  parser = parse_hsts
  utf_mode = True

  codecs = dict()
  if sys.version_info.major > 2:
    codecs['encoding'] = 'utf-8'

  for arg in sys.argv[1:-2]:
    if arg.startswith('--output-format='):
      value = arg[16:].lower()
      if value == 'binary':
        converter = words_to_binary
      elif value == 'cxx':
        converter = words_to_cxx
      elif value == 'cxx+':
        converter = words_to_cxx_plus
      else:
        print("Unknown output format '%s'" % value)
        return 1
    elif arg.startswith('--encoding='):
      value = arg[11:].lower()
      if value == 'ascii':
        utf_mode = False
      elif value == 'utf-8':
        utf_mode = True
      else:
        print("Unknown encoding '%s'" % value)
        return 1
    else:
      usage()

  if sys.argv[-2] == '-':
    with open(sys.argv[-1], 'wb') as outfile:
      outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
  else:
    # Record some statistical data for --output-format=cxx+.
    global hsts_input_file, hsts_ndomains

    hsts_input_file = sys.argv[-2]
    hsts_ndomains = 0

    with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile:
      outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))

  return 0


if __name__ == '__main__':
  sys.exit(main())