1#!/usr/bin/env python3
2#
3# Wireshark - Network traffic analyzer
4# By Gerald Combs <gerald@wireshark.org>
5# Copyright 1998 Gerald Combs
6#
7# SPDX-License-Identifier: GPL-2.0-or-later
8'''Update the "manuf" file.
9
10Make-manuf creates a file containing ethernet OUIs and their company
11IDs. It merges the databases at IEEE with entries in our template file.
12Our file in turn contains entries from
13http://www.cavebear.com/archive/cavebear/Ethernet/Ethernet.txt along
14with our own.
15
16The script reads the comments at the top of "manuf.tmpl" and writes them
17to "manuf".  It then joins the manufacturer listing in "manuf.tmpl" with
18the listing in "oui.txt", "iab.txt", etc, with the entries in
19"manuf.tmpl" taking precedence.
20'''
21
22import codecs
23import csv
24import io
25import os
26import re
27import sys
28import urllib.request, urllib.error, urllib.parse
29
# PyICU is optional. Record whether it could be imported so later code
# can choose between ICU-based and plain-Python behavior.
# (Use the grapheme or segments module instead?)
try:
    import icu
except ImportError:
    have_icu = False
else:
    have_icu = True
37
def exit_msg(msg=None, status=1):
    '''Print an optional message plus the module docstring, then exit.

    msg, when given, is written to stderr followed by a blank line. The
    module docstring is always written afterwards, and the process then
    exits with the given status.
    '''
    err = sys.stderr
    if msg is not None:
        err.write(msg + '\n\n')
    err.write(__doc__ + '\n')
    raise SystemExit(status)
43
def open_url(url):
    '''Fetch a URL and return its decoded body and response headers.

    Returns a (body, headers) tuple. body is a str, decoded as UTF-8
    with undecodable bytes replaced, so it can be fed to csv.reader.
    headers is a plain dict built from the response's header object.

    Any failure while building the request, opening it or reading it is
    reported through exit_msg(), which terminates the script.
    '''
    req_headers = { 'User-Agent': 'Wireshark make-manuf' }
    try:
        req = urllib.request.Request(url, headers=req_headers)
        # Use the response as a context manager so the underlying
        # connection is closed even if reading or decoding fails.
        with urllib.request.urlopen(req) as response:
            body = response.read().decode('UTF-8', 'replace')
            response_d = dict(response.info())
    except Exception:
        exit_msg('Error opening ' + url)

    return (body, response_d)
59
# Regex alternation of business-entity suffixes and other filler words.
# The fragments are applied after punctuation has been removed.
# More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
general_terms = '|'.join((
    # A/S and A.S. but not "As" as in "Connect As".
    'a +s',
    # Also follows "Oy", which is covered below.
    'ab',
    'ag',
    'b ?v',
    'closed joint stock company',
    'co',
    'company',
    'corp',
    'corporation',
    # Follows "S.A.", which is covered separately below.
    'de c ?v',
    'gmbh',
    'holding',
    'inc',
    'incorporated',
    'jsc',
    'kg',
    # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
    'k k',
    'limited',
    'llc',
    'ltd',
    'n ?v',
    'oao',
    'of',
    'open joint stock company',
    'ooo',
    'oü',
    'oy',
    'oyj',
    'plc',
    'pty',
    'pvt',
    's ?a ?r ?l',
    's ?a',
    's ?p ?a',
    'sp ?k',
    's ?r ?l',
    'systems',
    'the',
    'zao',
    'z ?o ?o',
))
104
def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names.

    The short name is built by normalizing whitespace, title-casing
    all-caps names, dropping punctuation and the general terms listed in
    general_terms, removing all spaces and truncating to 8 characters.

    Returns the short name alone when it equals the whitespace-normalized
    input (ignoring case). Otherwise returns "short<TAB>long", where long
    is the normalized input with whitespace before periods removed and
    all-caps names converted to title case.

    Exits the script with status 1 if nothing is left of the name.
    '''
    # Maximum short-name length: code points by default, grapheme
    # clusters when PyICU is available.
    max_short_len = 8

    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Add exactly one space on each end. The general-terms regex below
    # needs a non-word character before and a space after each term, so
    # the padding lets it match at the start and end of the name.
    manuf = u' {} '.format(manuf)
    # Convert all caps to title case
    if manuf.isupper():
        manuf = manuf.title()
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(u"[\"',./:()]", ' ', manuf)
    # & isn't needed when standalone
    manuf = manuf.replace(" & ", " ")
    # Remove business types and other general terms ("the", "inc", "plc", etc.)
    plain_manuf = re.sub(r'\W(' + general_terms + r')(?= )', '', manuf, flags=re.IGNORECASE)
    # ...but make sure we don't remove everything.
    if plain_manuf.strip(' '):
        manuf = plain_manuf
    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    if not manuf:
        sys.stderr.write('Manufacturer "{}" shortened to nothing.\n'.format(orig_manuf))
        sys.exit(1)

    # Truncate names to a reasonable length, say, 8 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 8 bytes. It might also be less than 8 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717
    #
    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
    # to 'Savroni̇', which is 7 visible characters, 8 code points,
    # and 9 bytes.

    # Truncate by code points
    trunc_len = max_short_len

    if have_icu:
        # Truncate by grapheme clusters: iterating the break iterator
        # yields the boundary after each cluster, so the last of the
        # first max_short_len boundaries is where to cut.
        # NOTE(review): if ICU reports UTF-16 offsets these would differ
        # from Python code point indexes for characters outside the
        # BMP -- confirm.
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)[:max_short_len]
        trunc_len = bounds[-1]

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return manuf

    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', orig_manuf)
    # If company is all caps, convert to mixed case (so it doesn't look
    # like we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return u'{}\t{}'.format(manuf, mixed_manuf)
169
def prefix_to_oui(prefix):
    '''Format a hex assignment prefix as a colon-separated address.

    A 6-digit (24-bit) prefix becomes "XX:XX:XX" with no mask. Any other
    length is right-padded with zeros to 12 digits and suffixed with
    "/<bits>", counting 4 bits per hex digit of the original prefix.
    '''
    bit_count = 4 * len(prefix)
    needs_mask = bit_count != 24

    digits = prefix.ljust(12, '0') if needs_mask else prefix
    # Pair up adjacent digits; a trailing unpaired digit is dropped.
    octets = [digits[pos:pos + 2] for pos in range(0, len(digits) - 1, 2)]
    address = ':'.join(octets)

    if needs_mask:
        return '{}/{:d}'.format(address, bit_count)
    # 24-bit OUI assignment, no mask
    return address
181
def main():
    '''Regenerate the "manuf" file from "manuf.tmpl" and the IEEE registries.

    Reads ../manuf.tmpl (relative to this script). Lines before the first
    "XX:XX:XX name" entry are kept as the output header; entry lines
    populate the OUI table. Each IEEE CSV registry is then downloaded and
    every assignment the template does not already define is added, with
    its name shortened by shorten(). The sorted table is written to
    ../manuf and per-registry statistics are printed to stdout.

    Exits through exit_msg() if a file cannot be opened, if a registry
    has fewer rows than its 'min_entries', or if the overall count is
    below min_total.
    '''
    this_dir = os.path.dirname(__file__)
    template_path = os.path.join(this_dir, '..', 'manuf.tmpl')
    manuf_path = os.path.join(this_dir, '..', 'manuf')
    header_l = []
    in_header = True

    ieee_d = {
        'OUI':   { 'url': "http://standards-oui.ieee.org/oui/oui.csv", 'min_entries': 1000 },
        'CID':   { 'url': "http://standards-oui.ieee.org/cid/cid.csv", 'min_entries': 75 },
        'IAB':   { 'url': "http://standards-oui.ieee.org/iab/iab.csv", 'min_entries': 1000 },
        'OUI28': { 'url': "http://standards-oui.ieee.org/oui28/mam.csv", 'min_entries': 1000 },
        'OUI36': { 'url': "http://standards-oui.ieee.org/oui36/oui36.csv", 'min_entries': 1000 },
    }
    oui_d = {}
    hp = "[0-9a-fA-F]{2}"
    manuf_re = re.compile(r'^({}:{}:{})\s+(\S.*)$'.format(hp, hp, hp))

    min_total = 35000  # 35830 as of 2018-09-05
    tmpl_added  = 0
    total_added = 0

    # Collect the header and populate the OUI list with our entries.

    try:
        tmpl_fd = io.open(template_path, 'r', encoding='UTF-8')
    except OSError:
        exit_msg("Couldn't open template file for reading ({}) ".format(template_path))
    with tmpl_fd:
        for tmpl_line in tmpl_fd:
            tmpl_line = tmpl_line.strip()
            m = manuf_re.match(tmpl_line)
            if m:
                in_header = False
                oui = m.group(1).upper()
                oui_d[oui] = m.group(2)
                tmpl_added += 1
            elif in_header:
                header_l.append(tmpl_line)
            # Non-matching lines after the first entry are dropped.

    total_added += tmpl_added

    # Add IEEE entries from each of their databases
    ieee_db_l = sorted(ieee_d)

    for db in ieee_db_l:
        db_info = ieee_d[db]
        db_url = db_info['url']
        db_info['skipped'] = 0
        db_info['added'] = 0
        db_info['total'] = 0
        print('Merging {} data from {}'.format(db, db_url))
        (body, response_d) = open_url(db_url)
        ieee_csv = csv.reader(body.splitlines())
        db_info['last-modified'] = response_d['Last-Modified']
        db_info['length'] = response_d['Content-Length']

        # Pop the title row. The default keeps an empty body from raising
        # StopIteration, so it falls through to the "Too few" check.
        next(ieee_csv, None)
        for ieee_row in ieee_csv:
            #Registry,Assignment,Organization Name,Organization Address
            #IAB,0050C2DD6,Transas Marine Limited,Datavagen 37 Askim Vastra Gotaland SE 436 32
            if len(ieee_row) < 3:
                # Blank or truncated row: nothing to merge.
                continue
            oui = prefix_to_oui(ieee_row[1].upper())
            manuf = ieee_row[2].strip()
            if oui in oui_d:
                # The template entry wins. Call it a duplicate when both
                # names contain the same runs of ASCII letters, comparing
                # against the last tab-separated field of our entry.
                action = 'Skipping'
                manuf_stripped = re.findall('[a-z]+', manuf.lower())
                tmpl_manuf_stripped = re.findall('[a-z]+', oui_d[oui].split('\t')[-1].strip().lower())
                if manuf_stripped == tmpl_manuf_stripped:
                    action = 'Skipping duplicate'

                print(u'{} - {} IEEE "{}" in favor of "{}"'.format(oui, action, manuf, oui_d[oui]))
                db_info['skipped'] += 1
            else:
                oui_d[oui] = shorten(manuf)
                db_info['added'] += 1
            db_info['total'] += 1

        if db_info['total'] < db_info['min_entries']:
            exit_msg("Too few {} entries ({})".format(db, db_info['total']))
        # NOTE(review): this adds every row seen, including rows skipped
        # in favor of the template, so "Total added" can exceed the number
        # of entries actually added -- confirm min_total expects this.
        total_added += db_info['total']

    if total_added < min_total:
        exit_msg("Too few total entries ({})".format(total_added))

    # Write the output file.

    try:
        manuf_fd = io.open(manuf_path, 'w', encoding='UTF-8')
    except OSError:
        exit_msg("Couldn't open manuf file for writing ({}) ".format(manuf_path))

    with manuf_fd:
        manuf_fd.write(u"# This file was generated by running ./tools/make-manuf.py.\n")
        manuf_fd.write(u"# Don't change it directly, change manuf.tmpl instead.\n#\n")
        # The header is joined without a trailing newline, so the first
        # registry comment follows the last header line directly;
        # presumably the template header ends with a blank line.
        manuf_fd.write('\n'.join(header_l))

        db_comment = (
            u'# {url}:\n'
            u'#   Content-Length: {length}\n'
            u'#   Last-Modified: {last-modified}\n'
            u'\n'
        )
        for db in ieee_db_l:
            manuf_fd.write(db_comment.format(**ieee_d[db]))

        for oui in sorted(oui_d):
            manuf_fd.write(u'{}\t{}\n'.format(oui, oui_d[oui]))

    print('{:<20}: {}'.format('Original entries', tmpl_added))
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' added', ieee_d[db]['added']))
    print('{:<20}: {}'.format('Total added', total_added))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' total', ieee_d[db]['total']))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' skipped', ieee_d[db]['skipped']))
309
310if __name__ == '__main__':
311    main()
312