#!/usr/bin/env python3
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
'''Update the "manuf" file.

Make-manuf creates a file containing ethernet OUIs and their company
IDs. It merges the databases at IEEE with entries in our template file.
Our file in turn contains entries from
http://www.cavebear.com/archive/cavebear/Ethernet/Ethernet.txt along
with our own.

The script reads the comments at the top of "manuf.tmpl" and writes them
to "manuf". It then joins the manufacturer listing in "manuf.tmpl" with
the listing in "oui.txt", "iab.txt", etc, with the entries in
"manuf.tmpl" taking precedence.
'''

import codecs
import csv
import io
import os
import re
import sys
import urllib.request
import urllib.error
import urllib.parse

have_icu = False
try:
    # Use the grapheme or segments module instead?
    import icu
    have_icu = True
except ImportError:
    pass


def exit_msg(msg=None, status=1):
    '''Print an optional message plus the module usage text, then exit.

    msg is written to stderr (followed by a blank line) when it is not
    None. The module docstring is always written to stderr. The process
    then exits with the given status.
    '''
    if msg is not None:
        sys.stderr.write(msg + '\n\n')
    sys.stderr.write(__doc__ + '\n')
    sys.exit(status)


def open_url(url):
    '''Open a URL.

    Returns a tuple containing the body and a dict of the response
    headers. The body is decoded as UTF-8 (undecodable bytes are
    replaced) so that it is a str, which is what csv.reader expects.

    Exits via exit_msg() if the URL cannot be fetched.
    '''
    req_headers = {'User-Agent': 'Wireshark make-manuf'}
    try:
        req = urllib.request.Request(url, headers=req_headers)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as response:
            body = response.read().decode('UTF-8', 'replace')
            headers = dict(response.info())
    except Exception as err:
        # Report the underlying cause instead of discarding it.
        exit_msg('Error opening ' + url + ': ' + str(err))

    return (body, headers)


# These are applied after punctuation has been removed.
# More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
general_terms = '|'.join([
    'a +s',  # A/S and A.S. but not "As" as in "Connect As".
    'ab',  # Also follows "Oy", which is covered below.
    'ag',
    'b ?v',
    'closed joint stock company',
    'co',
    'company',
    'corp',
    'corporation',
    'de c ?v',  # Follows "S.A.", which is covered separately below.
    'gmbh',
    'holding',
    'inc',
    'incorporated',
    'jsc',
    'kg',
    'k k',  # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
    'limited',
    'llc',
    'ltd',
    'n ?v',
    'oao',
    'of',
    'open joint stock company',
    'ooo',
    'oü',
    'oy',
    'oyj',
    'plc',
    'pty',
    'pvt',
    's ?a ?r ?l',
    's ?a',
    's ?p ?a',
    'sp ?k',
    's ?r ?l',
    'systems',
    'the',
    'zao',
    'z ?o ?o'
    ])

# Compiled once: a non-word character, one of the general terms, and a
# following space (not consumed, so adjacent terms can each match).
_general_terms_re = re.compile(r'\W(' + general_terms + r')(?= )', re.IGNORECASE)


def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names.

    Returns either the short name alone (when it matches the original
    name apart from case), or "<short name>\\t<mixed-case long name>".

    Exits with status 1 if the name shortens to an empty string.
    '''
    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Add exactly one space on each end.
    # XXX This appears to be for the re.sub below.
    manuf = u' {} '.format(manuf)
    # Convert all caps to title case
    if manuf.isupper():
        manuf = manuf.title()
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(u"[\"',./:()]", ' ', manuf)
    # & isn't needed when standalone
    manuf = manuf.replace(" & ", " ")
    # Remove business types and other general terms ("the", "inc", "plc", etc.)
    plain_manuf = _general_terms_re.sub('', manuf)
    # ...but make sure we don't remove everything.
    if not all(s == ' ' for s in plain_manuf):
        manuf = plain_manuf
    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    if len(manuf) < 1:
        sys.stderr.write('Manufacturer "{}" shortened to nothing.\n'.format(orig_manuf))
        sys.exit(1)

    # Truncate names to a reasonable length, say, 8 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 8 bytes. It might also be less than 8 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717
    #
    # In our case plain Python truncates 'Savroni̇k Elektroni̇k'
    # to 'Savroni̇', which is 7 visible characters, 8 code points,
    # and 9 bytes.

    # Truncate by code points
    trunc_len = 8

    if have_icu:
        # Truncate by grapheme clusters: keep at most trunc_len cluster
        # boundaries and cut at the last one kept.
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)[:trunc_len]
        trunc_len = bounds[-1]

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return manuf

    mixed_manuf = orig_manuf
    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', mixed_manuf)
    # If company is all caps, convert to mixed case (so it doesn't look
    # like we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return u'{}\t{}'.format(manuf, mixed_manuf)


def prefix_to_oui(prefix):
    '''Convert a string of hex digits to colon-separated OUI notation.

    A 6-digit (24-bit) prefix is returned as "XX:XX:XX". Any other
    length is right-padded with zeros to 12 digits and returned with a
    bit-length mask, e.g. "XX:XX:XX:XX:X0:00/36".
    '''
    # Each hex digit carries 4 bits.
    pfx_len = len(prefix) * 4

    if pfx_len == 24:
        # 24-bit OUI assignment, no mask
        return ':'.join(hi + lo for hi, lo in zip(prefix[0::2], prefix[1::2]))

    # Other lengths which require a mask.
    oui = prefix.ljust(12, '0')
    oui = ':'.join(hi + lo for hi, lo in zip(oui[0::2], oui[1::2]))
    return '{}/{:d}'.format(oui, pfx_len)


def main():
    '''Regenerate "manuf" from "manuf.tmpl" and the IEEE registries.

    Reads the template next to this script's parent directory, downloads
    each IEEE CSV registry, merges them (template entries win), writes
    the result to "manuf", and prints per-registry statistics. Exits via
    exit_msg() if a file cannot be opened or too few entries are found.
    '''
    this_dir = os.path.dirname(__file__)
    template_path = os.path.join(this_dir, '..', 'manuf.tmpl')
    manuf_path = os.path.join(this_dir, '..', 'manuf')
    header_l = []
    in_header = True

    ieee_d = {
        'OUI':   { 'url': "http://standards-oui.ieee.org/oui/oui.csv", 'min_entries': 1000 },
        'CID':   { 'url': "http://standards-oui.ieee.org/cid/cid.csv", 'min_entries': 75 },
        'IAB':   { 'url': "http://standards-oui.ieee.org/iab/iab.csv", 'min_entries': 1000 },
        'OUI28': { 'url': "http://standards-oui.ieee.org/oui28/mam.csv", 'min_entries': 1000 },
        'OUI36': { 'url': "http://standards-oui.ieee.org/oui36/oui36.csv", 'min_entries': 1000 },
    }
    oui_d = {}
    hp = "[0-9a-fA-F]{2}"
    manuf_re = re.compile(r'^({}:{}:{})\s+(\S.*)$'.format(hp, hp, hp))

    min_total = 35000  # 35830 as of 2018-09-05
    tmpl_added = 0
    total_added = 0

    # Write out the header and populate the OUI list with our entries.

    try:
        tmpl_fd = io.open(template_path, 'r', encoding='UTF-8')
    except Exception:
        exit_msg("Couldn't open template file for reading ({}) ".format(template_path))
    with tmpl_fd:
        for tmpl_line in tmpl_fd:
            tmpl_line = tmpl_line.strip()
            m = manuf_re.match(tmpl_line)
            if not m and in_header:
                # Everything before the first OUI entry is header text.
                header_l.append(tmpl_line)
            elif m:
                in_header = False
                oui = m.group(1).upper()
                oui_d[oui] = m.group(2)
                tmpl_added += 1

    total_added += tmpl_added

    # Add IEEE entries from each of their databases
    ieee_db_l = sorted(ieee_d)

    for db in ieee_db_l:
        db_url = ieee_d[db]['url']
        ieee_d[db]['skipped'] = 0
        ieee_d[db]['added'] = 0
        ieee_d[db]['total'] = 0
        print('Merging {} data from {}'.format(db, db_url))
        (body, response_d) = open_url(db_url)
        ieee_csv = csv.reader(body.splitlines())
        ieee_d[db]['last-modified'] = response_d['Last-Modified']
        ieee_d[db]['length'] = response_d['Content-Length']

        # Pop the title row. An empty body yields no rows at all; the
        # minimum-entry check below reports that case.
        next(ieee_csv, None)
        for ieee_row in ieee_csv:
            #Registry,Assignment,Organization Name,Organization Address
            #IAB,0050C2DD6,Transas Marine Limited,Datavagen 37 Askim Vastra Gotaland SE 436 32
            if len(ieee_row) < 3:
                # Blank or truncated line: no assignment/name to merge.
                continue
            oui = prefix_to_oui(ieee_row[1].upper())
            manuf = ieee_row[2].strip()
            if oui in oui_d:
                # Template entries take precedence. Compare only the
                # ASCII letters to decide whether it is a duplicate.
                action = 'Skipping'
                manuf_stripped = re.findall('[a-z]+', manuf.lower())
                tmpl_manuf_stripped = re.findall('[a-z]+', oui_d[oui].split('\t')[-1].strip().lower())
                if manuf_stripped == tmpl_manuf_stripped:
                    action = 'Skipping duplicate'

                print(u'{} - {} IEEE "{}" in favor of "{}"'.format(oui, action, manuf, oui_d[oui]))
                ieee_d[db]['skipped'] += 1
            else:
                oui_d[oui] = shorten(manuf)
                ieee_d[db]['added'] += 1
            ieee_d[db]['total'] += 1

        if ieee_d[db]['total'] < ieee_d[db]['min_entries']:
            exit_msg("Too few {} entries ({})".format(db, ieee_d[db]['total']))
        total_added += ieee_d[db]['total']

    if total_added < min_total:
        exit_msg("Too few total entries ({})".format(total_added))

    # Write the output file.

    try:
        manuf_fd = io.open(manuf_path, 'w', encoding='UTF-8')
    except Exception:
        exit_msg("Couldn't open manuf file for writing ({}) ".format(manuf_path))

    with manuf_fd:
        manuf_fd.write(u"# This file was generated by running ./tools/make-manuf.py.\n")
        manuf_fd.write(u"# Don't change it directly, change manuf.tmpl instead.\n#\n")
        manuf_fd.write('\n'.join(header_l))

        for db in ieee_db_l:
            manuf_fd.write(
                u'# {url}:\n'
                u'# Content-Length: {length}\n'
                u'# Last-Modified: {last-modified}\n'
                u'\n'.format(**ieee_d[db]))

        for oui in sorted(oui_d):
            manuf_fd.write(u'{}\t{}\n'.format(oui, oui_d[oui]))

    print('{:<20}: {}'.format('Original entries', tmpl_added))
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' added', ieee_d[db]['added']))
    print('{:<20}: {}'.format('Total added', total_added))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' total', ieee_d[db]['total']))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' skipped', ieee_d[db]['skipped']))


if __name__ == '__main__':
    main()