# Copyright (c) 2013-2020 Philip Hane
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from . import NetError
from .utils import unique_everseen
import logging
import sys
import re
import copy
from datetime import (datetime, timedelta)

if sys.version_info >= (3, 3):  # pragma: no cover
    from ipaddress import (ip_address,
                           ip_network,
                           summarize_address_range,
                           collapse_addresses)
else:  # pragma: no cover
    from ipaddr import (IPAddress as ip_address,
                        IPNetwork as ip_network,
                        summarize_address_range,
                        collapse_address_list as collapse_addresses)

log = logging.getLogger(__name__)

# Base NIR whois output dictionary.
BASE_NET = {
    'cidr': None,
    'name': None,
    'handle': None,
    'range': None,
    'country': None,
    'address': None,
    'postal_code': None,
    'nameservers': None,
    'created': None,
    'updated': None,
    'contacts': None
}

# Base NIR whois contact output dictionary.
BASE_CONTACT = {
    'name': None,
    'email': None,
    'reply_email': None,
    'organization': None,
    'division': None,
    'title': None,
    'phone': None,
    'fax': None,
    'updated': None
}

# National Internet Registry query/parsing configuration, keyed by NIR name.
# Every regex exposes the captured value through the named group 'val'.
NIR_WHOIS = {
    'jpnic': {
        'country_code': 'JP',
        'url': ('http://whois.nic.ad.jp/cgi-bin/whois_gw?lang=%2Fe&key={0}'
                '&submit=query'),
        'request_type': 'GET',
        'request_headers': {'Accept': 'text/html'},
        'form_data_ip_field': None,
        'fields': {
            'name': r'(\[Organization\])[^\S\n]+(?P<val>.*?)\n',
            'handle': r'(\[Network Name\])[^\S\n]+(?P<val>.*?)\n',
            'created': r'(\[Assigned Date\])[^\S\n]+(?P<val>.*?)\n',
            'updated': r'(\[Last Update\])[^\S\n]+(?P<val>.*?)\n',
            'nameservers': r'(\[Nameserver\])[^\S\n]+(?P<val>.*?)\n',
            'contact_admin': r'(\[Administrative Contact\])[^\S\n]+.+?\>'
                             '(?P<val>.+?)\\<\\/A\\>\n',
            'contact_tech': r'(\[Technical Contact\])[^\S\n]+.+?\>'
                            '(?P<val>.+?)\\<\\/A\\>\n'
        },
        'contact_fields': {
            'name': r'(\[Last, First\])[^\S\n]+(?P<val>.*?)\n',
            'email': r'(\[E-Mail\])[^\S\n]+(?P<val>.*?)\n',
            'reply_email': r'(\[Reply Mail\])[^\S\n]+(?P<val>.*?)\n',
            'organization': r'(\[Organization\])[^\S\n]+(?P<val>.*?)\n',
            'division': r'(\[Division\])[^\S\n]+(?P<val>.*?)\n',
            'title': r'(\[Title\])[^\S\n]+(?P<val>.*?)\n',
            'phone': r'(\[TEL\])[^\S\n]+(?P<val>.*?)\n',
            'fax': r'(\[FAX\])[^\S\n]+(?P<val>.*?)\n',
            'updated': r'(\[Last Update\])[^\S\n]+(?P<val>.*?)\n'
        },
        'dt_format': '%Y/%m/%d %H:%M:%S(JST)',
        'dt_hourdelta': 9,
        'multi_net': False
    },
    'krnic': {
        'country_code': 'KR',
        'url': 'https://xn--c79as89aj0e29b77z.xn--3e0b707e/eng/whois.jsc',
        'request_type': 'POST',
        'request_headers': {
            'Accept': 'text/html',
            'Referer': (
                'https://xn--c79as89aj0e29b77z.xn--3e0b707e/eng/whois.jsp'
            ),
        },
        'form_data_ip_field': 'query',
        'fields': {
            'name': r'(Organization Name)[\s]+\:[^\S\n]+(?P<val>.+?)\n',
            'handle': r'(Service Name|Network Type)[\s]+\:[^\S\n]+(?P<val>.+?)'
                      '\n',
            'address': r'(Address)[\s]+\:[^\S\n]+(?P<val>.+?)\n',
            'postal_code': r'(Zip Code)[\s]+\:[^\S\n]+(?P<val>.+?)\n',
            'created': r'(Registration Date)[\s]+\:[^\S\n]+(?P<val>.+?)\n',
            'contact_admin': r'(id="eng_isp_contact").+?\>(?P<val>.*?)\<'
                             '\\/div\\>\n',
            'contact_tech': r'(id="eng_user_contact").+?\>(?P<val>.*?)\<'
                            '\\/div\\>\n'
        },
        'contact_fields': {
            'name': r'(Name)[^\S\n]+?:[^\S\n]+?(?P<val>.*?)\n',
            'email': r'(E-Mail)[^\S\n]+?:[^\S\n]+?(?P<val>.*?)\n',
            'phone': r'(Phone)[^\S\n]+?:[^\S\n]+?(?P<val>.*?)\n'
        },
        'dt_format': '%Y%m%d',
        'dt_hourdelta': 0,
        'multi_net': True
    }
}


class NIRWhois:
    """
    The class for parsing whois data for NIRs (National Internet Registry).
    JPNIC and KRNIC are currently the only NIRs supported. Output varies
    based on NIR specific whois formatting.

    Args:
        net (:obj:`ipwhois.net.Net`): The network object.

    Raises:
        NetError: The parameter provided is not an instance of
            ipwhois.net.Net

    NOTE(review): the original docstring also listed IPDefinedError here;
    nothing in this class raises it directly - confirm against
    ipwhois.net.Net before relying on it.
    """

    def __init__(self, net):

        # Imported here rather than at module level (presumably to avoid a
        # circular import with ipwhois.net - verify before moving).
        from .net import Net

        # ipwhois.net.Net validation
        if not isinstance(net, Net):

            raise NetError('The provided net parameter is not an instance of '
                           'ipwhois.net.Net')

        self._net = net

    def parse_fields(self, response, fields_dict, net_start=None,
                     net_end=None, dt_format=None, field_list=None,
                     hourdelta=0, is_contact=False):
        """
        The function for parsing whois fields from a data input.

        Args:
            response (:obj:`str`): The response from the whois/rwhois server.
            fields_dict (:obj:`dict`): The mapping of fields to regex search
                values (required). Each regex must define the named group
                ``val``.
            net_start (:obj:`int`): The position in the response at which
                searching stops, i.e. the start of the *next* network
                section (if parsing multiple networks). Defaults to None
                (search to the end of the response).
            net_end (:obj:`int`): The position in the response at which
                searching begins, i.e. the end of the *current* network's
                header match (if parsing multiple networks). Defaults to
                None (search from the beginning of the response).
            dt_format (:obj:`str`): The format of datetime fields if known.
                Defaults to None.
            field_list (:obj:`list` of :obj:`str`): If provided, fields to
                parse. Defaults to :obj:`ipwhois.nir.BASE_NET` if is_contact
                is False. Otherwise, defaults to
                :obj:`ipwhois.nir.BASE_CONTACT`.
            hourdelta (:obj:`int`): The timezone delta, in hours, that is
                subtracted from created/updated values parsed with
                dt_format. Defaults to 0.
            is_contact (:obj:`bool`): If True, uses contact information
                field parsing. Defaults to False.

        Returns:
            dict: A dictionary of fields provided in fields_dict, mapping to
                the results of the regex searches. Only fields with at least
                one match are set. When is_contact is False the dictionary
                is pre-seeded with 'contacts', 'contact_admin' and
                'contact_tech' entries.
        """

        # The field patterns expect every value to be newline terminated,
        # so make sure the final line of the response is too.
        response = '{0}\n'.format(response)

        if is_contact:

            ret = {}

            if not field_list:

                field_list = list(BASE_CONTACT.keys())

        else:

            ret = {
                'contacts': {'admin': None, 'tech': None},
                'contact_admin': {},
                'contact_tech': {}
            }

            if not field_list:

                # 'contacts' is assembled by lookup() from the two
                # contact_* fields, so parse those instead.
                field_list = list(BASE_NET.keys())
                field_list.remove('contacts')
                field_list.append('contact_admin')
                field_list.append('contact_tech')

        # Search window: [net_end, net_start) when bounded. A missing
        # net_end means "from the beginning" (previously a net_start
        # without net_end raised TypeError inside finditer).
        search_from = 0 if net_end is None else net_end

        for field, pattern in fields_dict.items():

            if field not in field_list:

                continue

            compiled = re.compile(str(pattern), re.DOTALL)

            if net_start is not None:

                matches = compiled.finditer(response, search_from, net_start)

            else:

                matches = compiled.finditer(response, search_from)

            values = []
            for m in matches:

                try:

                    values.append(m.group('val').strip())

                except IndexError:

                    # Pattern without a 'val' group; nothing to collect.
                    pass

            if not values:

                continue

            value = None
            try:

                if field in ['created', 'updated'] and dt_format:

                    try:

                        # Normalize by removing the registry's timezone
                        # offset from the parsed timestamp.
                        value = (
                            datetime.strptime(
                                values[0],
                                str(dt_format)
                            ) - timedelta(hours=hourdelta)
                        ).isoformat('T')

                    except ValueError:

                        # Fall back to a date-only value; no timezone
                        # adjustment is applied in this case.
                        value = (
                            datetime.strptime(
                                values[0],
                                '%Y/%m/%d'
                            )
                        ).isoformat('T')

                elif field in ['nameservers']:

                    value = list(unique_everseen(values))

                else:

                    value = '\n'.join(unique_everseen(values))

            except ValueError as e:

                # Best effort: an unparsable value leaves the field as None.
                log.debug('NIR whois field parsing failed for %s: %s',
                          field, e)

            ret[field] = value

        return ret

    def get_nets_jpnic(self, response):
        """
        The function for parsing network blocks from jpnic whois data.

        Args:
            response (:obj:`str`): The response from the jpnic server.

        Returns:
            list of dict: Mapping of networks with start and end positions.
                Each entry is a copy of :obj:`ipwhois.nir.BASE_NET` with
                'cidr' and 'range' filled in, plus:

            ::

                [{
                    'cidr' (str) - The network routing block
                    'start' (int) - The starting point of the network
                    'end' (int) - The endpoint point of the network
                }]
        """

        nets = []

        # Iterate through all of the networks found, storing the CIDR value
        # and the start and end positions.
        for match in re.finditer(
            r'^.*?(\[Network Number\])[^\S\n]+.+?>(?P<val>.+?)</A>$',
            response,
            re.MULTILINE
        ):

            try:

                net = copy.deepcopy(BASE_NET)

                # Parse once, with surrounding whitespace removed, so a
                # padded value is not silently skipped.
                network = ip_network(match.group('val').strip())

                # ipaddress exposes network_address/broadcast_address; the
                # legacy ipaddr package exposes ip/broadcast instead.
                try:  # pragma: no cover
                    network_address = network.network_address
                except AttributeError:  # pragma: no cover
                    network_address = network.ip

                try:  # pragma: no cover
                    broadcast_address = network.broadcast_address
                except AttributeError:  # pragma: no cover
                    broadcast_address = network.broadcast

                # NOTE(review): the range starts at network address + 1,
                # not at the network address itself. Kept as-is because
                # callers may depend on this output - confirm intent.
                net['range'] = '{0} - {1}'.format(
                    network_address + 1, broadcast_address
                )

                net['cidr'] = str(network)
                net['start'] = match.start()
                net['end'] = match.end()
                nets.append(net)

            except (ValueError, TypeError):

                # Not a parsable network; skip this match.
                pass

        return nets

    def get_nets_krnic(self, response):
        """
        The function for parsing network blocks from krnic whois data.

        Args:
            response (:obj:`str`): The response from the krnic server.

        Returns:
            list of dict: Mapping of networks with start and end positions.
                Each entry is a copy of :obj:`ipwhois.nir.BASE_NET` with
                'cidr' and 'range' filled in, plus:

            ::

                [{
                    'cidr' (str) - The network routing block
                    'start' (int) - The starting point of the network
                    'end' (int) - The endpoint point of the network
                }]
        """

        nets = []

        # Iterate through all of the networks found, storing the CIDR value
        # and the start and end positions. Group 2 is the whole value;
        # groups 3 and 4 are the first/last address when the value has the
        # form "<first> - <last> (<suffix>)".
        for match in re.finditer(
            r'^(IPv4 Address)[\s]+:[^\S\n]+((.+?)[^\S\n]-[^\S\n](.+?)'
            '[^\\S\n]\\((.+?)\\)|.+)$',
            response,
            re.MULTILINE
        ):

            try:

                net = copy.deepcopy(BASE_NET)
                net['range'] = match.group(2)

                if match.group(3) and match.group(4):

                    # Address range: summarize into the minimal CIDR list.
                    addrs = list(summarize_address_range(
                        ip_address(match.group(3).strip()),
                        ip_address(match.group(4).strip())))

                    cidr = ', '.join(
                        [str(i) for i in collapse_addresses(addrs)]
                    )

                    net['range'] = '{0} - {1}'.format(
                        match.group(3), match.group(4)
                    )

                else:

                    # Single network value.
                    cidr = str(ip_network(match.group(2).strip()))

                net['cidr'] = cidr
                net['start'] = match.start()
                net['end'] = match.end()
                nets.append(net)

            except (ValueError, TypeError):

                # Not a parsable address/network; skip this match.
                pass

        return nets

    def get_contact(self, response=None, nir=None, handle=None,
                    retry_count=3, dt_format=None):
        """
        The function for retrieving and parsing NIR whois data based on
        NIR_WHOIS contact_fields.

        Args:
            response (:obj:`str`): Optional response object, this bypasses the
                lookup.
            nir (:obj:`str`): The NIR to query ('jpnic' or 'krnic'). Always
                required, as it selects the contact_fields used for
                parsing; a KeyError is raised for unknown values.
            handle (:obj:`str`): For NIRs that have separate contact queries
                (JPNIC), this is the contact handle to use in the query.
                Defaults to None.
            retry_count (:obj:`int`): The number of times to retry in case
                socket errors, timeouts, connection resets, etc. are
                encountered. Defaults to 3.
            dt_format (:obj:`str`): The format of datetime fields if known.
                Defaults to None.

        Returns:
            dict: Mapping of the fields provided in contact_fields, to their
                parsed results.
        """

        # KRNIC contact data is parsed from the text handed in by the
        # caller, so no separate HTTP query is made for it.
        if response or nir == 'krnic':

            contact_response = response

        else:

            # Retrieve the whois data.
            contact_response = self._net.get_http_raw(
                url=str(NIR_WHOIS[nir]['url']).format(handle),
                retry_count=retry_count,
                headers=NIR_WHOIS[nir]['request_headers'],
                request_type=NIR_WHOIS[nir]['request_type']
            )

        return self.parse_fields(
            response=contact_response,
            fields_dict=NIR_WHOIS[nir]['contact_fields'],
            dt_format=dt_format,
            hourdelta=int(NIR_WHOIS[nir]['dt_hourdelta']),
            is_contact=True
        )

    def lookup(self, nir=None, inc_raw=False, retry_count=3, response=None,
               field_list=None, is_offline=False):
        """
        The function for retrieving and parsing NIR whois information for an IP
        address via HTTP (HTML scraping).

        Args:
            nir (:obj:`str`): The NIR to query ('jpnic' or 'krnic').
                Required.
            inc_raw (:obj:`bool`, optional): Whether to include the raw
                results in the returned dictionary. Defaults to False.
            retry_count (:obj:`int`): The number of times to retry in case
                socket errors, timeouts, connection resets, etc. are
                encountered. Defaults to 3.
            response (:obj:`str`): Optional response object, this bypasses the
                NIR lookup. Required when is_offline=True.
            field_list (:obj:`list` of :obj:`str`): If provided, fields to
                parse. Defaults to :obj:`ipwhois.nir.BASE_NET`.
            is_offline (:obj:`bool`): Whether to perform lookups offline. If
                True, response must be provided and contacts are not
                resolved. Primarily used for testing.

        Returns:
            dict: The NIR whois results:

            ::

                {
                    'query' (str) - The IP address.
                    'nets' (list of dict) - Network information which consists
                        of the fields listed in the ipwhois.nir.NIR_WHOIS
                        dictionary.
                    'raw' (str) - Raw NIR whois results if the inc_raw
                        parameter is True.
                }

        Raises:
            KeyError: nir is not a key of NIR_WHOIS, or is_offline is True
                and no response was provided.
        """

        if nir not in NIR_WHOIS:

            raise KeyError('Invalid arg for nir (National Internet Registry)')

        nir_config = NIR_WHOIS[nir]

        # Create the return dictionary.
        results = {
            'query': self._net.address_str,
            'raw': None
        }

        # Only fetch the response if we haven't already.
        if response is None:

            if is_offline:

                raise KeyError('response argument required when '
                               'is_offline=True')

            log.debug('Response not given, perform WHOIS lookup for %s',
                      self._net.address_str)

            form_data = None
            if nir_config['form_data_ip_field']:
                form_data = {nir_config['form_data_ip_field']:
                             self._net.address_str}

            # Retrieve the whois data.
            response = self._net.get_http_raw(
                url=str(nir_config['url']).format(self._net.address_str),
                retry_count=retry_count,
                headers=nir_config['request_headers'],
                request_type=nir_config['request_type'],
                form_data=form_data
            )

        # If inc_raw parameter is True, add the response to return dictionary.
        if inc_raw:

            results['raw'] = response

        if nir == 'jpnic':

            nets = self.get_nets_jpnic(response)

        elif nir == 'krnic':

            nets = self.get_nets_krnic(response)

        else:  # pragma: no cover

            # A NIR configured in NIR_WHOIS without a network parser.
            nets = []

        # These do not vary per network; resolve them once.
        dt_format = nir_config.get('dt_format')
        hourdelta = int(nir_config['dt_hourdelta'])

        # Contacts already resolved during this lookup, keyed by the contact
        # handle (jpnic) or contact text (krnic), to avoid repeat queries.
        global_contacts = {}

        # Iterate through all of the network sections and parse out the
        # appropriate fields for each.
        log.debug('Parsing NIR WHOIS data')
        for index, net in enumerate(nets):

            # A section runs from the end of this network's header match to
            # the start of the next network (or the end of the response).
            section_end = None
            if index + 1 < len(nets):
                section_end = nets[index + 1]['start']

            # parse_fields searches from net_end up to net_start.
            temp_net = self.parse_fields(
                response=response,
                fields_dict=nir_config['fields'],
                net_start=section_end,
                net_end=net['end'],
                dt_format=dt_format,
                field_list=field_list,
                hourdelta=hourdelta
            )
            temp_net['country'] = nir_config['country_code']
            contacts = {
                'admin': temp_net['contact_admin'],
                'tech': temp_net['contact_tech']
            }

            # The raw contact_* values are folded into 'contacts' below.
            del (
                temp_net['contact_admin'],
                temp_net['contact_tech']
            )

            if not is_offline:

                for key, val in contacts.items():

                    if not val:

                        continue

                    if isinstance(val, str):

                        val = val.splitlines()

                    # NOTE: when several contacts are listed for one role,
                    # each overwrites the previous, so the last one wins.
                    for contact in val:

                        if contact in global_contacts:

                            temp_net['contacts'][key] = (
                                global_contacts[contact]
                            )
                            continue

                        if nir == 'krnic':

                            # The matched text is the contact data itself.
                            tmp_response = contact
                            tmp_handle = None

                        else:

                            # The matched text is a handle to query.
                            tmp_response = None
                            tmp_handle = contact

                        temp_net['contacts'][key] = self.get_contact(
                            response=tmp_response,
                            handle=tmp_handle,
                            nir=nir,
                            retry_count=retry_count,
                            dt_format=dt_format
                        )
                        global_contacts[contact] = (
                            temp_net['contacts'][key]
                        )

            # Merge the net dictionaries.
            net.update(temp_net)

            # The start and end values are no longer needed.
            del net['start'], net['end']

        # Add the networks to the return dictionary.
        results['nets'] = nets

        return results