1# Copyright (c) 2013-2020 Philip Hane
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are met:
6#
7# 1. Redistributions of source code must retain the above copyright notice,
8#    this list of conditions and the following disclaimer.
9# 2. Redistributions in binary form must reproduce the above copyright notice,
10#    this list of conditions and the following disclaimer in the documentation
11#    and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
17# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
23# POSSIBILITY OF SUCH DAMAGE.
24
25from . import NetError
26from .utils import unique_everseen
27import logging
28import sys
29import re
30import copy
31from datetime import (datetime, timedelta)
32
33if sys.version_info >= (3, 3):  # pragma: no cover
34    from ipaddress import (ip_address,
35                           ip_network,
36                           summarize_address_range,
37                           collapse_addresses)
38else:  # pragma: no cover
39    from ipaddr import (IPAddress as ip_address,
40                        IPNetwork as ip_network,
41                        summarize_address_range,
42                        collapse_address_list as collapse_addresses)
43
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Base NIR whois output dictionary. Every key is present with a None
# placeholder so that parsed network results always expose the same keys.
BASE_NET = dict.fromkeys((
    'cidr',
    'name',
    'handle',
    'range',
    'country',
    'address',
    'postal_code',
    'nameservers',
    'created',
    'updated',
    'contacts',
))

# Base NIR whois contact output dictionary. As above, all values start out
# as None.
BASE_CONTACT = dict.fromkeys((
    'name',
    'email',
    'reply_email',
    'organization',
    'division',
    'title',
    'phone',
    'fax',
    'updated',
))
73
def _jpnic_field(label):
    # Regex for a bracketed '[label]' followed by same-line whitespace; the
    # remainder of the line is captured in the 'val' group.
    return r'(\[{0}\])[^\S\n]+(?P<val>.*?)\n'.format(label)


def _krnic_field(label):
    # Regex for 'label   : value'; the value (at least one character) is
    # captured in the 'val' group.
    return r'({0})[\s]+\:[^\S\n]+(?P<val>.+?)\n'.format(label)


def _krnic_contact_field(label):
    # Regex for 'label : value' inside a contact section; the value is
    # captured in the 'val' group.
    return r'({0})[^\S\n]+?:[^\S\n]+?(?P<val>.*?)\n'.format(label)


# Common prefix of the KRNIC whois endpoint and its Referer header.
_KRNIC_SITE = 'https://xn--c79as89aj0e29b77z.xn--3e0b707e/eng/'

# National Internet Registry
# Per-NIR request settings, regex tables for network ('fields') and contact
# ('contact_fields') parsing, and the datetime format / hour offset applied
# to created/updated values.
NIR_WHOIS = {
    'jpnic': {
        'country_code': 'JP',
        'url': 'http://whois.nic.ad.jp/cgi-bin/whois_gw'
               '?lang=%2Fe&key={0}&submit=query',
        'request_type': 'GET',
        'request_headers': {'Accept': 'text/html'},
        'form_data_ip_field': None,
        'fields': {
            'name': _jpnic_field('Organization'),
            'handle': _jpnic_field('Network Name'),
            'created': _jpnic_field('Assigned Date'),
            'updated': _jpnic_field('Last Update'),
            'nameservers': _jpnic_field('Nameserver'),
            # The contact handles are the link text of an anchor tag, so
            # these two patterns do not fit the plain field template.
            'contact_admin': r'(\[Administrative Contact\])[^\S\n]+.+?\>'
                             '(?P<val>.+?)\\<\\/A\\>\n',
            'contact_tech': r'(\[Technical Contact\])[^\S\n]+.+?\>'
                             '(?P<val>.+?)\\<\\/A\\>\n'
        },
        'contact_fields': {
            'name': _jpnic_field('Last, First'),
            'email': _jpnic_field('E-Mail'),
            'reply_email': _jpnic_field('Reply Mail'),
            'organization': _jpnic_field('Organization'),
            'division': _jpnic_field('Division'),
            'title': _jpnic_field('Title'),
            'phone': _jpnic_field('TEL'),
            'fax': _jpnic_field('FAX'),
            'updated': _jpnic_field('Last Update')
        },
        'dt_format': '%Y/%m/%d %H:%M:%S(JST)',
        'dt_hourdelta': 9,
        'multi_net': False
    },
    'krnic': {
        'country_code': 'KR',
        'url': _KRNIC_SITE + 'whois.jsc',
        'request_type': 'POST',
        'request_headers': {
            'Accept': 'text/html',
            'Referer': _KRNIC_SITE + 'whois.jsp',
        },
        'form_data_ip_field': 'query',
        'fields': {
            'name': _krnic_field('Organization Name'),
            # Kept as a literal: this pattern ends with a real newline
            # character rather than the escaped form the template produces.
            'handle': r'(Service Name|Network Type)[\s]+\:[^\S\n]+(?P<val>.+?)'
                      '\n',
            'address': _krnic_field('Address'),
            'postal_code': _krnic_field('Zip Code'),
            'created': _krnic_field('Registration Date'),
            # Contact blocks are matched as the content of specific div ids.
            'contact_admin': r'(id="eng_isp_contact").+?\>(?P<val>.*?)\<'
                              '\\/div\\>\n',
            'contact_tech': r'(id="eng_user_contact").+?\>(?P<val>.*?)\<'
                             '\\/div\\>\n'
        },
        'contact_fields': {
            'name': _krnic_contact_field('Name'),
            'email': _krnic_contact_field('E-Mail'),
            'phone': _krnic_contact_field('Phone')
        },
        'dt_format': '%Y%m%d',
        'dt_hourdelta': 0,
        'multi_net': True
    }
}
142
143
class NIRWhois:
    """
    The class for parsing whois data for NIRs (National Internet Registry).
    JPNIC and KRNIC are currently the only NIRs supported. Output varies
    based on NIR specific whois formatting.

    Args:
        net (:obj:`ipwhois.net.Net`): The network object.

    Raises:
        NetError: The parameter provided is not an instance of
            ipwhois.net.Net
    """

    def __init__(self, net):

        # Imported at call time rather than at module level (presumably to
        # avoid a circular import with the net module; confirm before
        # moving it).
        from .net import Net

        # ipwhois.net.Net validation
        if not isinstance(net, Net):

            raise NetError('The provided net parameter is not an instance of '
                           'ipwhois.net.Net')

        self._net = net

    def parse_fields(self, response, fields_dict, net_start=None,
                     net_end=None, dt_format=None, field_list=None,
                     hourdelta=0, is_contact=False):
        """
        The function for parsing whois fields from a data input.

        Each selected pattern in fields_dict is searched for inside a window
        of the response and the text captured by its 'val' group is
        collected. Note that the window parameters are named from the
        caller's point of view: the search begins at net_end (the end of the
        current network header) and stops at net_start (the start of the
        next network).

        Args:
            response (:obj:`str`): The response from the whois/rwhois server.
            fields_dict (:obj:`dict`): The mapping of fields to regex search
                values (required).
            net_start (:obj:`int`): The position at which searching stops
                (lookup passes the start of the following network). Defaults
                to None, meaning the end of the response.
            net_end (:obj:`int`): The position at which searching begins
                (lookup passes the end of the current network header).
                Defaults to None, meaning the beginning of the response.
            dt_format (:obj:`str`): The format of datetime fields if known.
                Defaults to None.
            field_list (:obj:`list` of :obj:`str`): If provided, fields to
                parse. Defaults to the keys of :obj:`ipwhois.nir.BASE_NET`
                (with 'contacts' replaced by 'contact_admin' and
                'contact_tech') if is_contact is False. Otherwise, defaults
                to the keys of :obj:`ipwhois.nir.BASE_CONTACT`.
            hourdelta (:obj:`int`): The number of hours subtracted from
                created/updated values parsed with dt_format. Defaults to 0.
            is_contact (:obj:`bool`): If True, uses contact information
                field parsing. Defaults to False.

        Returns:
            dict: A dictionary of fields provided in fields_dict, mapping to
                the results of the regex searches. Fields without a match
                are omitted. When is_contact is False the dictionary also
                always contains 'contacts', 'contact_admin' and
                'contact_tech'.
        """

        # Append a newline so the final line of the response is terminated;
        # the patterns in NIR_WHOIS all end by matching a newline.
        response = '{0}\n'.format(response)

        if is_contact:

            ret = {}

            if not field_list:

                field_list = list(BASE_CONTACT.keys())

        else:

            ret = {
                'contacts': {'admin': None, 'tech': None},
                'contact_admin': {},
                'contact_tech': {}
            }

            if not field_list:

                # 'contacts' is assembled by lookup() from the two raw
                # contact fields, so parse those instead.
                field_list = [
                    key for key in BASE_NET.keys() if key != 'contacts'
                ]
                field_list.extend(['contact_admin', 'contact_tech'])

        # Resolve the search window once. Handling each bound independently
        # also covers net_start being given without net_end, which
        # previously passed None as the start position and raised TypeError.
        pos = 0 if net_end is None else net_end
        endpos = len(response) if net_start is None else net_start

        for field, pattern in fields_dict.items():

            if field not in field_list:

                continue

            regex = re.compile(str(pattern), re.DOTALL)

            values = []
            for m in regex.finditer(response, pos, endpos):

                try:

                    values.append(m.group('val').strip())

                except IndexError:

                    # The pattern defines no 'val' group; nothing to keep.
                    pass

            if not values:

                continue

            value = None
            try:

                if field in ('created', 'updated') and dt_format:

                    try:

                        parsed = datetime.strptime(values[0], str(dt_format))
                        value = (
                            parsed - timedelta(hours=hourdelta)
                        ).isoformat('T')

                    except ValueError:

                        # Fall back to a date-only value; no hour offset is
                        # applied in this case.
                        value = datetime.strptime(
                            values[0], '%Y/%m/%d'
                        ).isoformat('T')

                elif field == 'nameservers':

                    value = list(unique_everseen(values))

                else:

                    value = '\n'.join(unique_everseen(values))

            except ValueError as e:

                # Neither datetime format matched; the field is kept with a
                # value of None.
                log.debug('NIR whois field parsing failed for {0}: {1}'
                          ''.format(field, e))

            ret[field] = value

        return ret

    def get_nets_jpnic(self, response):
        """
        The function for parsing network blocks from jpnic whois data.

        Args:
            response (:obj:`str`): The response from the jpnic server.

        Returns:
            list of dict: Copies of :obj:`ipwhois.nir.BASE_NET` with the
            following keys filled in. Entries whose network value cannot be
            parsed are skipped.

            ::

                [{
                    'cidr' (str) - The network routing block
                    'range' (str) - The network range
                    'start' (int) - The starting point of the network
                    'end' (int) - The endpoint point of the network
                }]
        """

        nets = []

        # Iterate through all of the networks found, storing the CIDR value
        # and the start and end positions.
        for match in re.finditer(
                r'^.*?(\[Network Number\])[^\S\n]+.+?>(?P<val>.+?)</A>$',
                response,
                re.MULTILINE
        ):

            try:

                net = copy.deepcopy(BASE_NET)

                # Strip before parsing, and parse only once. Previously the
                # first parse used the unstripped text, so a value padded
                # with whitespace was discarded.
                network = ip_network(match.group('val').strip())

                # The ipaddress and ipaddr libraries use different attribute
                # names; try the ipaddress names first.
                try:  # pragma: no cover
                    network_address = network.network_address
                except AttributeError:  # pragma: no cover
                    network_address = network.ip

                try:  # pragma: no cover
                    broadcast_address = network.broadcast_address
                except AttributeError:  # pragma: no cover
                    broadcast_address = network.broadcast

                # NOTE(review): the range starts one address after the
                # network address but ends on the broadcast address. Kept
                # as-is because callers may rely on this output; confirm
                # whether the offset is intended.
                net['range'] = '{0} - {1}'.format(
                    network_address + 1, broadcast_address
                )

                net['cidr'] = str(network)
                net['start'] = match.start()
                net['end'] = match.end()
                nets.append(net)

            except (ValueError, TypeError):

                # Not a usable network value; skip this entry.
                pass

        return nets

    def get_nets_krnic(self, response):
        """
        The function for parsing network blocks from krnic whois data.

        Args:
            response (:obj:`str`): The response from the krnic server.

        Returns:
            list of dict: Copies of :obj:`ipwhois.nir.BASE_NET` with the
            following keys filled in. Entries whose addresses cannot be
            parsed are skipped.

            ::

                [{
                    'cidr' (str) - The network routing block
                    'range' (str) - The network range
                    'start' (int) - The starting point of the network
                    'end' (int) - The endpoint point of the network
                }]
        """

        nets = []

        # Iterate through all of the networks found, storing the CIDR value
        # and the start and end positions. Group 2 is the whole value;
        # groups 3 and 4 are the first and last address when the value has
        # the form 'first - last (...)'.
        for match in re.finditer(
                r'^(IPv4 Address)[\s]+:[^\S\n]+((.+?)[^\S\n]-[^\S\n](.+?)'
                '[^\\S\n]\\((.+?)\\)|.+)$',
                response,
                re.MULTILINE
        ):

            try:

                net = copy.deepcopy(BASE_NET)
                net['range'] = match.group(2)

                first = match.group(3)
                last = match.group(4)

                if first and last:

                    # Convert the address range to the CIDR block(s) that
                    # cover it.
                    addrs = list(summarize_address_range(
                        ip_address(first.strip()),
                        ip_address(last.strip())
                    ))

                    cidr = ', '.join(
                        [str(i) for i in collapse_addresses(addrs)]
                    )

                    net['range'] = '{0} - {1}'.format(first, last)

                else:

                    # No range form matched; treat the value as a network.
                    cidr = str(ip_network(match.group(2).strip()))

                net['cidr'] = cidr
                net['start'] = match.start()
                net['end'] = match.end()
                nets.append(net)

            except (ValueError, TypeError):

                # Not a usable address/network value; skip this entry.
                pass

        return nets

    def get_contact(self, response=None, nir=None, handle=None,
                    retry_count=3, dt_format=None):
        """
        The function for retrieving and parsing NIR whois data based on
        NIR_WHOIS contact_fields.

        Args:
            response (:obj:`str`): Optional response object, this bypasses the
                lookup.
            nir (:obj:`str`): The NIR to query ('jpnic' or 'krnic'). Always
                needed in practice: it selects the contact_fields patterns
                and hour offset even when response is provided.
            handle (:obj:`str`): For NIRs that have separate contact queries
                (JPNIC), this is the contact handle to use in the query.
                Defaults to None.
            retry_count (:obj:`int`): The number of times to retry in case
                socket errors, timeouts, connection resets, etc. are
                encountered. Defaults to 3.
            dt_format (:obj:`str`): The format of datetime fields if known.
                Defaults to None.

        Returns:
            dict: Mapping of the fields provided in contact_fields, to their
                parsed results.

        Raises:
            KeyError: nir is not a key of :obj:`ipwhois.nir.NIR_WHOIS`.
        """

        nir_conf = NIR_WHOIS[nir]

        if response or nir == 'krnic':

            # No separate contact query is made: the given response is
            # parsed directly (for krnic even when it is empty).
            contact_response = response

        else:

            # Retrieve the whois data.
            contact_response = self._net.get_http_raw(
                url=str(nir_conf['url']).format(handle),
                retry_count=retry_count,
                headers=nir_conf['request_headers'],
                request_type=nir_conf['request_type']
            )

        return self.parse_fields(
            response=contact_response,
            fields_dict=nir_conf['contact_fields'],
            dt_format=dt_format,
            hourdelta=int(nir_conf['dt_hourdelta']),
            is_contact=True
        )

    def lookup(self, nir=None, inc_raw=False, retry_count=3, response=None,
               field_list=None, is_offline=False):
        """
        The function for retrieving and parsing NIR whois information for an IP
        address via HTTP (HTML scraping).

        Args:
            nir (:obj:`str`): The NIR to query ('jpnic' or 'krnic').
                Required; it also selects the parsing rules when response
                is provided.
            inc_raw (:obj:`bool`, optional): Whether to include the raw
                results in the returned dictionary. Defaults to False.
            retry_count (:obj:`int`): The number of times to retry in case
                socket errors, timeouts, connection resets, etc. are
                encountered. Defaults to 3.
            response (:obj:`str`): Optional response object, this bypasses the
                NIR lookup. Required when is_offline=True.
            field_list (:obj:`list` of :obj:`str`): If provided, fields to
                parse. Defaults to :obj:`ipwhois.nir.BASE_NET`.
            is_offline (:obj:`bool`): Whether to perform lookups offline. If
                True, response must be provided and contacts are not
                resolved. Primarily used for testing.

        Returns:
            dict: The NIR whois results:

            ::

                {
                    'query' (str) - The IP address.
                    'nets' (list of dict) - Network information which consists
                        of the fields listed in the ipwhois.nir.NIR_WHOIS
                        dictionary.
                    'raw' (str) - Raw NIR whois results if the inc_raw
                        parameter is True, otherwise None.
                }

        Raises:
            KeyError: nir is not a key of :obj:`ipwhois.nir.NIR_WHOIS`, or
                is_offline is True and no response was provided.
        """

        if nir not in NIR_WHOIS:

            raise KeyError('Invalid arg for nir (National Internet Registry)')

        nir_conf = NIR_WHOIS[nir]

        # Create the return dictionary.
        results = {
            'query': self._net.address_str,
            'raw': None
        }

        # Only fetch the response if we haven't already.
        if response is None:

            if is_offline:

                raise KeyError('response argument required when '
                               'is_offline=True')

            log.debug('Response not given, perform WHOIS lookup for {0}'
                      .format(self._net.address_str))

            # Some NIRs take the address as form data rather than in the URL.
            form_data = None
            if nir_conf['form_data_ip_field']:
                form_data = {nir_conf['form_data_ip_field']:
                             self._net.address_str}

            # Retrieve the whois data.
            response = self._net.get_http_raw(
                url=str(nir_conf['url']).format(self._net.address_str),
                retry_count=retry_count,
                headers=nir_conf['request_headers'],
                request_type=nir_conf['request_type'],
                form_data=form_data
            )

        # If inc_raw parameter is True, add the response to return dictionary.
        if inc_raw:

            results['raw'] = response

        nets = []
        if nir == 'jpnic':

            nets = self.get_nets_jpnic(response)

        elif nir == 'krnic':

            nets = self.get_nets_krnic(response)

        # These settings are the same for every network; read them once.
        dt_format = nir_conf.get('dt_format')
        hourdelta = int(nir_conf['dt_hourdelta'])

        # Contacts already resolved during this lookup, keyed by the raw
        # contact text/handle, so each one is only resolved once.
        global_contacts = {}

        # Iterate through all of the network sections and parse out the
        # appropriate fields for each.
        log.debug('Parsing NIR WHOIS data')
        for index, net in enumerate(nets):

            # A network's section runs up to the start of the next network
            # (or to the end of the response for the last one).
            section_end = None
            if index + 1 < len(nets):
                section_end = nets[index + 1]['start']

            temp_net = self.parse_fields(
                response=response,
                fields_dict=nir_conf['fields'],
                net_start=section_end,
                net_end=net['end'],
                dt_format=dt_format,
                field_list=field_list,
                hourdelta=hourdelta
            )
            temp_net['country'] = nir_conf['country_code']

            # Move the raw contact fields out of the result; they are
            # replaced by the parsed 'contacts' entries below.
            contacts = {
                'admin': temp_net.pop('contact_admin'),
                'tech': temp_net.pop('contact_tech')
            }

            if not is_offline:

                for key, val in contacts.items():

                    if not val:

                        continue

                    if isinstance(val, str):

                        val = val.splitlines()

                    for contact in val:

                        if contact not in global_contacts:

                            if nir == 'krnic':

                                # The contact details are embedded in the
                                # response itself.
                                tmp_response = contact
                                tmp_handle = None

                            else:

                                # The contact is a handle that needs its
                                # own query.
                                tmp_response = None
                                tmp_handle = contact

                            global_contacts[contact] = self.get_contact(
                                response=tmp_response,
                                handle=tmp_handle,
                                nir=nir,
                                retry_count=retry_count,
                                dt_format=dt_format
                            )

                        temp_net['contacts'][key] = global_contacts[contact]

            # Merge the net dictionaries.
            net.update(temp_net)

            # The start and end values are no longer needed.
            del net['start'], net['end']

        # Add the networks to the return dictionary.
        results['nets'] = nets

        return results
656