from __future__ import unicode_literals

import re
import requests
import time
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from decimal import Decimal

from .exceptions import (
  PageError, DisambiguationError, RedirectError, HTTPTimeoutError,
  WikipediaException, ODD_ERROR_MESSAGE)
from .util import cache, stdout_encode, debug

API_URL = 'https://en.wikipedia.org/w/api.php'
RATE_LIMIT = False
RATE_LIMIT_MIN_WAIT = None
RATE_LIMIT_LAST_CALL = None
USER_AGENT = 'wikipedia (https://github.com/goldsmith/Wikipedia/)'


def set_lang(prefix):
  '''
  Change the language of the API being requested.
  Set `prefix` to one of the two-letter prefixes found on the `list of all Wikipedias <http://meta.wikimedia.org/wiki/List_of_Wikipedias>`_.

  After setting the language, the cache for ``search``, ``suggest``, and ``summary`` will be cleared.

  .. note:: Make sure you search for page titles in the language that you have set.
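
  Example (illustrative; assumes network access to Wikipedia):

    >>> import wikipedia
    >>> wikipedia.set_lang('fr')
    >>> wikipedia.summary('France', sentences=1)  # now served from fr.wikipedia.org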
  '''
  global API_URL
  API_URL = 'https://' + prefix.lower() + '.wikipedia.org/w/api.php'

  for cached_func in (search, suggest, summary):
    cached_func.clear_cache()


def set_user_agent(user_agent_string):
  '''
  Set the User-Agent string to be used for all requests.

  Arguments:

  * user_agent_string - (string) a string specifying the User-Agent header
  '''
  global USER_AGENT
  USER_AGENT = user_agent_string


def set_rate_limiting(rate_limit, min_wait=timedelta(milliseconds=50)):
  '''
  Enable or disable rate limiting on requests to the Mediawiki servers.
  If rate limiting is not enabled, under some circumstances (depending on
  load on Wikipedia, the number of requests you and other `wikipedia` users
  are making, and other factors), Wikipedia may return an HTTP timeout error.

  Enabling rate limiting generally prevents that issue, but please note that
  HTTPTimeoutError still might be raised.

  Arguments:

  * rate_limit - (Boolean) whether to enable rate limiting or not

  Keyword arguments:

  * min_wait - if rate limiting is enabled, `min_wait` is a timedelta describing the minimum time to wait between requests.
         Defaults to timedelta(milliseconds=50)
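
  Example (illustrative):

    >>> import wikipedia
    >>> from datetime import timedelta
    >>> wikipedia.set_rate_limiting(True, min_wait=timedelta(milliseconds=100))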
  '''
  global RATE_LIMIT
  global RATE_LIMIT_MIN_WAIT
  global RATE_LIMIT_LAST_CALL

  RATE_LIMIT = rate_limit
  if not rate_limit:
    RATE_LIMIT_MIN_WAIT = None
  else:
    RATE_LIMIT_MIN_WAIT = min_wait

  RATE_LIMIT_LAST_CALL = None


@cache
def search(query, results=10, suggestion=False):
  '''
  Do a Wikipedia search for `query`.

  Keyword arguments:

  * results - the maximum number of results returned
  * suggestion - if True, return results and suggestion (if any) in a tuple
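
  Example (illustrative; hits depend on Wikipedia's live search index):

    >>> import wikipedia
    >>> wikipedia.search('Barack', results=3)             # -> list of up to 3 title strings
    >>> wikipedia.search('Barak Obama', suggestion=True)  # -> (results, suggestion or None)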
  '''

  search_params = {
    'list': 'search',
    'srprop': '',
    'srlimit': results,
    'srsearch': query
  }
  if suggestion:
    search_params['srinfo'] = 'suggestion'

  raw_results = _wiki_request(search_params)

  if 'error' in raw_results:
    if raw_results['error']['info'] in ('HTTP request timed out.', 'Pool queue is full'):
      raise HTTPTimeoutError(query)
    else:
      raise WikipediaException(raw_results['error']['info'])

  search_results = (d['title'] for d in raw_results['query']['search'])

  if suggestion:
    if raw_results['query'].get('searchinfo'):
      return list(search_results), raw_results['query']['searchinfo']['suggestion']
    else:
      return list(search_results), None

  return list(search_results)


@cache
def geosearch(latitude, longitude, title=None, results=10, radius=1000):
  '''
  Do a Wikipedia geosearch for `latitude` and `longitude`
  using the GeoData API described at http://www.mediawiki.org/wiki/Extension:GeoData

  Arguments:

  * latitude (float or decimal.Decimal)
  * longitude (float or decimal.Decimal)

  Keyword arguments:

  * title - The title of an article to search for
  * results - the maximum number of results returned
  * radius - Search radius in meters. The value must be between 10 and 10000
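
  Example (illustrative; the coordinates below are central Paris):

    >>> import wikipedia
    >>> wikipedia.geosearch(48.8566, 2.3522, radius=500)  # -> titles of nearby articles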
  '''

  search_params = {
    'list': 'geosearch',
    'gsradius': radius,
    'gscoord': '{0}|{1}'.format(latitude, longitude),
    'gslimit': results
  }
  if title:
    search_params['titles'] = title

  raw_results = _wiki_request(search_params)

  if 'error' in raw_results:
    if raw_results['error']['info'] in ('HTTP request timed out.', 'Pool queue is full'):
      raise HTTPTimeoutError('{0}|{1}'.format(latitude, longitude))
    else:
      raise WikipediaException(raw_results['error']['info'])

  search_pages = raw_results['query'].get('pages')
  if search_pages:
    search_results = (v['title'] for k, v in search_pages.items() if k != '-1')
  else:
    search_results = (d['title'] for d in raw_results['query']['geosearch'])

  return list(search_results)


@cache
def suggest(query):
  '''
  Get a Wikipedia search suggestion for `query`.
  Returns a string or None if no suggestion was found.
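
  Example (illustrative; the suggestion comes from Wikipedia's search backend):

    >>> import wikipedia
    >>> wikipedia.suggest('Barak Obama')  # -> u'barack obama', or None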
  '''

  search_params = {
    'list': 'search',
    'srinfo': 'suggestion',
    'srprop': '',
    'srsearch': query
  }

  raw_result = _wiki_request(search_params)

  if raw_result['query'].get('searchinfo'):
    return raw_result['query']['searchinfo']['suggestion']

  return None


def random(pages=1):
  '''
  Get a list of random Wikipedia article titles.
  Returns a single title string if `pages` is 1; otherwise a list of titles.

  .. note:: Random only gets articles from namespace 0, meaning no Category, User talk, or other meta-Wikipedia pages.

  Keyword arguments:

  * pages - the number of random pages returned (max of 10)
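
  Example (illustrative; titles are random by definition):

    >>> import wikipedia
    >>> wikipedia.random()         # -> a single title string
    >>> wikipedia.random(pages=3)  # -> a list of 3 title strings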
  '''
  # http://en.wikipedia.org/w/api.php?action=query&list=random&rnlimit=5000&format=jsonfm
  query_params = {
    'list': 'random',
    'rnnamespace': 0,
    'rnlimit': pages,
  }

  request = _wiki_request(query_params)
  titles = [page['title'] for page in request['query']['random']]

  if len(titles) == 1:
    return titles[0]

  return titles


@cache
def summary(title, sentences=0, chars=0, auto_suggest=True, redirect=True):
  '''
  Plain text summary of the page.

  .. note:: This is a convenience wrapper - auto_suggest and redirect are enabled by default

  Keyword arguments:

  * sentences - if set, return the first `sentences` sentences (can be no greater than 10).
  * chars - if set, return only the first `chars` characters (actual text returned may be slightly longer).
  * auto_suggest - let Wikipedia find a valid page title for the query
  * redirect - allow redirection without raising RedirectError
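
  Example (illustrative; the text returned changes as the article is edited):

    >>> import wikipedia
    >>> wikipedia.summary('Python (programming language)', sentences=2)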
  '''

  # use auto_suggest and redirect to get the correct article
  # also, use page's error checking to raise DisambiguationError if necessary
  page_info = page(title, auto_suggest=auto_suggest, redirect=redirect)
  title = page_info.title
  pageid = page_info.pageid

  query_params = {
    'prop': 'extracts',
    'explaintext': '',
    'titles': title
  }

  if sentences:
    query_params['exsentences'] = sentences
  elif chars:
    query_params['exchars'] = chars
  else:
    query_params['exintro'] = ''

  request = _wiki_request(query_params)
  summary = request['query']['pages'][pageid]['extract']

  return summary


def page(title=None, pageid=None, auto_suggest=True, redirect=True, preload=False):
  '''
  Get a WikipediaPage object for the page with title `title` or the pageid
  `pageid` (mutually exclusive).

  Keyword arguments:

  * title - the title of the page to load
  * pageid - the numeric pageid of the page to load
  * auto_suggest - let Wikipedia find a valid page title for the query
  * redirect - allow redirection without raising RedirectError
  * preload - load content, summary, images, references, and links during initialization
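
  Example (illustrative):

    >>> import wikipedia
    >>> ny = wikipedia.page('New York City')
    >>> ny.title, ny.url  # basic attributes are available immediately after loading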
  '''

  if title is not None:
    if auto_suggest:
      results, suggestion = search(title, results=1, suggestion=True)
      try:
        title = suggestion or results[0]
      except IndexError:
        # if there is no suggestion or search results, the page doesn't exist
        raise PageError(title)
    return WikipediaPage(title, redirect=redirect, preload=preload)
  elif pageid is not None:
    return WikipediaPage(pageid=pageid, preload=preload)
  else:
    raise ValueError("Either a title or a pageid must be specified")


class WikipediaPage(object):
  '''
  Contains data from a Wikipedia page.
  Uses property methods to filter data from the raw HTML.
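
  Example (illustrative; attribute values depend on the live article):

    >>> import wikipedia
    >>> p = wikipedia.page('Tokyo')
    >>> p.summary    # plain text intro section
    >>> p.links[:5]  # first few linked article titles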
  '''

  def __init__(self, title=None, pageid=None, redirect=True, preload=False, original_title=''):
    if title is not None:
      self.title = title
      self.original_title = original_title or title
    elif pageid is not None:
      self.pageid = pageid
    else:
      raise ValueError("Either a title or a pageid must be specified")

    self.__load(redirect=redirect, preload=preload)

    if preload:
      for prop in ('content', 'summary', 'images', 'references', 'links', 'sections'):
        getattr(self, prop)

  def __repr__(self):
    return stdout_encode(u'<WikipediaPage \'{}\'>'.format(self.title))

  def __eq__(self, other):
    try:
      return (
        self.pageid == other.pageid
        and self.title == other.title
        and self.url == other.url
      )
    except AttributeError:
      # `other` is not a comparable WikipediaPage-like object
      return False

  def __load(self, redirect=True, preload=False):
    '''
    Load basic information from Wikipedia.
    Confirm that page exists and is not a disambiguation/redirect.

    Does not need to be called manually; it is called automatically during __init__.
    '''
    query_params = {
      'prop': 'info|pageprops',
      'inprop': 'url',
      'ppprop': 'disambiguation',
      'redirects': '',
    }
    if not getattr(self, 'pageid', None):
      query_params['titles'] = self.title
    else:
      query_params['pageids'] = self.pageid

    request = _wiki_request(query_params)

    query = request['query']
    pageid = list(query['pages'].keys())[0]
    page = query['pages'][pageid]

    # missing is present if the page is missing
    if 'missing' in page:
      if hasattr(self, 'title'):
        raise PageError(self.title)
      else:
        raise PageError(pageid=self.pageid)

    # same thing for redirect, except it shows up in query instead of page for
    # whatever silly reason
    elif 'redirects' in query:
      if redirect:
        redirects = query['redirects'][0]

        if 'normalized' in query:
          normalized = query['normalized'][0]
          assert normalized['from'] == self.title, ODD_ERROR_MESSAGE

          from_title = normalized['to']

        else:
          from_title = self.title

        assert redirects['from'] == from_title, ODD_ERROR_MESSAGE

        # change the title and reload the whole object
        self.__init__(redirects['to'], redirect=redirect, preload=preload)

      else:
        raise RedirectError(getattr(self, 'title', page['title']))

    # since we only asked for disambiguation in ppprop,
    # if a pageprop is returned,
    # then the page must be a disambiguation page
    elif 'pageprops' in page:
      query_params = {
        'prop': 'revisions',
        'rvprop': 'content',
        'rvparse': '',
        'rvlimit': 1
      }
      if hasattr(self, 'pageid'):
        query_params['pageids'] = self.pageid
      else:
        query_params['titles'] = self.title
      request = _wiki_request(query_params)
      html = request['query']['pages'][pageid]['revisions'][0]['*']

      lis = BeautifulSoup(html, 'html.parser').find_all('li')
      filtered_lis = [li for li in lis if 'tocsection' not in ''.join(li.get('class', []))]
      may_refer_to = [li.a.get_text() for li in filtered_lis if li.a]

      raise DisambiguationError(getattr(self, 'title', page['title']), may_refer_to)

    else:
      self.pageid = pageid
      self.title = page['title']
      self.url = page['fullurl']

  def __continued_query(self, query_params):
    '''
    Based on https://www.mediawiki.org/wiki/API:Query#Continuing_queries
    '''
    query_params.update(self.__title_query_param)

    last_continue = {}
    prop = query_params.get('prop', None)

    while True:
      params = query_params.copy()
      params.update(last_continue)

      request = _wiki_request(params)

      if 'query' not in request:
        break

      pages = request['query']['pages']
      if 'generator' in query_params:
        for datum in pages.values():  # in python 3.3+: "yield from pages.values()"
          yield datum
      else:
        for datum in pages[self.pageid][prop]:
          yield datum

      if 'continue' not in request:
        break

      last_continue = request['continue']

  @property
  def __title_query_param(self):
    if getattr(self, 'title', None) is not None:
      return {'titles': self.title}
    else:
      return {'pageids': self.pageid}

  def html(self):
    '''
    Get full page HTML.

    .. warning:: This can get pretty slow on long pages.
    '''

    if not getattr(self, '_html', False):
      query_params = {
        'prop': 'revisions',
        'rvprop': 'content',
        'rvlimit': 1,
        'rvparse': '',
        'titles': self.title
      }

      request = _wiki_request(query_params)
      self._html = request['query']['pages'][self.pageid]['revisions'][0]['*']

    return self._html

  @property
  def content(self):
    '''
    Plain text content of the page, excluding images, tables, and other data.
    '''

    if not getattr(self, '_content', False):
      query_params = {
        'prop': 'extracts|revisions',
        'explaintext': '',
        'rvprop': 'ids'
      }
      if getattr(self, 'title', None) is not None:
        query_params['titles'] = self.title
      else:
        query_params['pageids'] = self.pageid
      request = _wiki_request(query_params)
      self._content = request['query']['pages'][self.pageid]['extract']
      self._revision_id = request['query']['pages'][self.pageid]['revisions'][0]['revid']
      self._parent_id = request['query']['pages'][self.pageid]['revisions'][0]['parentid']

    return self._content

  @property
  def revision_id(self):
    '''
    Revision ID of the page.

    The revision ID is a number that uniquely identifies the current
    version of the page. It can be used to create the permalink or for
    other direct API calls. See `Help:Page history
    <http://en.wikipedia.org/wiki/Wikipedia:Revision>`_ for more
    information.
    '''

    if not getattr(self, '_revision_id', False):
      # fetch the content (side effect is loading the revision id)
      self.content

    return self._revision_id

  @property
  def parent_id(self):
    '''
    Revision ID of the parent version of the current revision of this
    page. See ``revision_id`` for more information.
    '''

    if not getattr(self, '_parent_id', False):
      # fetch the content (side effect is loading the parent id)
      self.content

    return self._parent_id

  @property
  def summary(self):
    '''
    Plain text summary of the page.
    '''

    if not getattr(self, '_summary', False):
      query_params = {
        'prop': 'extracts',
        'explaintext': '',
        'exintro': '',
      }
      if getattr(self, 'title', None) is not None:
        query_params['titles'] = self.title
      else:
        query_params['pageids'] = self.pageid

      request = _wiki_request(query_params)
      self._summary = request['query']['pages'][self.pageid]['extract']

    return self._summary

  @property
  def images(self):
    '''
    List of URLs of images on the page.
    '''

    if not getattr(self, '_images', False):
      self._images = [
        page['imageinfo'][0]['url']
        for page in self.__continued_query({
          'generator': 'images',
          'gimlimit': 'max',
          'prop': 'imageinfo',
          'iiprop': 'url',
        })
        if 'imageinfo' in page
      ]

    return self._images

  @property
  def coordinates(self):
    '''
    Tuple of Decimals in the form of (lat, lon) or None
    '''
    if not getattr(self, '_coordinates', False):
      query_params = {
        'prop': 'coordinates',
        'colimit': 'max',
        'titles': self.title,
      }

      request = _wiki_request(query_params)

      if 'query' in request and 'coordinates' in request['query']['pages'][self.pageid]:
        coordinates = request['query']['pages'][self.pageid]['coordinates']
        self._coordinates = (Decimal(coordinates[0]['lat']), Decimal(coordinates[0]['lon']))
      else:
        self._coordinates = None

    return self._coordinates

  @property
  def references(self):
    '''
    List of URLs of external links on a page.
    May include external links within the page that aren't technically cited anywhere.
    '''

    if not getattr(self, '_references', False):
      def add_protocol(url):
        # external links may be protocol-relative (e.g. //example.com)
        return url if url.startswith('http') else 'http:' + url

      self._references = [
        add_protocol(link['*'])
        for link in self.__continued_query({
          'prop': 'extlinks',
          'ellimit': 'max'
        })
      ]

    return self._references

  @property
  def links(self):
    '''
    List of titles of Wikipedia page links on a page.

    .. note:: Only includes articles from namespace 0, meaning no Category, User talk, or other meta-Wikipedia pages.
    '''

    if not getattr(self, '_links', False):
      self._links = [
        link['title']
        for link in self.__continued_query({
          'prop': 'links',
          'plnamespace': 0,
          'pllimit': 'max'
        })
      ]

    return self._links

  @property
  def categories(self):
    '''
    List of categories of a page.
    '''

    if not getattr(self, '_categories', False):
      self._categories = [
        re.sub(r'^Category:', '', link['title'])
        for link in self.__continued_query({
          'prop': 'categories',
          'cllimit': 'max'
        })
      ]

    return self._categories

  @property
  def sections(self):
    '''
    List of section titles from the table of contents on the page.
    '''

    if not getattr(self, '_sections', False):
      query_params = {
        'action': 'parse',
        'prop': 'sections',
      }
      query_params.update(self.__title_query_param)

      request = _wiki_request(query_params)
      self._sections = [section['line'] for section in request['parse']['sections']]

    return self._sections

  def section(self, section_title):
    '''
    Get the plain text content of a section from `self.sections`.
    Returns None if `section_title` isn't found, otherwise returns a whitespace stripped string.

    This is a convenience method that wraps self.content.

    .. warning:: Calling `section` on a section that has subheadings will NOT return
           the full text of all of the subsections. It only gets the text between
           `section_title` and the next subheading, which is often empty.
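
    Example (illustrative; assumes the article currently has a top-level 'History' section):

      >>> import wikipedia
      >>> p = wikipedia.page('Python (programming language)')
      >>> p.section('History')  # -> plain text of that section, or None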
    '''

    section = u"== {} ==".format(section_title)
    try:
      index = self.content.index(section) + len(section)
    except ValueError:
      return None

    try:
      next_index = self.content.index("==", index)
    except ValueError:
      next_index = len(self.content)

    return self.content[index:next_index].lstrip("=").strip()


@cache
def languages():
  '''
  List all the currently supported language prefixes (usually ISO language code).

  Can be passed to `set_lang` to change the Mediawiki that `wikipedia` requests
  results from.

  Returns: dict of <prefix>: <local_lang_name> pairs. To get just a list of prefixes,
  use `wikipedia.languages().keys()`.
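
  Example (illustrative):

    >>> import wikipedia
    >>> langs = wikipedia.languages()
    >>> langs['en']  # -> u'English'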
  '''
  response = _wiki_request({
    'meta': 'siteinfo',
    'siprop': 'languages'
  })

  languages = response['query']['languages']

  return {
    lang['code']: lang['*']
    for lang in languages
  }


def donate():
  '''
  Open up the Wikimedia donate page in your favorite browser.
  '''
  import webbrowser

  webbrowser.open('https://donate.wikimedia.org/w/index.php?title=Special:FundraiserLandingPage', new=2)


def _wiki_request(params):
  '''
  Make a request to the Wikipedia API using the given search parameters.
  Returns a parsed dict of the JSON response.
  '''
  global RATE_LIMIT_LAST_CALL
  global USER_AGENT

  params['format'] = 'json'
  if 'action' not in params:
    params['action'] = 'query'

  headers = {
    'User-Agent': USER_AGENT
  }

  if RATE_LIMIT and RATE_LIMIT_LAST_CALL and \
    RATE_LIMIT_LAST_CALL + RATE_LIMIT_MIN_WAIT > datetime.now():

    # it hasn't been long enough since the last API call
    # so wait until we're in the clear to make the request

    wait_time = (RATE_LIMIT_LAST_CALL + RATE_LIMIT_MIN_WAIT) - datetime.now()
    # sleep for the full fractional interval; int() would truncate
    # sub-second waits (e.g. the default 50 ms) down to zero
    time.sleep(wait_time.total_seconds())

  r = requests.get(API_URL, params=params, headers=headers)

  if RATE_LIMIT:
    RATE_LIMIT_LAST_CALL = datetime.now()

  return r.json()