1from __future__ import unicode_literals 2 3import requests 4import time 5from bs4 import BeautifulSoup 6from datetime import datetime, timedelta 7from decimal import Decimal 8 9from .exceptions import ( 10 PageError, DisambiguationError, RedirectError, HTTPTimeoutError, 11 WikipediaException, ODD_ERROR_MESSAGE) 12from .util import cache, stdout_encode, debug 13import re 14 15API_URL = 'http://en.wikipedia.org/w/api.php' 16RATE_LIMIT = False 17RATE_LIMIT_MIN_WAIT = None 18RATE_LIMIT_LAST_CALL = None 19USER_AGENT = 'wikipedia (https://github.com/goldsmith/Wikipedia/)' 20 21 22def set_lang(prefix): 23 ''' 24 Change the language of the API being requested. 25 Set `prefix` to one of the two letter prefixes found on the `list of all Wikipedias <http://meta.wikimedia.org/wiki/List_of_Wikipedias>`_. 26 27 After setting the language, the cache for ``search``, ``suggest``, and ``summary`` will be cleared. 28 29 .. note:: Make sure you search for page titles in the language that you have set. 30 ''' 31 global API_URL 32 API_URL = 'http://' + prefix.lower() + '.wikipedia.org/w/api.php' 33 34 for cached_func in (search, suggest, summary): 35 cached_func.clear_cache() 36 37 38def set_user_agent(user_agent_string): 39 ''' 40 Set the User-Agent string to be used for all requests. 41 42 Arguments: 43 44 * user_agent_string - (string) a string specifying the User-Agent header 45 ''' 46 global USER_AGENT 47 USER_AGENT = user_agent_string 48 49 50def set_rate_limiting(rate_limit, min_wait=timedelta(milliseconds=50)): 51 ''' 52 Enable or disable rate limiting on requests to the Mediawiki servers. 53 If rate limiting is not enabled, under some circumstances (depending on 54 load on Wikipedia, the number of requests you and other `wikipedia` users 55 are making, and other factors), Wikipedia may return an HTTP timeout error. 56 57 Enabling rate limiting generally prevents that issue, but please note that 58 HTTPTimeoutError still might be raised. 59 60 Arguments: 61 62 * rate_limit - (Boolean) whether to enable rate limiting or not 63 64 Keyword arguments: 65 66 * min_wait - if rate limiting is enabled, `min_wait` is a timedelta describing the minimum time to wait before requests. 67 Defaults to timedelta(milliseconds=50) 68 ''' 69 global RATE_LIMIT 70 global RATE_LIMIT_MIN_WAIT 71 global RATE_LIMIT_LAST_CALL 72 73 RATE_LIMIT = rate_limit 74 if not rate_limit: 75 RATE_LIMIT_MIN_WAIT = None 76 else: 77 RATE_LIMIT_MIN_WAIT = min_wait 78 79 RATE_LIMIT_LAST_CALL = None 80 81 82@cache 83def search(query, results=10, suggestion=False): 84 ''' 85 Do a Wikipedia search for `query`. 86 87 Keyword arguments: 88 89 * results - the maxmimum number of results returned 90 * suggestion - if True, return results and suggestion (if any) in a tuple 91 ''' 92 93 search_params = { 94 'list': 'search', 95 'srprop': '', 96 'srlimit': results, 97 'limit': results, 98 'srsearch': query 99 } 100 if suggestion: 101 search_params['srinfo'] = 'suggestion' 102 103 raw_results = _wiki_request(search_params) 104 105 if 'error' in raw_results: 106 if raw_results['error']['info'] in ('HTTP request timed out.', 'Pool queue is full'): 107 raise HTTPTimeoutError(query) 108 else: 109 raise WikipediaException(raw_results['error']['info']) 110 111 search_results = (d['title'] for d in raw_results['query']['search']) 112 113 if suggestion: 114 if raw_results['query'].get('searchinfo'): 115 return list(search_results), raw_results['query']['searchinfo']['suggestion'] 116 else: 117 return list(search_results), None 118 119 return list(search_results) 120 121 122@cache 123def geosearch(latitude, longitude, title=None, results=10, radius=1000): 124 ''' 125 Do a wikipedia geo search for `latitude` and `longitude` 126 using HTTP API described in http://www.mediawiki.org/wiki/Extension:GeoData 127 128 Arguments: 129 130 * latitude (float or decimal.Decimal) 131 * longitude (float or decimal.Decimal) 132 133 Keyword arguments: 134 135 * title - The title of an article to search for 136 * results - the maximum number of results returned 137 * radius - Search radius in meters. The value must be between 10 and 10000 138 ''' 139 140 search_params = { 141 'list': 'geosearch', 142 'gsradius': radius, 143 'gscoord': '{0}|{1}'.format(latitude, longitude), 144 'gslimit': results 145 } 146 if title: 147 search_params['titles'] = title 148 149 raw_results = _wiki_request(search_params) 150 151 if 'error' in raw_results: 152 if raw_results['error']['info'] in ('HTTP request timed out.', 'Pool queue is full'): 153 raise HTTPTimeoutError('{0}|{1}'.format(latitude, longitude)) 154 else: 155 raise WikipediaException(raw_results['error']['info']) 156 157 search_pages = raw_results['query'].get('pages', None) 158 if search_pages: 159 search_results = (v['title'] for k, v in search_pages.items() if k != '-1') 160 else: 161 search_results = (d['title'] for d in raw_results['query']['geosearch']) 162 163 return list(search_results) 164 165 166@cache 167def suggest(query): 168 ''' 169 Get a Wikipedia search suggestion for `query`. 170 Returns a string or None if no suggestion was found. 171 ''' 172 173 search_params = { 174 'list': 'search', 175 'srinfo': 'suggestion', 176 'srprop': '', 177 } 178 search_params['srsearch'] = query 179 180 raw_result = _wiki_request(search_params) 181 182 if raw_result['query'].get('searchinfo'): 183 return raw_result['query']['searchinfo']['suggestion'] 184 185 return None 186 187 188def random(pages=1): 189 ''' 190 Get a list of random Wikipedia article titles. 191 192 .. note:: Random only gets articles from namespace 0, meaning no Category, User talk, or other meta-Wikipedia pages. 193 194 Keyword arguments: 195 196 * pages - the number of random pages returned (max of 10) 197 ''' 198 #http://en.wikipedia.org/w/api.php?action=query&list=random&rnlimit=5000&format=jsonfm 199 query_params = { 200 'list': 'random', 201 'rnnamespace': 0, 202 'rnlimit': pages, 203 } 204 205 request = _wiki_request(query_params) 206 titles = [page['title'] for page in request['query']['random']] 207 208 if len(titles) == 1: 209 return titles[0] 210 211 return titles 212 213 214@cache 215def summary(title, sentences=0, chars=0, auto_suggest=True, redirect=True): 216 ''' 217 Plain text summary of the page. 218 219 .. note:: This is a convenience wrapper - auto_suggest and redirect are enabled by default 220 221 Keyword arguments: 222 223 * sentences - if set, return the first `sentences` sentences (can be no greater than 10). 224 * chars - if set, return only the first `chars` characters (actual text returned may be slightly longer). 225 * auto_suggest - let Wikipedia find a valid page title for the query 226 * redirect - allow redirection without raising RedirectError 227 ''' 228 229 # use auto_suggest and redirect to get the correct article 230 # also, use page's error checking to raise DisambiguationError if necessary 231 page_info = page(title, auto_suggest=auto_suggest, redirect=redirect) 232 title = page_info.title 233 pageid = page_info.pageid 234 235 query_params = { 236 'prop': 'extracts', 237 'explaintext': '', 238 'titles': title 239 } 240 241 if sentences: 242 query_params['exsentences'] = sentences 243 elif chars: 244 query_params['exchars'] = chars 245 else: 246 query_params['exintro'] = '' 247 248 request = _wiki_request(query_params) 249 summary = request['query']['pages'][pageid]['extract'] 250 251 return summary 252 253 254def page(title=None, pageid=None, auto_suggest=True, redirect=True, preload=False): 255 ''' 256 Get a WikipediaPage object for the page with title `title` or the pageid 257 `pageid` (mutually exclusive). 258 259 Keyword arguments: 260 261 * title - the title of the page to load 262 * pageid - the numeric pageid of the page to load 263 * auto_suggest - let Wikipedia find a valid page title for the query 264 * redirect - allow redirection without raising RedirectError 265 * preload - load content, summary, images, references, and links during initialization 266 ''' 267 268 if title is not None: 269 if auto_suggest: 270 results, suggestion = search(title, results=1, suggestion=True) 271 try: 272 title = suggestion or results[0] 273 except IndexError: 274 # if there is no suggestion or search results, the page doesn't exist 275 raise PageError(title) 276 return WikipediaPage(title, redirect=redirect, preload=preload) 277 elif pageid is not None: 278 return WikipediaPage(pageid=pageid, preload=preload) 279 else: 280 raise ValueError("Either a title or a pageid must be specified") 281 282 283 284class WikipediaPage(object): 285 ''' 286 Contains data from a Wikipedia page. 287 Uses property methods to filter data from the raw HTML. 288 ''' 289 290 def __init__(self, title=None, pageid=None, redirect=True, preload=False, original_title=''): 291 if title is not None: 292 self.title = title 293 self.original_title = original_title or title 294 elif pageid is not None: 295 self.pageid = pageid 296 else: 297 raise ValueError("Either a title or a pageid must be specified") 298 299 self.__load(redirect=redirect, preload=preload) 300 301 if preload: 302 for prop in ('content', 'summary', 'images', 'references', 'links', 'sections'): 303 getattr(self, prop) 304 305 def __repr__(self): 306 return stdout_encode(u'<WikipediaPage \'{}\'>'.format(self.title)) 307 308 def __eq__(self, other): 309 try: 310 return ( 311 self.pageid == other.pageid 312 and self.title == other.title 313 and self.url == other.url 314 ) 315 except: 316 return False 317 318 def __load(self, redirect=True, preload=False): 319 ''' 320 Load basic information from Wikipedia. 321 Confirm that page exists and is not a disambiguation/redirect. 322 323 Does not need to be called manually, should be called automatically during __init__. 324 ''' 325 query_params = { 326 'prop': 'info|pageprops', 327 'inprop': 'url', 328 'ppprop': 'disambiguation', 329 'redirects': '', 330 } 331 if not getattr(self, 'pageid', None): 332 query_params['titles'] = self.title 333 else: 334 query_params['pageids'] = self.pageid 335 336 request = _wiki_request(query_params) 337 338 query = request['query'] 339 pageid = list(query['pages'].keys())[0] 340 page = query['pages'][pageid] 341 342 # missing is present if the page is missing 343 if 'missing' in page: 344 if hasattr(self, 'title'): 345 raise PageError(self.title) 346 else: 347 raise PageError(pageid=self.pageid) 348 349 # same thing for redirect, except it shows up in query instead of page for 350 # whatever silly reason 351 elif 'redirects' in query: 352 if redirect: 353 redirects = query['redirects'][0] 354 355 if 'normalized' in query: 356 normalized = query['normalized'][0] 357 assert normalized['from'] == self.title, ODD_ERROR_MESSAGE 358 359 from_title = normalized['to'] 360 361 else: 362 from_title = self.title 363 364 assert redirects['from'] == from_title, ODD_ERROR_MESSAGE 365 366 # change the title and reload the whole object 367 self.__init__(redirects['to'], redirect=redirect, preload=preload) 368 369 else: 370 raise RedirectError(getattr(self, 'title', page['title'])) 371 372 # since we only asked for disambiguation in ppprop, 373 # if a pageprop is returned, 374 # then the page must be a disambiguation page 375 elif 'pageprops' in page: 376 query_params = { 377 'prop': 'revisions', 378 'rvprop': 'content', 379 'rvparse': '', 380 'rvlimit': 1 381 } 382 if hasattr(self, 'pageid'): 383 query_params['pageids'] = self.pageid 384 else: 385 query_params['titles'] = self.title 386 request = _wiki_request(query_params) 387 html = request['query']['pages'][pageid]['revisions'][0]['*'] 388 389 lis = BeautifulSoup(html).find_all('li') 390 filtered_lis = [li for li in lis if not 'tocsection' in ''.join(li.get('class', []))] 391 may_refer_to = [li.a.get_text() for li in filtered_lis if li.a] 392 393 raise DisambiguationError(getattr(self, 'title', page['title']), may_refer_to) 394 395 else: 396 self.pageid = pageid 397 self.title = page['title'] 398 self.url = page['fullurl'] 399 400 def __continued_query(self, query_params): 401 ''' 402 Based on https://www.mediawiki.org/wiki/API:Query#Continuing_queries 403 ''' 404 query_params.update(self.__title_query_param) 405 406 last_continue = {} 407 prop = query_params.get('prop', None) 408 409 while True: 410 params = query_params.copy() 411 params.update(last_continue) 412 413 request = _wiki_request(params) 414 415 if 'query' not in request: 416 break 417 418 pages = request['query']['pages'] 419 if 'generator' in query_params: 420 for datum in pages.values(): # in python 3.3+: "yield from pages.values()" 421 yield datum 422 else: 423 for datum in pages[self.pageid][prop]: 424 yield datum 425 426 if 'continue' not in request: 427 break 428 429 last_continue = request['continue'] 430 431 @property 432 def __title_query_param(self): 433 if getattr(self, 'title', None) is not None: 434 return {'titles': self.title} 435 else: 436 return {'pageids': self.pageid} 437 438 def html(self): 439 ''' 440 Get full page HTML. 441 442 .. warning:: This can get pretty slow on long pages. 443 ''' 444 445 if not getattr(self, '_html', False): 446 query_params = { 447 'prop': 'revisions', 448 'rvprop': 'content', 449 'rvlimit': 1, 450 'rvparse': '', 451 'titles': self.title 452 } 453 454 request = _wiki_request(query_params) 455 self._html = request['query']['pages'][self.pageid]['revisions'][0]['*'] 456 457 return self._html 458 459 @property 460 def content(self): 461 ''' 462 Plain text content of the page, excluding images, tables, and other data. 463 ''' 464 465 if not getattr(self, '_content', False): 466 query_params = { 467 'prop': 'extracts|revisions', 468 'explaintext': '', 469 'rvprop': 'ids' 470 } 471 if not getattr(self, 'title', None) is None: 472 query_params['titles'] = self.title 473 else: 474 query_params['pageids'] = self.pageid 475 request = _wiki_request(query_params) 476 self._content = request['query']['pages'][self.pageid]['extract'] 477 self._revision_id = request['query']['pages'][self.pageid]['revisions'][0]['revid'] 478 self._parent_id = request['query']['pages'][self.pageid]['revisions'][0]['parentid'] 479 480 return self._content 481 482 @property 483 def revision_id(self): 484 ''' 485 Revision ID of the page. 486 487 The revision ID is a number that uniquely identifies the current 488 version of the page. It can be used to create the permalink or for 489 other direct API calls. See `Help:Page history 490 <http://en.wikipedia.org/wiki/Wikipedia:Revision>`_ for more 491 information. 492 ''' 493 494 if not getattr(self, '_revid', False): 495 # fetch the content (side effect is loading the revid) 496 self.content 497 498 return self._revision_id 499 500 @property 501 def parent_id(self): 502 ''' 503 Revision ID of the parent version of the current revision of this 504 page. See ``revision_id`` for more information. 505 ''' 506 507 if not getattr(self, '_parentid', False): 508 # fetch the content (side effect is loading the revid) 509 self.content 510 511 return self._parent_id 512 513 @property 514 def summary(self): 515 ''' 516 Plain text summary of the page. 517 ''' 518 519 if not getattr(self, '_summary', False): 520 query_params = { 521 'prop': 'extracts', 522 'explaintext': '', 523 'exintro': '', 524 } 525 if not getattr(self, 'title', None) is None: 526 query_params['titles'] = self.title 527 else: 528 query_params['pageids'] = self.pageid 529 530 request = _wiki_request(query_params) 531 self._summary = request['query']['pages'][self.pageid]['extract'] 532 533 return self._summary 534 535 @property 536 def images(self): 537 ''' 538 List of URLs of images on the page. 539 ''' 540 541 if not getattr(self, '_images', False): 542 self._images = [ 543 page['imageinfo'][0]['url'] 544 for page in self.__continued_query({ 545 'generator': 'images', 546 'gimlimit': 'max', 547 'prop': 'imageinfo', 548 'iiprop': 'url', 549 }) 550 if 'imageinfo' in page 551 ] 552 553 return self._images 554 555 @property 556 def coordinates(self): 557 ''' 558 Tuple of Decimals in the form of (lat, lon) or None 559 ''' 560 if not getattr(self, '_coordinates', False): 561 query_params = { 562 'prop': 'coordinates', 563 'colimit': 'max', 564 'titles': self.title, 565 } 566 567 request = _wiki_request(query_params) 568 569 if 'query' in request: 570 coordinates = request['query']['pages'][self.pageid]['coordinates'] 571 self._coordinates = (Decimal(coordinates[0]['lat']), Decimal(coordinates[0]['lon'])) 572 else: 573 self._coordinates = None 574 575 return self._coordinates 576 577 @property 578 def references(self): 579 ''' 580 List of URLs of external links on a page. 581 May include external links within page that aren't technically cited anywhere. 582 ''' 583 584 if not getattr(self, '_references', False): 585 def add_protocol(url): 586 return url if url.startswith('http') else 'http:' + url 587 588 self._references = [ 589 add_protocol(link['*']) 590 for link in self.__continued_query({ 591 'prop': 'extlinks', 592 'ellimit': 'max' 593 }) 594 ] 595 596 return self._references 597 598 @property 599 def links(self): 600 ''' 601 List of titles of Wikipedia page links on a page. 602 603 .. note:: Only includes articles from namespace 0, meaning no Category, User talk, or other meta-Wikipedia pages. 604 ''' 605 606 if not getattr(self, '_links', False): 607 self._links = [ 608 link['title'] 609 for link in self.__continued_query({ 610 'prop': 'links', 611 'plnamespace': 0, 612 'pllimit': 'max' 613 }) 614 ] 615 616 return self._links 617 618 @property 619 def categories(self): 620 ''' 621 List of categories of a page. 622 ''' 623 624 if not getattr(self, '_categories', False): 625 self._categories = [re.sub(r'^Category:', '', x) for x in 626 [link['title'] 627 for link in self.__continued_query({ 628 'prop': 'categories', 629 'cllimit': 'max' 630 }) 631 ]] 632 633 return self._categories 634 635 @property 636 def sections(self): 637 ''' 638 List of section titles from the table of contents on the page. 639 ''' 640 641 if not getattr(self, '_sections', False): 642 query_params = { 643 'action': 'parse', 644 'prop': 'sections', 645 } 646 query_params.update(self.__title_query_param) 647 648 request = _wiki_request(query_params) 649 self._sections = [section['line'] for section in request['parse']['sections']] 650 651 return self._sections 652 653 def section(self, section_title): 654 ''' 655 Get the plain text content of a section from `self.sections`. 656 Returns None if `section_title` isn't found, otherwise returns a whitespace stripped string. 657 658 This is a convenience method that wraps self.content. 659 660 .. warning:: Calling `section` on a section that has subheadings will NOT return 661 the full text of all of the subsections. It only gets the text between 662 `section_title` and the next subheading, which is often empty. 663 ''' 664 665 section = u"== {} ==".format(section_title) 666 try: 667 index = self.content.index(section) + len(section) 668 except ValueError: 669 return None 670 671 try: 672 next_index = self.content.index("==", index) 673 except ValueError: 674 next_index = len(self.content) 675 676 return self.content[index:next_index].lstrip("=").strip() 677 678 679@cache 680def languages(): 681 ''' 682 List all the currently supported language prefixes (usually ISO language code). 683 684 Can be inputted to `set_lang` to change the Mediawiki that `wikipedia` requests 685 results from. 686 687 Returns: dict of <prefix>: <local_lang_name> pairs. To get just a list of prefixes, 688 use `wikipedia.languages().keys()`. 689 ''' 690 response = _wiki_request({ 691 'meta': 'siteinfo', 692 'siprop': 'languages' 693 }) 694 695 languages = response['query']['languages'] 696 697 return { 698 lang['code']: lang['*'] 699 for lang in languages 700 } 701 702 703def donate(): 704 ''' 705 Open up the Wikimedia donate page in your favorite browser. 706 ''' 707 import webbrowser 708 709 webbrowser.open('https://donate.wikimedia.org/w/index.php?title=Special:FundraiserLandingPage', new=2) 710 711 712def _wiki_request(params): 713 ''' 714 Make a request to the Wikipedia API using the given search parameters. 715 Returns a parsed dict of the JSON response. 716 ''' 717 global RATE_LIMIT_LAST_CALL 718 global USER_AGENT 719 720 params['format'] = 'json' 721 if not 'action' in params: 722 params['action'] = 'query' 723 724 headers = { 725 'User-Agent': USER_AGENT 726 } 727 728 if RATE_LIMIT and RATE_LIMIT_LAST_CALL and \ 729 RATE_LIMIT_LAST_CALL + RATE_LIMIT_MIN_WAIT > datetime.now(): 730 731 # it hasn't been long enough since the last API call 732 # so wait until we're in the clear to make the request 733 734 wait_time = (RATE_LIMIT_LAST_CALL + RATE_LIMIT_MIN_WAIT) - datetime.now() 735 time.sleep(int(wait_time.total_seconds())) 736 737 r = requests.get(API_URL, params=params, headers=headers) 738 739 if RATE_LIMIT: 740 RATE_LIMIT_LAST_CALL = datetime.now() 741 742 return r.json() 743