"""
Tests for ``pandas.read_html``: parsing HTML tables with both supported
flavors ("bs4" = BeautifulSoup4 + html5lib, and "lxml").

NOTE(review): tests marked ``@tm.network`` hit live URLs and may be flaky;
the ``datapath`` fixture resolves files under ``pandas/tests/io/data``.
"""
from functools import partial
from importlib import reload
from io import BytesIO, StringIO
import os
from pathlib import Path
import re
import threading
from urllib.error import URLError

import numpy as np
import pytest

from pandas.compat import is_platform_windows
from pandas.errors import ParserError
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    read_csv,
    to_datetime,
)
import pandas._testing as tm

from pandas.io.common import file_path_to_url
import pandas.io.html
from pandas.io.html import read_html

# Directory containing this test file.
HERE = os.path.dirname(__file__)


@pytest.fixture(
    params=[
        "chinese_utf-16.html",
        "chinese_utf-32.html",
        "chinese_utf-8.html",
        "letz_latin1.html",
    ]
)
def html_encoding_file(request, datapath):
    """Parametrized fixture for HTML encoding test filenames."""
    return datapath("io", "data", "html_encoding", request.param)


def assert_framelist_equal(list1, list2, *args, **kwargs):
    """
    Assert that two lists of DataFrames are element-wise equal.

    Extra ``*args``/``**kwargs`` are forwarded to ``tm.assert_frame_equal``.
    Also asserts that no compared frame is empty.
    """
    assert len(list1) == len(list2), (
        "lists are not of equal size "
        f"len(list1) == {len(list1)}, "
        f"len(list2) == {len(list2)}"
    )
    msg = "not all list elements are DataFrames"
    both_frames = all(
        map(
            lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame),
            list1,
            list2,
        )
    )
    assert both_frames, msg
    for frame_i, frame_j in zip(list1, list2):
        tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs)
        assert not frame_i.empty, "frames are both empty"


@td.skip_if_no("bs4")
@td.skip_if_no("html5lib")
def test_bs4_version_fails(monkeypatch, datapath):
    # read_html should refuse to use a too-old BeautifulSoup version.
    import bs4

    monkeypatch.setattr(bs4, "__version__", "4.2")
    with pytest.raises(ImportError, match="Pandas requires version"):
        read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4")


def test_invalid_flavor():
    # An unrecognized flavor should raise before any network/file access.
    url = "google.com"
    flavor = "invalid flavor"
    msg = r"\{" + flavor + r"\} is not a valid set of flavors"

    with pytest.raises(ValueError, match=msg):
        read_html(url, match="google", flavor=flavor)


@td.skip_if_no("bs4")
@td.skip_if_no("lxml")
@td.skip_if_no("html5lib")
def test_same_ordering(datapath):
    # Both flavors should return tables in the same order.
    filename = datapath("io", "data", "html", "valid_markup.html")
    dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
    dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
    assert_framelist_equal(dfs_lxml, dfs_bs4)


@pytest.mark.parametrize(
    "flavor",
    [
        pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]),
        pytest.param("lxml", marks=td.skip_if_no("lxml")),
    ],
    scope="class",
)
class TestReadHtml:
    @pytest.fixture(autouse=True)
    def set_files(self, datapath):
        """Resolve the data files shared by most tests in this class."""
        self.spam_data = datapath("io", "data", "html", "spam.html")
        self.spam_data_kwargs = {}
        self.spam_data_kwargs["encoding"] = "UTF-8"
        self.banklist_data = datapath("io", "data", "html", "banklist.html")

    @pytest.fixture(autouse=True, scope="function")
    def set_defaults(self, flavor, request):
        """Bind ``self.read_html`` to ``read_html`` with the current flavor."""
        self.read_html = partial(read_html, flavor=flavor)
        yield

    def test_to_html_compat(self):
        # Round-trip: DataFrame.to_html output must be readable by read_html.
        df = (
            tm.makeCustomDataframe(
                4,
                3,
                data_gen_f=lambda *args: np.random.rand(),
                c_idx_names=False,
                r_idx_names=False,
            )
            .applymap("{:.3f}".format)
            .astype(float)
        )
        out = df.to_html()
        res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
        tm.assert_frame_equal(res, df)

    @pytest.mark.xfail(reason="Html file was removed")
    @tm.network
    def test_banklist_url_positional_match(self):
        url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
        # Passing match argument as positional should cause a FutureWarning.
        with tm.assert_produces_warning(FutureWarning):
            df1 = self.read_html(
                url, "First Federal Bank of Florida", attrs={"id": "table"}
            )
        with tm.assert_produces_warning(FutureWarning):
            df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})

        assert_framelist_equal(df1, df2)

    @pytest.mark.xfail(reason="Html file was removed")
    @tm.network
    def test_banklist_url(self):
        url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
        df1 = self.read_html(
            url, match="First Federal Bank of Florida", attrs={"id": "table"}
        )
        df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"})

        assert_framelist_equal(df1, df2)

    @tm.network
    def test_spam_url(self):
        url = (
            "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
            "pandas/tests/io/data/html/spam.html"
        )
        df1 = self.read_html(url, match=".*Water.*")
        df2 = self.read_html(url, match="Unit")

        assert_framelist_equal(df1, df2)

    @pytest.mark.slow
    def test_banklist(self):
        df1 = self.read_html(
            self.banklist_data, match=".*Florida.*", attrs={"id": "table"}
        )
        df2 = self.read_html(
            self.banklist_data, match="Metcalf Bank", attrs={"id": "table"}
        )

        assert_framelist_equal(df1, df2)

    def test_spam(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*")
        df2 = self.read_html(self.spam_data, match="Unit")
        assert_framelist_equal(df1, df2)

        assert df1[0].iloc[0, 0] == "Proximates"
        assert df1[0].columns[0] == "Nutrient"

    def test_spam_no_match(self):
        # With no match argument, every table in the document is returned.
        dfs = self.read_html(self.spam_data)
        for df in dfs:
            assert isinstance(df, DataFrame)

    def test_banklist_no_match(self):
        dfs = self.read_html(self.banklist_data, attrs={"id": "table"})
        for df in dfs:
            assert isinstance(df, DataFrame)

    def test_spam_header(self):
        df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0]
        assert df.columns[0] == "Proximates"
        assert not df.empty

    def test_skiprows_int(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)

        assert_framelist_equal(df1, df2)

    def test_skiprows_range(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2))
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2))

        assert_framelist_equal(df1, df2)

    def test_skiprows_list(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2])
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1])

        assert_framelist_equal(df1, df2)

    def test_skiprows_set(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2})
        df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1})

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice_short(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2))
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2))

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice_long(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5))
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1))

        assert_framelist_equal(df1, df2)

    def test_skiprows_ndarray(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2))
        df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2))

        assert_framelist_equal(df1, df2)

    def test_skiprows_invalid(self):
        # A string is not an acceptable skiprows specification.
        with pytest.raises(TypeError, match=("is not a valid type for skipping rows")):
            self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf")

    def test_index(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
        df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
        assert_framelist_equal(df1, df2)

    def test_header_and_index_no_types(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
        df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
        assert_framelist_equal(df1, df2)

    def test_header_and_index_with_types(self):
        df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
        df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
        assert_framelist_equal(df1, df2)

    def test_infer_types(self):

        # 10892 infer_types removed
        df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
        df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
        assert_framelist_equal(df1, df2)

    def test_string_io(self):
        # read_html should accept in-memory text buffers.
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            data1 = StringIO(f.read())

        with open(self.spam_data, **self.spam_data_kwargs) as f:
            data2 = StringIO(f.read())

        df1 = self.read_html(data1, match=".*Water.*")
        df2 = self.read_html(data2, match="Unit")
        assert_framelist_equal(df1, df2)

    def test_string(self):
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            data = f.read()

        df1 = self.read_html(data, match=".*Water.*")
        df2 = self.read_html(data, match="Unit")

        assert_framelist_equal(df1, df2)

    def test_file_like(self):
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            df1 = self.read_html(f, match=".*Water.*")

        with open(self.spam_data, **self.spam_data_kwargs) as f:
            df2 = self.read_html(f, match="Unit")

        assert_framelist_equal(df1, df2)

    @tm.network
    def test_bad_url_protocol(self):
        with pytest.raises(URLError):
            self.read_html("git://github.com", match=".*Water.*")

    @tm.network
    @pytest.mark.slow
    def test_invalid_url(self):
        # DNS failures may surface as URLError or (via parser fallback) as
        # a "No tables found" ValueError depending on environment.
        try:
            with pytest.raises(URLError):
                self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*")
        except ValueError as e:
            assert "No tables found" in str(e)

    @pytest.mark.slow
    def test_file_url(self):
        url = self.banklist_data
        dfs = self.read_html(
            file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"}
        )
        assert isinstance(dfs, list)
        for df in dfs:
            assert isinstance(df, DataFrame)

    @pytest.mark.slow
    def test_invalid_table_attrs(self):
        url = self.banklist_data
        with pytest.raises(ValueError, match="No tables found"):
            self.read_html(
                url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"}
            )

    def _bank_data(self, *args, **kwargs):
        # Helper: read the banklist fixture table, forwarding extra options.
        return self.read_html(
            self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs
        )

    @pytest.mark.slow
    def test_multiindex_header(self):
        df = self._bank_data(header=[0, 1])[0]
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_index(self):
        df = self._bank_data(index_col=[0, 1])[0]
        assert isinstance(df.index, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_index(self):
        df = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
        assert isinstance(df.columns, MultiIndex)
        assert isinstance(df.index, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_skiprows_tuples(self):
        df = self._bank_data(header=[0, 1], skiprows=1)[0]
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_skiprows(self):
        df = self._bank_data(header=[0, 1], skiprows=1)[0]
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_index_skiprows(self):
        df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0]
        assert isinstance(df.index, MultiIndex)
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_regex_idempotency(self):
        # match accepts an already-compiled pattern (even doubly compiled).
        url = self.banklist_data
        dfs = self.read_html(
            file_path_to_url(os.path.abspath(url)),
            match=re.compile(re.compile("Florida")),
            attrs={"id": "table"},
        )
        assert isinstance(dfs, list)
        for df in dfs:
            assert isinstance(df, DataFrame)

    def test_negative_skiprows(self):
        msg = r"\(you passed a negative value\)"
        with pytest.raises(ValueError, match=msg):
            self.read_html(self.spam_data, match="Water", skiprows=-1)

    @tm.network
    def test_multiple_matches(self):
        url = "https://docs.python.org/2/"
        dfs = self.read_html(url, match="Python")
        assert len(dfs) > 1

    @tm.network
    def test_python_docs_table(self):
        url = "https://docs.python.org/2/"
        dfs = self.read_html(url, match="Python")
        zz = [df.iloc[0, 0][0:4] for df in dfs]
        assert sorted(zz) == sorted(["Repo", "What"])

    def test_empty_tables(self):
        """
        Make sure that read_html ignores empty tables.
        """
        html = """
            <table>
                <thead>
                    <tr>
                        <th>A</th>
                        <th>B</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>1</td>
                        <td>2</td>
                    </tr>
                </tbody>
            </table>
            <table>
                <tbody>
                </tbody>
            </table>
        """
        result = self.read_html(html)
        assert len(result) == 1

    def test_multiple_tbody(self):
        # GH-20690
        # Read all tbody tags within a single table.
        result = self.read_html(
            """<table>
            <thead>
                <tr>
                    <th>A</th>
                    <th>B</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>1</td>
                    <td>2</td>
                </tr>
            </tbody>
            <tbody>
                <tr>
                    <td>3</td>
                    <td>4</td>
                </tr>
            </tbody>
        </table>"""
        )[0]

        expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"])

        tm.assert_frame_equal(result, expected)

    def test_header_and_one_column(self):
        """
        Don't fail with bs4 when there is a header and only one column
        as described in issue #9178
        """
        result = self.read_html(
            """<table>
            <thead>
                <tr>
                    <th>Header</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>first</td>
                </tr>
            </tbody>
        </table>"""
        )[0]

        expected = DataFrame(data={"Header": "first"}, index=[0])

        tm.assert_frame_equal(result, expected)

    def test_thead_without_tr(self):
        """
        Ensure parser adds <tr> within <thead> on malformed HTML.
        """
        result = self.read_html(
            """<table>
        <thead>
            <tr>
                <th>Country</th>
                <th>Municipality</th>
                <th>Year</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Ukraine</td>
                <th>Odessa</th>
                <td>1944</td>
            </tr>
        </tbody>
        </table>"""
        )[0]

        expected = DataFrame(
            data=[["Ukraine", "Odessa", 1944]],
            columns=["Country", "Municipality", "Year"],
        )

        tm.assert_frame_equal(result, expected)

    def test_tfoot_read(self):
        """
        Make sure that read_html reads tfoot, containing td or th.
        Ignores empty tfoot
        """
        data_template = """<table>
            <thead>
                <tr>
                    <th>A</th>
                    <th>B</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>bodyA</td>
                    <td>bodyB</td>
                </tr>
            </tbody>
            <tfoot>
                {footer}
            </tfoot>
        </table>"""

        expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"])

        expected2 = DataFrame(
            data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"]
        )

        data1 = data_template.format(footer="")
        data2 = data_template.format(footer="<tr><td>footA</td><th>footB</th></tr>")

        result1 = self.read_html(data1)[0]
        result2 = self.read_html(data2)[0]

        tm.assert_frame_equal(result1, expected1)
        tm.assert_frame_equal(result2, expected2)

    def test_parse_header_of_non_string_column(self):
        # GH5048: if header is specified explicitly, an int column should be
        # parsed as int while its header is parsed as str
        result = self.read_html(
            """
            <table>
                <tr>
                    <td>S</td>
                    <td>I</td>
                </tr>
                <tr>
                    <td>text</td>
                    <td>1944</td>
                </tr>
            </table>
        """,
            header=0,
        )[0]

        expected = DataFrame([["text", 1944]], columns=("S", "I"))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.slow
    def test_banklist_header(self, datapath):
        from pandas.io.html import _remove_whitespace

        def try_remove_ws(x):
            # Strip whitespace where possible; non-strings pass through.
            try:
                return _remove_whitespace(x)
            except AttributeError:
                return x

        df = self.read_html(self.banklist_data, match="Metcalf", attrs={"id": "table"})[
            0
        ]
        ground_truth = read_csv(
            datapath("io", "data", "csv", "banklist.csv"),
            converters={"Updated Date": Timestamp, "Closing Date": Timestamp},
        )
        assert df.shape == ground_truth.shape
        # Bank names as rendered in HTML (with trailing link text fused on)
        # mapped to their clean CSV equivalents.
        old = [
            "First Vietnamese American BankIn Vietnamese",
            "Westernbank Puerto RicoEn Espanol",
            "R-G Premier Bank of Puerto RicoEn Espanol",
            "EurobankEn Espanol",
            "Sanderson State BankEn Espanol",
            "Washington Mutual Bank(Including its subsidiary Washington "
            "Mutual Bank FSB)",
            "Silver State BankEn Espanol",
            "AmTrade International BankEn Espanol",
            "Hamilton Bank, NAEn Espanol",
            "The Citizens Savings BankPioneer Community Bank, Inc.",
        ]
        new = [
            "First Vietnamese American Bank",
            "Westernbank Puerto Rico",
            "R-G Premier Bank of Puerto Rico",
            "Eurobank",
            "Sanderson State Bank",
            "Washington Mutual Bank",
            "Silver State Bank",
            "AmTrade International Bank",
            "Hamilton Bank, NA",
            "The Citizens Savings Bank",
        ]
        dfnew = df.applymap(try_remove_ws).replace(old, new)
        gtnew = ground_truth.applymap(try_remove_ws)
        converted = dfnew._convert(datetime=True, numeric=True)
        date_cols = ["Closing Date", "Updated Date"]
        converted[date_cols] = converted[date_cols].apply(to_datetime)
        tm.assert_frame_equal(converted, gtnew)

    @pytest.mark.slow
    def test_gold_canyon(self):
        gc = "Gold Canyon"
        with open(self.banklist_data) as f:
            raw_text = f.read()

        assert gc in raw_text
        df = self.read_html(
            self.banklist_data, match="Gold Canyon", attrs={"id": "table"}
        )[0]
        assert gc in df.to_string()

    def test_different_number_of_cols(self):
        # Rows with fewer cells than the header are padded with NaN.
        expected = self.read_html(
            """<table>
                        <thead>
                            <tr style="text-align: right;">
                                <th></th>
                                <th>C_l0_g0</th>
                                <th>C_l0_g1</th>
                                <th>C_l0_g2</th>
                                <th>C_l0_g3</th>
                                <th>C_l0_g4</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <th>R_l0_g0</th>
                                <td> 0.763</td>
                                <td> 0.233</td>
                                <td> nan</td>
                                <td> nan</td>
                                <td> nan</td>
                            </tr>
                            <tr>
                                <th>R_l0_g1</th>
                                <td> 0.244</td>
                                <td> 0.285</td>
                                <td> 0.392</td>
                                <td> 0.137</td>
                                <td> 0.222</td>
                            </tr>
                        </tbody>
                    </table>""",
            index_col=0,
        )[0]

        result = self.read_html(
            """<table>
                    <thead>
                        <tr style="text-align: right;">
                            <th></th>
                            <th>C_l0_g0</th>
                            <th>C_l0_g1</th>
                            <th>C_l0_g2</th>
                            <th>C_l0_g3</th>
                            <th>C_l0_g4</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <th>R_l0_g0</th>
                            <td> 0.763</td>
                            <td> 0.233</td>
                        </tr>
                        <tr>
                            <th>R_l0_g1</th>
                            <td> 0.244</td>
                            <td> 0.285</td>
                            <td> 0.392</td>
                            <td> 0.137</td>
                            <td> 0.222</td>
                        </tr>
                    </tbody>
                </table>""",
            index_col=0,
        )[0]

        tm.assert_frame_equal(result, expected)

    def test_colspan_rowspan_1(self):
        # GH17054
        result = self.read_html(
            """
            <table>
                <tr>
                    <th>A</th>
                    <th colspan="1">B</th>
                    <th rowspan="1">C</th>
                </tr>
                <tr>
                    <td>a</td>
                    <td>b</td>
                    <td>c</td>
                </tr>
            </table>
        """
        )[0]

        expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"])

        tm.assert_frame_equal(result, expected)

    def test_colspan_rowspan_copy_values(self):
        # GH17054

        # In ASCII, with lowercase letters being copies:
        #
        # X x Y Z W
        # A B b z C

        result = self.read_html(
            """
            <table>
                <tr>
                    <td colspan="2">X</td>
                    <td>Y</td>
                    <td rowspan="2">Z</td>
                    <td>W</td>
                </tr>
                <tr>
                    <td>A</td>
                    <td colspan="2">B</td>
                    <td>C</td>
                </tr>
            </table>
        """,
            header=0,
        )[0]

        expected = DataFrame(
            data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"]
        )

        tm.assert_frame_equal(result, expected)

    def test_colspan_rowspan_both_not_1(self):
        # GH17054

        # In ASCII, with lowercase letters being copies:
        #
        # A B b b C
        # a b b b D

        result = self.read_html(
            """
            <table>
                <tr>
                    <td rowspan="2">A</td>
                    <td rowspan="2" colspan="3">B</td>
                    <td>C</td>
                </tr>
                <tr>
                    <td>D</td>
                </tr>
            </table>
        """,
            header=0,
        )[0]

        expected = DataFrame(
            data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"]
        )

        tm.assert_frame_equal(result, expected)
    def test_rowspan_at_end_of_row(self):
        # GH17054

        # In ASCII, with lowercase letters being copies:
        #
        # A B
        # C b

        result = self.read_html(
            """
            <table>
                <tr>
                    <td>A</td>
                    <td rowspan="2">B</td>
                </tr>
                <tr>
                    <td>C</td>
                </tr>
            </table>
        """,
            header=0,
        )[0]

        expected = DataFrame(data=[["C", "B"]], columns=["A", "B"])

        tm.assert_frame_equal(result, expected)

    def test_rowspan_only_rows(self):
        # GH17054
        # A table consisting solely of rowspans still expands to full rows.
        result = self.read_html(
            """
            <table>
                <tr>
                    <td rowspan="3">A</td>
                    <td rowspan="3">B</td>
                </tr>
            </table>
        """,
            header=0,
        )[0]

        expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"])

        tm.assert_frame_equal(result, expected)

    def test_header_inferred_from_rows_with_only_th(self):
        # GH17054
        # Leading all-<th> rows are inferred as a (Multi)header.
        result = self.read_html(
            """
            <table>
                <tr>
                    <th>A</th>
                    <th>B</th>
                </tr>
                <tr>
                    <th>a</th>
                    <th>b</th>
                </tr>
                <tr>
                    <td>1</td>
                    <td>2</td>
                </tr>
            </table>
        """
        )[0]

        columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]])
        expected = DataFrame(data=[[1, 2]], columns=columns)

        tm.assert_frame_equal(result, expected)

    def test_parse_dates_list(self):
        # parse_dates accepts column positions or column names.
        df = DataFrame({"date": date_range("1/1/2001", periods=10)})
        expected = df.to_html()
        res = self.read_html(expected, parse_dates=[1], index_col=0)
        tm.assert_frame_equal(df, res[0])
        res = self.read_html(expected, parse_dates=["date"], index_col=0)
        tm.assert_frame_equal(df, res[0])

    def test_parse_dates_combine(self):
        # parse_dates dict syntax combines multiple columns into one.
        raw_dates = Series(date_range("1/1/2001", periods=10))
        df = DataFrame(
            {
                "date": raw_dates.map(lambda x: str(x.date())),
                "time": raw_dates.map(lambda x: str(x.time())),
            }
        )
        res = self.read_html(
            df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1
        )
        newdf = DataFrame({"datetime": raw_dates})
        tm.assert_frame_equal(newdf, res[0])

    def test_wikipedia_states_table(self, datapath):
        data = datapath("io", "data", "html", "wikipedia_states.html")
        assert os.path.isfile(data), f"{repr(data)} is not a file"
        assert os.path.getsize(data), f"{repr(data)} is an empty file"
        result = self.read_html(data, match="Arizona", header=1)[0]
        assert result.shape == (60, 12)
        assert "Unnamed" in result.columns[-1]
        assert result["sq mi"].dtype == np.dtype("float64")
        assert np.allclose(result.loc[0, "sq mi"], 665384.04)

    def test_wikipedia_states_multiindex(self, datapath):
        data = datapath("io", "data", "html", "wikipedia_states.html")
        result = self.read_html(data, match="Arizona", index_col=0)[0]
        assert result.shape == (60, 11)
        assert "Unnamed" in result.columns[-1][1]
        assert result.columns.nlevels == 2
        assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04)

    def test_parser_error_on_empty_header_row(self):
        # First header row is (intentionally malformed and) empty; asking
        # for a two-row header must raise rather than misalign columns.
        msg = (
            r"Passed header=\[0,1\] are too many "
            r"rows for this multi_index of columns"
        )
        with pytest.raises(ParserError, match=msg):
            self.read_html(
                """
                <table>
                    <thead>
                        <tr><th></th><th></tr>
                        <tr><th>A</th><th>B</th></tr>
                    </thead>
                    <tbody>
                        <tr><td>a</td><td>b</td></tr>
                    </tbody>
                </table>
            """,
                header=[0, 1],
            )

    def test_decimal_rows(self):
        # GH 12907
        result = self.read_html(
            """<html>
            <body>
             <table>
                <thead>
                    <tr>
                        <th>Header</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>1100#101</td>
                    </tr>
                </tbody>
            </table>
            </body>
        </html>""",
            decimal="#",
        )[0]

        expected = DataFrame(data={"Header": 1100.101}, index=[0])

        assert result["Header"].dtype == np.dtype("float64")
        tm.assert_frame_equal(result, expected)

    def test_bool_header_arg(self):
        # GH 6114
        # header must be an int/sequence, never a bool.
        for arg in [True, False]:
            with pytest.raises(TypeError):
                self.read_html(self.spam_data, header=arg)

    def test_converters(self):
        # GH 13461
        result = self.read_html(
            """<table>
                 <thead>
                   <tr>
                     <th>a</th>
                    </tr>
                  </thead>
                  <tbody>
                    <tr>
                      <td> 0.763</td>
                    </tr>
                    <tr>
                      <td> 0.244</td>
                    </tr>
                  </tbody>
                </table>""",
            converters={"a": str},
        )[0]

        expected = DataFrame({"a": ["0.763", "0.244"]})

        tm.assert_frame_equal(result, expected)

    def test_na_values(self):
        # GH 13461
        result = self.read_html(
            """<table>
                 <thead>
                   <tr>
                     <th>a</th>
                   </tr>
                 </thead>
                 <tbody>
                   <tr>
                     <td> 0.763</td>
                   </tr>
                   <tr>
                     <td> 0.244</td>
                   </tr>
                 </tbody>
               </table>""",
            na_values=[0.244],
        )[0]

        expected = DataFrame({"a": [0.763, np.nan]})

        tm.assert_frame_equal(result, expected)

    def test_keep_default_na(self):
        html_data = """<table>
                        <thead>
                            <tr>
                            <th>a</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                            <td> N/A</td>
                            </tr>
                            <tr>
                            <td> NA</td>
                            </tr>
                        </tbody>
                    </table>"""

        expected_df = DataFrame({"a": ["N/A", "NA"]})
        html_df = self.read_html(html_data, keep_default_na=False)[0]
        tm.assert_frame_equal(expected_df, html_df)

        expected_df = DataFrame({"a": [np.nan, np.nan]})
        html_df = self.read_html(html_data, keep_default_na=True)[0]
        tm.assert_frame_equal(expected_df, html_df)

    def test_preserve_empty_rows(self):
        # Rows whose cells are all empty become all-NaN rows, not dropped.
        result = self.read_html(
            """
            <table>
                <tr>
                    <th>A</th>
                    <th>B</th>
                </tr>
                <tr>
                    <td>a</td>
                    <td>b</td>
                </tr>
                <tr>
                    <td></td>
                    <td></td>
                </tr>
            </table>
        """
        )[0]

        expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"])

        tm.assert_frame_equal(result, expected)

    def test_ignore_empty_rows_when_inferring_header(self):
        # An empty (malformed) leading <th> row is skipped during header
        # inference; the next two all-<th> rows become a MultiIndex.
        result = self.read_html(
            """
            <table>
                <thead>
                    <tr><th></th><th></tr>
                    <tr><th>A</th><th>B</th></tr>
                    <tr><th>a</th><th>b</th></tr>
                </thead>
                <tbody>
                    <tr><td>1</td><td>2</td></tr>
                </tbody>
            </table>
        """
        )[0]

        columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]])
        expected = DataFrame(data=[[1, 2]], columns=columns)

        tm.assert_frame_equal(result, expected)

    def test_multiple_header_rows(self):
        # Issue #13434
        expected_df = DataFrame(
            data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]
        )
        expected_df.columns = [
            ["Unnamed: 0_level_0", "Age", "Party"],
            ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"],
        ]
        html = expected_df.to_html(index=False)
        html_df = self.read_html(html)[0]
        tm.assert_frame_equal(expected_df, html_df)

    def test_works_on_valid_markup(self, datapath):
        filename = datapath("io", "data", "html", "valid_markup.html")
        dfs = self.read_html(filename, index_col=0)
        assert isinstance(dfs, list)
        assert isinstance(dfs[0], DataFrame)

    @pytest.mark.slow
    def test_fallback_success(self, datapath):
        banklist_data = datapath("io", "data", "html", "banklist.html")
        self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"])

    def test_to_html_timestamp(self):
        rng = date_range("2000-01-01", periods=10)
        df = DataFrame(np.random.randn(10, 4), index=rng)

        result = df.to_html()
        assert "2000-01-01" in result

    @pytest.mark.parametrize(
        "displayed_only,exp0,exp1",
        [
            (True, DataFrame(["foo"]), None),
            (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"])),
        ],
    )
    def test_displayed_only(self, displayed_only, exp0, exp1):
        # GH 20027
        data = StringIO(
            """<html>
          <body>
            <table>
              <tr>
                <td>
                  foo
                  <span style="display:none;text-align:center">bar</span>
                  <span style="display:none">baz</span>
                  <span style="display: none">qux</span>
                </td>
              </tr>
            </table>
            <table style="display: none">
              <tr>
                <td>foo</td>
              </tr>
            </table>
          </body>
        </html>"""
        )

        dfs = self.read_html(data, displayed_only=displayed_only)
        tm.assert_frame_equal(dfs[0], exp0)

        if exp1 is not None:
            tm.assert_frame_equal(dfs[1], exp1)
        else:
            assert len(dfs) == 1  # Should not parse hidden table

    def test_encode(self, html_encoding_file):
        # Fixture filenames encode the charset after the underscore,
        # e.g. "chinese_utf-16.html" -> encoding "utf-16".
        base_path = os.path.basename(html_encoding_file)
        root = os.path.splitext(base_path)[0]
        _, encoding = root.split("_")

        try:
            with open(html_encoding_file, "rb") as fobj:
                from_string = self.read_html(
                    fobj.read(), encoding=encoding, index_col=0
                ).pop()

            with open(html_encoding_file, "rb") as fobj:
                from_file_like = self.read_html(
                    BytesIO(fobj.read()), encoding=encoding, index_col=0
                ).pop()

            from_filename = self.read_html(
                html_encoding_file, encoding=encoding, index_col=0
            ).pop()
            tm.assert_frame_equal(from_string, from_file_like)
            tm.assert_frame_equal(from_string, from_filename)
        except Exception:
            # seems utf-16/32 fail on windows
            if is_platform_windows():
                if "16" in encoding or "32" in encoding:
                    pytest.skip()
            raise

    def test_parse_failure_unseekable(self):
        # Issue #17975
        # A non-seekable stream can be read once, but a second attempt
        # must raise since the parser cannot rewind it.
        if self.read_html.keywords.get("flavor") == "lxml":
            pytest.skip("Not applicable for lxml")

        class UnseekableStringIO(StringIO):
            def seekable(self):
                return False

        bad = UnseekableStringIO(
            """
            <table><tr><td>spam<foobr />eggs</td></tr></table>"""
        )

        assert self.read_html(bad)

        with pytest.raises(ValueError, match="passed a non-rewindable file object"):
            self.read_html(bad)

    def test_parse_failure_rewinds(self):
        # Issue #17975
        # A seekable file object is rewound between parser attempts.
        class MockFile:
            def __init__(self, data):
                self.data = data
                self.at_end = False

            def read(self, size=None):
                data = "" if self.at_end else self.data
                self.at_end = True
                return data

            def seek(self, offset):
                self.at_end = False

            def seekable(self):
                return True

        good = MockFile("<table><tr><td>spam<br />eggs</td></tr></table>")
        bad = MockFile("<table><tr><td>spam<foobr />eggs</td></tr></table>")

        assert self.read_html(good)
        assert self.read_html(bad)

    @pytest.mark.slow
    def test_importcheck_thread_safety(self, datapath):
        # see gh-16928

        class ErrorThread(threading.Thread):
            # Capture any exception raised in run() on self.err.
            def run(self):
                try:
                    super().run()
                except Exception as err:
                    self.err = err
                else:
                    self.err = None

        # force import check by reinitalising global vars in html.py
        reload(pandas.io.html)

        filename = datapath("io", "data", "html", "valid_markup.html")
        helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
        helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))

        helper_thread1.start()
        helper_thread2.start()

        # Busy-wait until both worker threads finish.
        while helper_thread1.is_alive() or helper_thread2.is_alive():
            pass
        assert None is helper_thread1.err is helper_thread2.err

    def test_parse_path_object(self, datapath):
        # GH 37705
        file_path_string = datapath("io", "data", "html", "spam.html")
        file_path = Path(file_path_string)
        df1 = self.read_html(file_path_string)[0]
        df2 = self.read_html(file_path)[0]
        tm.assert_frame_equal(df1, df2)