"""Tests for ``DataFrame.info()`` output: headers, verbosity, truncation,
and memory-usage reporting.

Reconstructed from a whitespace-mangled paste: original newlines were
collapsed and line numbers fused into the text; multi-space alignment in
expected-output literals is restored to match pandas' actual info() layout.
"""
from io import StringIO
import re
from string import ascii_uppercase as uppercase
import sys
import textwrap

import numpy as np
import pytest

from pandas.compat import IS64, PYPY

from pandas import (
    CategoricalIndex,
    DataFrame,
    MultiIndex,
    Series,
    date_range,
    option_context,
)


@pytest.fixture
def duplicate_columns_frame():
    """Dataframe with duplicate column names."""
    return DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])


def test_info_empty():
    """An empty frame reports 'Empty DataFrame' with a 0-entry index."""
    df = DataFrame()
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 0 entries
        Empty DataFrame"""
    )
    assert result == expected


def test_info_categorical_column_smoke_test():
    """info() does not raise on categorical columns, including after filtering."""
    n = 2500
    df = DataFrame({"int64": np.random.randint(100, size=n)})
    df["category"] = Series(
        np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
    ).astype("category")
    df.isna()
    buf = StringIO()
    df.info(buf=buf)

    df2 = df[df["category"] == "d"]
    buf = StringIO()
    df2.info(buf=buf)


@pytest.mark.parametrize(
    "fixture_func_name",
    [
        "int_frame",
        "float_frame",
        "datetime_frame",
        "duplicate_columns_frame",
    ],
)
def test_info_smoke_test(fixture_func_name, request):
    """info() runs cleanly on the standard frame fixtures and prints >10 lines."""
    frame = request.getfixturevalue(fixture_func_name)
    buf = StringIO()
    frame.info(buf=buf)
    result = buf.getvalue().splitlines()
    assert len(result) > 10


@pytest.mark.parametrize(
    "num_columns, max_info_columns, verbose",
    [
        (10, 100, True),
        (10, 11, True),
        (10, 10, True),
        (10, 9, False),
        (10, 1, False),
    ],
)
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
    """Default verbosity follows display.max_info_columns: verbose iff
    num_columns <= max_info_columns, matching an explicit verbose= call."""
    frame = DataFrame(np.random.randn(5, num_columns))
    with option_context("display.max_info_columns", max_info_columns):
        io_default = StringIO()
        frame.info(buf=io_default)
        result = io_default.getvalue()

        io_explicit = StringIO()
        frame.info(buf=io_explicit, verbose=verbose)
        expected = io_explicit.getvalue()

    assert result == expected


def test_info_verbose_check_header_separator_body():
    """Verbose output contains the header/separator and one numbered row
    per column, starting at the expected offset."""
    buf = StringIO()
    size = 1001
    start = 5
    frame = DataFrame(np.random.randn(3, size))
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    header = " #     Column  Dtype  \n---    ------  -----  "
    assert header in res

    frame.info(verbose=True, buf=buf)
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        if i >= start and i < start + size:
            line_nr = f" {i - start}  "
            assert line.startswith(line_nr)


@pytest.mark.parametrize(
    "size, header_exp, separator_exp, first_line_exp, last_line_exp",
    [
        (
            4,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 3   3       3 non-null      float64",
        ),
        (
            11,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 10  10      3 non-null      float64",
        ),
        (
            101,
            " #    Column  Non-Null Count  Dtype  ",
            "---   ------  --------------  -----  ",
            " 0    0       3 non-null      float64",
            " 100  100     3 non-null      float64",
        ),
        (
            1001,
            " #     Column  Non-Null Count  Dtype  ",
            "---    ------  --------------  -----  ",
            " 0     0       3 non-null      float64",
            " 1000  1000    3 non-null      float64",
        ),
        (
            10001,
            " #      Column  Non-Null Count  Dtype  ",
            "---     ------  --------------  -----  ",
            " 0      0       3 non-null      float64",
            " 10000  10000   3 non-null      float64",
        ),
    ],
)
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode."""
    frame = DataFrame(np.random.randn(3, size))
    buf = StringIO()
    frame.info(verbose=True, show_counts=True, buf=buf)
    all_lines = buf.getvalue().splitlines()
    # Here table would contain only header, separator and table lines
    # dframe repr, index summary, memory usage and dtypes are excluded
    table = all_lines[3:-2]
    header, separator, first_line, *rest, last_line = table
    assert header == header_exp
    assert separator == separator_exp
    assert first_line == first_line_exp
    assert last_line == last_line_exp


def test_info_memory():
    """Exact memory usage (in bytes) appears on the last line of info().

    https://github.com/pandas-dev/pandas/issues/21056
    """
    df = DataFrame({"a": Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    # NOTE: renamed from `bytes` to avoid shadowing the builtin; the
    # f-string below formats the same float value either way.
    mem_bytes = float(df.memory_usage().sum())
    expected = textwrap.dedent(
        f"""\
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2 entries, 0 to 1
    Data columns (total 1 columns):
     #   Column  Non-Null Count  Dtype
    ---  ------  --------------  -----
     0   a       2 non-null      int64
    dtypes: int64(1)
    memory usage: {mem_bytes} bytes
    """
    )
    assert result == expected


def test_info_wide():
    """max_cols= and display.max_info_columns produce the same wide output."""
    io = StringIO()
    df = DataFrame(np.random.randn(5, 101))
    df.info(buf=io)

    io = StringIO()
    df.info(buf=io, max_cols=101)
    result = io.getvalue()
    assert len(result.splitlines()) > 100

    expected = result
    with option_context("display.max_info_columns", 101):
        io = StringIO()
        df.info(buf=io)
        result = io.getvalue()
    assert result == expected


def test_info_duplicate_columns_shows_correct_dtypes():
    """Each duplicate-named column reports its own dtype (GH11761)."""
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    lines = io.getvalue().splitlines(True)
    assert " 0   a       1 non-null      int64  \n" == lines[5]
    assert " 1   a       1 non-null      float64\n" == lines[6]


def test_info_shows_column_dtypes():
    """Every column's dtype (numeric, datetime, object, bool) is listed."""
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    header = (
        " #   Column  Non-Null Count  Dtype          \n"
        "---  ------  --------------  -----          "
    )
    assert header in res
    for i, dtype in enumerate(dtypes):
        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
        assert name in res


def test_info_max_cols():
    """verbose= and max_cols= interact correctly with display.max_info_columns."""
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (12, True)]:
        # For verbose always      ^ setting  ^ summarize ^ full output
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, verbose in [(12, None), (5, False), (12, True)]:
        # max_cols not exceeded
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

        # setting wouldn't truncate
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_


def test_info_memory_usage():
    """memory_usage=True/False/'deep' control the last line; the '+' suffix
    marks a lower-bound estimate (present iff object dtypes are involved)."""
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert "memory usage: " in res[-1]

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    assert "memory usage: " not in res[-1]

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(r"memory usage: [^+]+\+", res[-1])

    # Test a DataFrame with duplicate columns
    dtypes = ["int64", "int64", "int64", "float64"]
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df_with_object_index.info(buf=buf, memory_usage="deep")
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+$", res[-1])

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0


@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    """On CPython, deep=True counts object contents and so reports more."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    assert (
        df_with_object_index.memory_usage(index=True, deep=True).sum()
        > df_with_object_index.memory_usage(index=True).sum()
    )

    df_object = DataFrame({"a": ["a"]})
    assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()


@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    """On PyPy, deep=True is a no-op and reports the same total."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    assert (
        df_with_object_index.memory_usage(index=True, deep=True).sum()
        == df_with_object_index.memory_usage(index=True).sum()
    )

    df_object = DataFrame({"a": ["a"]})
    assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()


@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
    """sys.getsizeof(df) agrees with memory_usage(deep=True) up to GC overhead."""
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    mem = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = mem - sys.getsizeof(df)
    assert abs(diff) < 100


def test_info_memory_usage_qualified():
    """The '+' qualifier appears exactly when the index involves object dtypes."""
    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=list("ABC"))
    df.info(buf=buf)
    assert "+" in buf.getvalue()

    buf = StringIO()
    df = DataFrame(
        1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
    )
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(
        1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
    )
    df.info(buf=buf)
    assert "+" in buf.getvalue()


def test_info_memory_usage_bug_on_multiindex():
    # GH 14308
    # memory usage introspection should not materialize .values

    def memory_usage(f):
        return f.memory_usage(deep=True).sum()

    N = 100
    M = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=N)],
        names=["id", "date"],
    )
    df = DataFrame({"value": np.random.randn(N * M)}, index=index)

    unstacked = df.unstack("id")
    assert df.values.nbytes == unstacked.values.nbytes
    assert memory_usage(df) > memory_usage(unstacked)

    # high upper bound
    assert memory_usage(unstacked) - memory_usage(df) < 2000


def test_info_categorical():
    """info() does not raise with a CategoricalIndex (GH14298)."""
    idx = CategoricalIndex(["a", "b"])
    df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

    buf = StringIO()
    df.info(buf=buf)


@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
    """Integer column labels render correctly in verbose output (GH#37245)."""
    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
    buf = StringIO()
    df.info(show_counts=True, buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 2 entries, A to B
        Data columns (total 2 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   1       2 non-null      int64
         1   2       2 non-null      int64
        dtypes: int64(2)
        memory usage: 48.0+ bytes
        """
    )
    assert result == expected