from io import StringIO
import re
from string import ascii_uppercase as uppercase
import sys
import textwrap

import numpy as np
import pytest

from pandas.compat import IS64, PYPY

from pandas import (
    CategoricalIndex,
    DataFrame,
    MultiIndex,
    Series,
    date_range,
    option_context,
)


@pytest.fixture
def duplicate_columns_frame():
    """DataFrame with duplicate column names."""
    return DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])


def test_info_empty():
    df = DataFrame()
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 0 entries
        Empty DataFrame"""
    )
    assert result == expected


def test_info_categorical_column_smoke_test():
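    """Smoke test: info() should not raise on a frame with a categorical column."""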
    n = 2500
    df = DataFrame({"int64": np.random.randint(100, size=n)})
    df["category"] = Series(
        np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
    ).astype("category")
    df.isna()
    buf = StringIO()
    df.info(buf=buf)

    df2 = df[df["category"] == "d"]
    buf = StringIO()
    df2.info(buf=buf)


@pytest.mark.parametrize(
    "fixture_func_name",
    [
        "int_frame",
        "float_frame",
        "datetime_frame",
        "duplicate_columns_frame",
    ],
)
def test_info_smoke_test(fixture_func_name, request):
    frame = request.getfixturevalue(fixture_func_name)
    buf = StringIO()
    frame.info(buf=buf)
    result = buf.getvalue().splitlines()
    assert len(result) > 10


@pytest.mark.parametrize(
    "num_columns, max_info_columns, verbose",
    [
        (10, 100, True),
        (10, 11, True),
        (10, 10, True),
        (10, 9, False),
        (10, 1, False),
    ],
)
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
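    """Default verbosity should follow the display.max_info_columns option."""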
    frame = DataFrame(np.random.randn(5, num_columns))
    with option_context("display.max_info_columns", max_info_columns):
        io_default = StringIO()
        frame.info(buf=io_default)
        result = io_default.getvalue()

        io_explicit = StringIO()
        frame.info(buf=io_explicit, verbose=verbose)
        expected = io_explicit.getvalue()

        assert result == expected


def test_info_verbose_check_header_separator_body():
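    """Check the header, separator and per-column numbering of verbose output."""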
    buf = StringIO()
    size = 1001
    start = 5
    frame = DataFrame(np.random.randn(3, size))
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    header = " #     Column  Dtype  \n---    ------  -----  "
    assert header in res

    frame.info(verbose=True, buf=buf)
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        if i >= start and i < start + size:
            line_nr = f" {i - start} "
            assert line.startswith(line_nr)


@pytest.mark.parametrize(
    "size, header_exp, separator_exp, first_line_exp, last_line_exp",
    [
        (
            4,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 3   3       3 non-null      float64",
        ),
        (
            11,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 10  10      3 non-null      float64",
        ),
        (
            101,
            " #    Column  Non-Null Count  Dtype  ",
            "---   ------  --------------  -----  ",
            " 0    0       3 non-null      float64",
            " 100  100     3 non-null      float64",
        ),
        (
            1001,
            " #     Column  Non-Null Count  Dtype  ",
            "---    ------  --------------  -----  ",
            " 0     0       3 non-null      float64",
            " 1000  1000    3 non-null      float64",
        ),
        (
            10001,
            " #      Column  Non-Null Count  Dtype  ",
            "---     ------  --------------  -----  ",
            " 0      0       3 non-null      float64",
            " 10000  10000   3 non-null      float64",
        ),
    ],
)
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode."""
    frame = DataFrame(np.random.randn(3, size))
    buf = StringIO()
    frame.info(verbose=True, show_counts=True, buf=buf)
    all_lines = buf.getvalue().splitlines()
    # The slice keeps only the header, separator and per-column lines;
    # frame repr, index summary, column count, dtypes and memory usage are excluded.
    table = all_lines[3:-2]
    header, separator, first_line, *rest, last_line = table
    assert header == header_exp
    assert separator == separator_exp
    assert first_line == first_line_exp
    assert last_line == last_line_exp


def test_info_memory():
    # https://github.com/pandas-dev/pandas/issues/21056
    df = DataFrame({"a": Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    bytes = float(df.memory_usage().sum())
    expected = textwrap.dedent(
        f"""\
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2 entries, 0 to 1
    Data columns (total 1 columns):
     #   Column  Non-Null Count  Dtype
    ---  ------  --------------  -----
     0   a       2 non-null      int64
    dtypes: int64(1)
    memory usage: {bytes} bytes
    """
    )
    assert result == expected


def test_info_wide():
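    """max_cols and display.max_info_columns should yield the same full output."""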
    io = StringIO()
    df = DataFrame(np.random.randn(5, 101))
    df.info(buf=io)

    io = StringIO()
    df.info(buf=io, max_cols=101)
    result = io.getvalue()
    assert len(result.splitlines()) > 100

    expected = result
    with option_context("display.max_info_columns", 101):
        io = StringIO()
        df.info(buf=io)
        result = io.getvalue()
        assert result == expected


def test_info_duplicate_columns_shows_correct_dtypes():
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    lines = io.getvalue().splitlines(True)
    assert " 0   a       1 non-null      int64  \n" == lines[5]
    assert " 1   a       1 non-null      float64\n" == lines[6]


def test_info_shows_column_dtypes():
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    header = (
        " #   Column  Non-Null Count  Dtype          \n"
        "---  ------  --------------  -----          "
    )
    assert header in res
    for i, dtype in enumerate(dtypes):
        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
        assert name in res


def test_info_max_cols():
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (12, True)]:
        # For verbose always      ^ setting  ^ summarize ^ full output
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, verbose in [(12, None), (5, False), (12, True)]:
        # max_cols not exceeded
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

    for len_, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        with option_context("max_info_columns", 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_

        # setting wouldn't truncate
        with option_context("max_info_columns", 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            assert len(res.strip().split("\n")) == len_


def test_info_memory_usage():
    # Ensure memory usage is displayed, when requested, on the last line
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert "memory usage: " in res[-1]

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    assert "memory usage: " not in res[-1]

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(r"memory usage: [^+]+\+", res[-1])

    # Test a DataFrame with duplicate columns
    dtypes = ["int64", "int64", "int64", "float64"]
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df_with_object_index.info(buf=buf, memory_usage="deep")
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+$", res[-1])

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # deep=True matches the shallow result because there are no object dtypes
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # smoke checks: these calls should simply not raise
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0


@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    assert (
        df_with_object_index.memory_usage(index=True, deep=True).sum()
        > df_with_object_index.memory_usage(index=True).sum()
    )

    df_object = DataFrame({"a": ["a"]})
    assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()


@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    assert (
        df_with_object_index.memory_usage(index=True, deep=True).sum()
        == df_with_object_index.memory_usage(index=True).sum()
    )

    df_object = DataFrame({"a": ["a"]})
    assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()


@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    mem = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = mem - sys.getsizeof(df)
    assert abs(diff) < 100


def test_info_memory_usage_qualified():
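    """The '+' qualifier should appear only when memory usage is a lower bound.

    Here that depends on whether the index (or one of its levels) has object dtype.
    """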
    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list("ab"), index=list("ABC"))
    df.info(buf=buf)
    assert "+" in buf.getvalue()

    buf = StringIO()
    df = DataFrame(
        1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
    )
    df.info(buf=buf)
    assert "+" not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(
        1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
    )
    df.info(buf=buf)
    assert "+" in buf.getvalue()


def test_info_memory_usage_bug_on_multiindex():
    # GH 14308
    # memory usage introspection should not materialize .values

    def memory_usage(f):
        return f.memory_usage(deep=True).sum()

    N = 100
    M = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=N)],
        names=["id", "date"],
    )
    df = DataFrame({"value": np.random.randn(N * M)}, index=index)

    unstacked = df.unstack("id")
    assert df.values.nbytes == unstacked.values.nbytes
    assert memory_usage(df) > memory_usage(unstacked)

    # high upper bound
    assert memory_usage(unstacked) - memory_usage(df) < 2000


def test_info_categorical():
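    """Smoke test for a frame with a CategoricalIndex on both axes."""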
    # GH14298
    idx = CategoricalIndex(["a", "b"])
    df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

    buf = StringIO()
    df.info(buf=buf)


@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
    # GH#37245
    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
    buf = StringIO()
    df.info(show_counts=True, buf=buf)
    result = buf.getvalue()
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 2 entries, A to B
        Data columns (total 2 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   1       2 non-null      int64
         1   2       2 non-null      int64
        dtypes: int64(2)
        memory usage: 48.0+ bytes
        """
    )
    assert result == expected