from io import StringIO
from pathlib import Path

import pytest

import pandas as pd
from pandas import DataFrame, read_json
import pandas._testing as tm

from pandas.io.json._json import JsonReader


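# Small DataFrame serialized as line-delimited JSON records; shared by the
# chunksize tests below.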
@pytest.fixture
def lines_json_df():
    df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    return df.to_json(lines=True, orient="records")


def test_read_jsonl():
    # GH9180
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
    assert result == expected
    tm.assert_frame_equal(read_json(result, lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
    assert result == expected
    tm.assert_frame_equal(read_json(result, lines=True), df)


def test_to_jsonl_count_new_lines():
    # GH36888
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
    expected_new_lines_count = 2
    assert actual_new_lines_count == expected_new_lines_count


@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # Basic test that read_json with a chunksize gives the same result as
    # read_json without a chunksize
    # GH17048: memory usage when lines=True

    unchunked = read_json(StringIO(lines_json_df), lines=True)
    with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader:
        chunked = pd.concat(reader)

    tm.assert_frame_equal(chunked, unchunked)


def test_readjson_chunksize_requires_lines(lines_json_df):
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        with pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _:
            pass


def test_readjson_chunks_series():
    # Test reading line-format JSON to Series with chunksize param
    s = pd.Series({"A": 1, "B": 2})

    strio = StringIO(s.to_json(lines=True, orient="records"))
    unchunked = pd.read_json(strio, lines=True, typ="Series")

    strio = StringIO(s.to_json(lines=True, orient="records"))
    with pd.read_json(strio, lines=True, typ="Series", chunksize=1) as reader:
        chunked = pd.concat(reader)

    tm.assert_series_equal(chunked, unchunked)


def test_readjson_each_chunk(lines_json_df):
    # Other tests check that the final result of reading with chunksize is
    # correct. This checks the intermediate chunks.
    with pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader:
        chunks = list(reader)
    assert chunks[0].shape == (2, 2)
    assert chunks[1].shape == (1, 2)


def test_readjson_chunks_from_file():
    with tm.ensure_clean("test.json") as path:
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        with pd.read_json(path, lines=True, chunksize=1) as reader:
            chunked = pd.concat(reader)
        unchunked = pd.read_json(path, lines=True)
        tm.assert_frame_equal(unchunked, chunked)


@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    with tm.ensure_clean("test.json") as path:
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
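        # Construct JsonReader directly so the underlying file handle can be
        # inspected after the read completes.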
        reader = JsonReader(
            path,
            orient=None,
            typ="frame",
            dtype=True,
            convert_axes=True,
            convert_dates=True,
            keep_default_dates=True,
            numpy=False,
            precise_float=False,
            date_unit=None,
            encoding=None,
            lines=True,
            chunksize=chunksize,
            compression=None,
            nrows=None,
        )
        with reader:
            reader.read()
        assert (
            reader.handles.handle.closed
        ), f"didn't close stream with chunksize = {chunksize}"


@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        with pd.read_json(
            StringIO(lines_json_df), lines=True, chunksize=chunksize
        ) as _:
            pass


@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
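    # Blank lines between records should be skipped, with or without chunking.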
    j = """

    {"A":1,"B":4}



    {"A":2,"B":5}








    {"A":3,"B":6}
    """
    orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        with test:
            test = pd.concat(test)
    tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")


def test_readjson_unicode(monkeypatch):
    with tm.ensure_clean("test.json") as path:
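        # Force a non-UTF-8 preferred locale encoding; read_json should still
        # decode the UTF-8 file correctly.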
        monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949")
        with open(path, "w", encoding="utf-8") as f:
            f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')

        result = read_json(path)
        expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows):
    # GH 33916
    # Test reading line-format JSON to DataFrame with nrows param
    jsonl = """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}"""
    result = pd.read_json(jsonl, lines=True, nrows=nrows)
    expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(nrows, chunksize):
    # GH 33916
    # Test reading line-format JSON to DataFrame with nrows and chunksize params
    jsonl = """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}"""
    with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader:
        chunked = pd.concat(reader)
    expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(chunked, expected)


def test_readjson_nrows_requires_lines():
    # GH 33916
    # Test ValueError is raised if nrows is set without setting lines in read_json
    jsonl = """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}"""
    msg = "nrows can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(jsonl, lines=False, nrows=2)


def test_readjson_lines_chunks_fileurl(datapath):
    # GH 27135
    # Test reading line-format JSON from file url
    df_list_expected = [
        DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
        DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
        DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
    ]
    os_path = datapath("io", "json", "data", "line_delimited.json")
    file_url = Path(os_path).as_uri()
    with pd.read_json(file_url, lines=True, chunksize=1) as url_reader:
        for index, chunk in enumerate(url_reader):
            tm.assert_frame_equal(chunk, df_list_expected[index])


def test_chunksize_is_incremental():
    # See https://github.com/pandas-dev/pandas/issues/34548
    jsonl = (
        """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}\n"""
        * 1000
    )

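    # Wrapper around StringIO that counts how many times pandas pulls data
    # from the underlying buffer.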
    class MyReader:
        def __init__(self, contents):
            self.read_count = 0
            self.stringio = StringIO(contents)

        def read(self, *args):
            self.read_count += 1
            return self.stringio.read(*args)

        def __iter__(self):
            self.read_count += 1
            return iter(self.stringio)

    reader = MyReader(jsonl)
    assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1
    assert reader.read_count > 10