1from io import StringIO 2from pathlib import Path 3 4import pytest 5 6import pandas as pd 7from pandas import DataFrame, read_json 8import pandas._testing as tm 9 10from pandas.io.json._json import JsonReader 11 12 13@pytest.fixture 14def lines_json_df(): 15 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 16 return df.to_json(lines=True, orient="records") 17 18 19def test_read_jsonl(): 20 # GH9180 21 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) 22 expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) 23 tm.assert_frame_equal(result, expected) 24 25 26def test_read_jsonl_unicode_chars(): 27 # GH15132: non-ascii unicode characters 28 # \u201d == RIGHT DOUBLE QUOTATION MARK 29 30 # simulate file handle 31 json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' 32 json = StringIO(json) 33 result = read_json(json, lines=True) 34 expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) 35 tm.assert_frame_equal(result, expected) 36 37 # simulate string 38 json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' 39 result = read_json(json, lines=True) 40 expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) 41 tm.assert_frame_equal(result, expected) 42 43 44def test_to_jsonl(): 45 # GH9180 46 df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) 47 result = df.to_json(orient="records", lines=True) 48 expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n' 49 assert result == expected 50 51 df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"]) 52 result = df.to_json(orient="records", lines=True) 53 expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n' 54 assert result == expected 55 tm.assert_frame_equal(read_json(result, lines=True), df) 56 57 # GH15096: escaped characters in columns and data 58 df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) 59 result = df.to_json(orient="records", lines=True) 60 expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n' 61 assert result == expected 62 tm.assert_frame_equal(read_json(result, lines=True), df) 63 64 65def test_to_jsonl_count_new_lines(): 66 # GH36888 67 df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) 68 actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n") 69 expected_new_lines_count = 2 70 assert actual_new_lines_count == expected_new_lines_count 71 72 73@pytest.mark.parametrize("chunksize", [1, 1.0]) 74def test_readjson_chunks(lines_json_df, chunksize): 75 # Basic test that read_json(chunks=True) gives the same result as 76 # read_json(chunks=False) 77 # GH17048: memory usage when lines=True 78 79 unchunked = read_json(StringIO(lines_json_df), lines=True) 80 with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader: 81 chunked = pd.concat(reader) 82 83 tm.assert_frame_equal(chunked, unchunked) 84 85 86def test_readjson_chunksize_requires_lines(lines_json_df): 87 msg = "chunksize can only be passed if lines=True" 88 with pytest.raises(ValueError, match=msg): 89 with pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _: 90 pass 91 92 93def test_readjson_chunks_series(): 94 # Test reading line-format JSON to Series with chunksize param 95 s = pd.Series({"A": 1, "B": 2}) 96 97 strio = StringIO(s.to_json(lines=True, orient="records")) 98 unchunked = pd.read_json(strio, lines=True, typ="Series") 99 100 strio = StringIO(s.to_json(lines=True, orient="records")) 101 with pd.read_json(strio, lines=True, typ="Series", chunksize=1) as reader: 102 chunked = pd.concat(reader) 103 104 tm.assert_series_equal(chunked, unchunked) 105 106 107def test_readjson_each_chunk(lines_json_df): 108 # Other tests check that the final result of read_json(chunksize=True) 109 # is correct. This checks the intermediate chunks. 110 with pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader: 111 chunks = list(reader) 112 assert chunks[0].shape == (2, 2) 113 assert chunks[1].shape == (1, 2) 114 115 116def test_readjson_chunks_from_file(): 117 with tm.ensure_clean("test.json") as path: 118 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 119 df.to_json(path, lines=True, orient="records") 120 with pd.read_json(path, lines=True, chunksize=1) as reader: 121 chunked = pd.concat(reader) 122 unchunked = pd.read_json(path, lines=True) 123 tm.assert_frame_equal(unchunked, chunked) 124 125 126@pytest.mark.parametrize("chunksize", [None, 1]) 127def test_readjson_chunks_closes(chunksize): 128 with tm.ensure_clean("test.json") as path: 129 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 130 df.to_json(path, lines=True, orient="records") 131 reader = JsonReader( 132 path, 133 orient=None, 134 typ="frame", 135 dtype=True, 136 convert_axes=True, 137 convert_dates=True, 138 keep_default_dates=True, 139 numpy=False, 140 precise_float=False, 141 date_unit=None, 142 encoding=None, 143 lines=True, 144 chunksize=chunksize, 145 compression=None, 146 nrows=None, 147 ) 148 with reader: 149 reader.read() 150 assert ( 151 reader.handles.handle.closed 152 ), f"didn't close stream with chunksize = {chunksize}" 153 154 155@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) 156def test_readjson_invalid_chunksize(lines_json_df, chunksize): 157 msg = r"'chunksize' must be an integer >=1" 158 159 with pytest.raises(ValueError, match=msg): 160 with pd.read_json( 161 StringIO(lines_json_df), lines=True, chunksize=chunksize 162 ) as _: 163 pass 164 165 166@pytest.mark.parametrize("chunksize", [None, 1, 2]) 167def test_readjson_chunks_multiple_empty_lines(chunksize): 168 j = """ 169 170 {"A":1,"B":4} 171 172 173 174 {"A":2,"B":5} 175 176 177 178 179 180 181 182 {"A":3,"B":6} 183 """ 184 orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 185 test = pd.read_json(j, lines=True, chunksize=chunksize) 186 if chunksize is not None: 187 with test: 188 test = pd.concat(test) 189 tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") 190 191 192def test_readjson_unicode(monkeypatch): 193 with tm.ensure_clean("test.json") as path: 194 monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") 195 with open(path, "w", encoding="utf-8") as f: 196 f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') 197 198 result = read_json(path) 199 expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) 200 tm.assert_frame_equal(result, expected) 201 202 203@pytest.mark.parametrize("nrows", [1, 2]) 204def test_readjson_nrows(nrows): 205 # GH 33916 206 # Test reading line-format JSON to Series with nrows param 207 jsonl = """{"a": 1, "b": 2} 208 {"a": 3, "b": 4} 209 {"a": 5, "b": 6} 210 {"a": 7, "b": 8}""" 211 result = pd.read_json(jsonl, lines=True, nrows=nrows) 212 expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] 213 tm.assert_frame_equal(result, expected) 214 215 216@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) 217def test_readjson_nrows_chunks(nrows, chunksize): 218 # GH 33916 219 # Test reading line-format JSON to Series with nrows and chunksize param 220 jsonl = """{"a": 1, "b": 2} 221 {"a": 3, "b": 4} 222 {"a": 5, "b": 6} 223 {"a": 7, "b": 8}""" 224 with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader: 225 chunked = pd.concat(reader) 226 expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] 227 tm.assert_frame_equal(chunked, expected) 228 229 230def test_readjson_nrows_requires_lines(): 231 # GH 33916 232 # Test ValuError raised if nrows is set without setting lines in read_json 233 jsonl = """{"a": 1, "b": 2} 234 {"a": 3, "b": 4} 235 {"a": 5, "b": 6} 236 {"a": 7, "b": 8}""" 237 msg = "nrows can only be passed if lines=True" 238 with pytest.raises(ValueError, match=msg): 239 pd.read_json(jsonl, lines=False, nrows=2) 240 241 242def test_readjson_lines_chunks_fileurl(datapath): 243 # GH 27135 244 # Test reading line-format JSON from file url 245 df_list_expected = [ 246 DataFrame([[1, 2]], columns=["a", "b"], index=[0]), 247 DataFrame([[3, 4]], columns=["a", "b"], index=[1]), 248 DataFrame([[5, 6]], columns=["a", "b"], index=[2]), 249 ] 250 os_path = datapath("io", "json", "data", "line_delimited.json") 251 file_url = Path(os_path).as_uri() 252 with pd.read_json(file_url, lines=True, chunksize=1) as url_reader: 253 for index, chuck in enumerate(url_reader): 254 tm.assert_frame_equal(chuck, df_list_expected[index]) 255 256 257def test_chunksize_is_incremental(): 258 # See https://github.com/pandas-dev/pandas/issues/34548 259 jsonl = ( 260 """{"a": 1, "b": 2} 261 {"a": 3, "b": 4} 262 {"a": 5, "b": 6} 263 {"a": 7, "b": 8}\n""" 264 * 1000 265 ) 266 267 class MyReader: 268 def __init__(self, contents): 269 self.read_count = 0 270 self.stringio = StringIO(contents) 271 272 def read(self, *args): 273 self.read_count += 1 274 return self.stringio.read(*args) 275 276 def __iter__(self): 277 self.read_count += 1 278 return iter(self.stringio) 279 280 reader = MyReader(jsonl) 281 assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1 282 assert reader.read_count > 10 283