# -*- coding: utf-8 -*-
"""Implements a lazy JSON file class that wraps around json data."""
import io
import json
import weakref
import contextlib
import collections.abc as cabc


def _to_json_with_size(obj, offset=0, sort_keys=False):
    """Serializes ``obj`` to JSON while recording, for every node, its
    offset from the start of the data region and its serialized size.

    Parameters
    ----------
    obj : object
        JSON-serializable object (str, Mapping, Sequence, or scalar).
    offset : int, optional
        Offset of this node from the start of the data region.
    sort_keys : bool, optional
        Whether mapping keys are sorted in the output.

    Returns
    -------
    s : str
        The JSON text for this node.
    o : dict, list, or int
        Offsets, mirroring the structure of ``obj``.
    n : int
        Total serialized length of this node.
    size : dict, list, or int
        Sizes, mirroring the structure of ``obj``.
    """
    # NOTE: str must be tested before Mapping/Sequence because str is
    # itself a Sequence.
    if isinstance(obj, str):
        s = json.dumps(obj)
        o = offset
        n = size = len(s.encode())  # size in bytes
    elif isinstance(obj, cabc.Mapping):
        s = "{"
        j = offset + 1
        o = {}
        size = {}
        items = sorted(obj.items()) if sort_keys else obj.items()
        for key, val in items:
            s_k, o_k, n_k, size_k = _to_json_with_size(
                key, offset=j, sort_keys=sort_keys
            )
            s += s_k + ": "
            j += n_k + 2  # +2 for the ": " separator
            s_v, o_v, n_v, size_v = _to_json_with_size(
                val, offset=j, sort_keys=sort_keys
            )
            o[key] = o_v
            size[key] = size_v
            s += s_v + ", "
            j += n_v + 2  # +2 for the ", " separator
        if s.endswith(", "):
            s = s[:-2]
        s += "}\n"
        n = len(s)
        # the special '__total__' key records the node's own offset/size
        o["__total__"] = offset
        size["__total__"] = n
    elif isinstance(obj, cabc.Sequence):
        s = "["
        j = offset + 1
        o = []
        size = []
        for x in obj:
            s_x, o_x, n_x, size_x = _to_json_with_size(x, offset=j, sort_keys=sort_keys)
            o.append(o_x)
            size.append(size_x)
            s += s_x + ", "
            j += n_x + 2  # +2 for the ", " separator
        if s.endswith(", "):
            s = s[:-2]
        s += "]\n"
        n = len(s)
        # the trailing element records the node's own offset/size
        o.append(offset)
        size.append(n)
    else:
        s = json.dumps(obj, sort_keys=sort_keys)
        o = offset
        n = size = len(s)
    return s, o, n, size


def index(obj, sort_keys=False):
    """Creates an index for a JSON file.

    Returns the serialized data string and a dict with 'offsets' and
    'sizes' entries mirroring the structure of ``obj``.
    """
    idx = {}
    json_obj = _to_json_with_size(obj, sort_keys=sort_keys)
    s, idx["offsets"], _, idx["sizes"] = json_obj
    return s, idx


# Layout template for a lazy JSON file.  The ten-character right-aligned
# "locs" fields keep the header a fixed width so it can be read with a
# fixed seek/read in _load_index.
JSON_FORMAT = """{{"locs": [{iloc:>10}, {ilen:>10}, {dloc:>10}, {dlen:>10}],
 "index": {index},
 "data": {data}
}}
"""


def dumps(obj, sort_keys=False):
    """Dumps an object to JSON with an index."""
    data, idx = index(obj, sort_keys=sort_keys)
    jdx = json.dumps(idx, sort_keys=sort_keys)
    # 69 == fixed byte offset of the index in JSON_FORMAT:
    # len('{"locs": [..58 chars..],\n "index": ')
    iloc = 69
    ilen = len(jdx)
    # 11 == len(',\n "data": '), the text between the index and the data
    dloc = iloc + ilen + 11
    dlen = len(data)
    s = JSON_FORMAT.format(
        index=jdx, data=data, iloc=iloc, ilen=ilen, dloc=dloc, dlen=dlen
    )
    return s


def ljdump(obj, fp, sort_keys=False):
    """Dumps an object to JSON file."""
    s = dumps(obj, sort_keys=sort_keys)
    fp.write(s)


class LJNode(cabc.Mapping, cabc.Sequence):
    """A proxy node for JSON nodes. Acts as both sequence and mapping."""

    def __init__(self, offsets, sizes, root):
        """Parameters
        ----------
        offsets : dict, list, or int
            offsets of corresponding data structure, in bytes
        sizes : dict, list, or int
            sizes of corresponding data structure, in bytes
        root : weakref.proxy of LazyJSON
            weakref back to root node, which should be a LazyJSON object.
        """
        self.offsets = offsets
        self.sizes = sizes
        self.root = root
        self.is_mapping = isinstance(self.offsets, cabc.Mapping)
        self.is_sequence = isinstance(self.offsets, cabc.Sequence)

    def __len__(self):
        # recall that for maps, the '__total__' key is added and for
        # sequences the last element represents the total size/offset.
        return len(self.sizes) - 1

    def load(self):
        """Returns the Python data structure represented by the node.

        Raises
        ------
        TypeError
            If the node's offsets are not a mapping, sequence, or int.
        """
        if self.is_mapping:
            offset = self.offsets["__total__"]
            size = self.sizes["__total__"]
        elif self.is_sequence:
            offset = self.offsets[-1]
            size = self.sizes[-1]
        elif isinstance(self.offsets, int):
            offset = self.offsets
            size = self.sizes
        else:
            # previously fell through and raised UnboundLocalError below
            raise TypeError("incorrect types for offset node")
        return self._load_or_node(offset, size)

    def _load_or_node(self, offset, size):
        # Scalar offsets are read straight from the file; structured
        # offsets become child proxy nodes that load lazily.
        if isinstance(offset, int):
            with self.root._open(newline="\n") as f:
                f.seek(self.root.dloc + offset)
                s = f.read(size)
            val = json.loads(s)
        elif isinstance(offset, (cabc.Mapping, cabc.Sequence)):
            val = LJNode(offset, size, self.root)
        else:
            raise TypeError("incorrect types for offset node")
        return val

    def _getitem_mapping(self, key):
        if key == "__total__":
            raise KeyError('"__total__" is a special LazyJSON key!')
        offset = self.offsets[key]
        size = self.sizes[key]
        return self._load_or_node(offset, size)

    def _getitem_sequence(self, key):
        if isinstance(key, int):
            rtn = self._load_or_node(self.offsets[key], self.sizes[key])
        elif isinstance(key, slice):
            key = slice(*key.indices(len(self)))
            rtn = list(map(self._load_or_node, self.offsets[key], self.sizes[key]))
        else:
            raise TypeError("only integer indexing available")
        return rtn

    def __getitem__(self, key):
        if self.is_mapping:
            rtn = self._getitem_mapping(key)
        elif self.is_sequence:
            rtn = self._getitem_sequence(key)
        else:
            raise NotImplementedError
        return rtn

    def __iter__(self):
        if self.is_mapping:
            keys = set(self.offsets.keys())
            keys.discard("__total__")
            yield from iter(keys)
        elif self.is_sequence:
            i = 0
            n = len(self)
            while i < n:
                yield self._load_or_node(self.offsets[i], self.sizes[i])
                i += 1
        else:
            raise NotImplementedError


class LazyJSON(LJNode):
    """Represents a lazy json file. Can be used like a normal Python
    dict or list.
    """

    def __init__(self, f, reopen=True):
        """Parameters
        ----------
        f : file handle or str
            JSON file to open.
        reopen : bool, optional
            Whether new file handle should be opened for each load.
        """
        self._f = f
        self.reopen = reopen
        if not reopen and isinstance(f, str):
            self._f = open(f, "r", newline="\n")
        self._load_index()
        self.root = weakref.proxy(self)
        self.is_mapping = isinstance(self.offsets, cabc.Mapping)
        self.is_sequence = isinstance(self.offsets, cabc.Sequence)

    def __del__(self):
        self.close()

    def close(self):
        """Close the file handle, if appropriate."""
        # getattr guards: __del__ may call this on a partially
        # initialized instance (e.g. if open() failed in __init__).
        if not getattr(self, "reopen", True) and isinstance(
            getattr(self, "_f", None), io.IOBase
        ):
            try:
                self._f.close()
            except OSError:
                pass

    @contextlib.contextmanager
    def _open(self, *args, **kwargs):
        if self.reopen and isinstance(self._f, str):
            f = open(self._f, *args, **kwargs)
            try:
                yield f
            finally:
                # previously the handle leaked if the caller raised
                f.close()
        else:
            yield self._f

    def _load_index(self):
        """Loads the index from the start of the file."""
        with self._open(newline="\n") as f:
            # read in the location data: seek past '{"locs": ' (9 bytes)
            # and read the fixed-width 48-byte locs array.
            f.seek(9)
            locs = f.read(48)
            locs = json.loads(locs)
            self.iloc, self.ilen, self.dloc, self.dlen = locs
            # read in the index
            f.seek(self.iloc)
            idx = f.read(self.ilen)
            idx = json.loads(idx)
        self.offsets = idx["offsets"]
        self.sizes = idx["sizes"]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()