1from __future__ import absolute_import, print_function, division 2 3 4import operator 5from petl.compat import text_type 6 7 8from petl.util.base import Table, asindices, itervalues 9from petl.transform.sorts import sort 10 11 12def duplicates(table, key=None, presorted=False, buffersize=None, tempdir=None, 13 cache=True): 14 """ 15 Select rows with duplicate values under a given key (or duplicate 16 rows where no key is given). E.g.:: 17 18 >>> import petl as etl 19 >>> table1 = [['foo', 'bar', 'baz'], 20 ... ['A', 1, 2.0], 21 ... ['B', 2, 3.4], 22 ... ['D', 6, 9.3], 23 ... ['B', 3, 7.8], 24 ... ['B', 2, 12.3], 25 ... ['E', None, 1.3], 26 ... ['D', 4, 14.5]] 27 >>> table2 = etl.duplicates(table1, 'foo') 28 >>> table2 29 +-----+-----+------+ 30 | foo | bar | baz | 31 +=====+=====+======+ 32 | 'B' | 2 | 3.4 | 33 +-----+-----+------+ 34 | 'B' | 3 | 7.8 | 35 +-----+-----+------+ 36 | 'B' | 2 | 12.3 | 37 +-----+-----+------+ 38 | 'D' | 6 | 9.3 | 39 +-----+-----+------+ 40 | 'D' | 4 | 14.5 | 41 +-----+-----+------+ 42 43 >>> # compound keys are supported 44 ... table3 = etl.duplicates(table1, key=['foo', 'bar']) 45 >>> table3 46 +-----+-----+------+ 47 | foo | bar | baz | 48 +=====+=====+======+ 49 | 'B' | 2 | 3.4 | 50 +-----+-----+------+ 51 | 'B' | 2 | 12.3 | 52 +-----+-----+------+ 53 54 If `presorted` is True, it is assumed that the data are already sorted by 55 the given key, and the `buffersize`, `tempdir` and `cache` arguments are 56 ignored. Otherwise, the data are sorted, see also the discussion of the 57 `buffersize`, `tempdir` and `cache` arguments under the 58 :func:`petl.transform.sorts.sort` function. 59 60 See also :func:`petl.transform.dedup.unique` and 61 :func:`petl.transform.dedup.distinct`. 62 63 """ 64 65 return DuplicatesView(table, key=key, presorted=presorted, 66 buffersize=buffersize, tempdir=tempdir, cache=cache) 67 68 69Table.duplicates = duplicates 70 71 72class DuplicatesView(Table): 73 74 def __init__(self, source, key=None, presorted=False, buffersize=None, 75 tempdir=None, cache=True): 76 if presorted: 77 self.source = source 78 else: 79 self.source = sort(source, key, buffersize=buffersize, 80 tempdir=tempdir, cache=cache) 81 self.key = key 82 83 def __iter__(self): 84 return iterduplicates(self.source, self.key) 85 86 87def iterduplicates(source, key): 88 # assume source is sorted 89 # first need to sort the data 90 it = iter(source) 91 92 hdr = next(it) 93 yield tuple(hdr) 94 95 # convert field selection into field indices 96 if key is None: 97 indices = range(len(hdr)) 98 else: 99 indices = asindices(hdr, key) 100 101 # now use field indices to construct a _getkey function 102 # N.B., this may raise an exception on short rows, depending on 103 # the field selection 104 getkey = operator.itemgetter(*indices) 105 106 previous = None 107 previous_yielded = False 108 109 for row in it: 110 if previous is None: 111 previous = row 112 else: 113 kprev = getkey(previous) 114 kcurr = getkey(row) 115 if kprev == kcurr: 116 if not previous_yielded: 117 yield tuple(previous) 118 previous_yielded = True 119 yield tuple(row) 120 else: 121 # reset 122 previous_yielded = False 123 previous = row 124 125 126def unique(table, key=None, presorted=False, buffersize=None, tempdir=None, 127 cache=True): 128 """ 129 Select rows with unique values under a given key (or unique rows 130 if no key is given). E.g.:: 131 132 >>> import petl as etl 133 >>> table1 = [['foo', 'bar', 'baz'], 134 ... ['A', 1, 2], 135 ... ['B', '2', '3.4'], 136 ... ['D', 'xyz', 9.0], 137 ... ['B', u'3', u'7.8'], 138 ... ['B', '2', 42], 139 ... ['E', None, None], 140 ... ['D', 4, 12.3], 141 ... ['F', 7, 2.3]] 142 >>> table2 = etl.unique(table1, 'foo') 143 >>> table2 144 +-----+------+------+ 145 | foo | bar | baz | 146 +=====+======+======+ 147 | 'A' | 1 | 2 | 148 +-----+------+------+ 149 | 'E' | None | None | 150 +-----+------+------+ 151 | 'F' | 7 | 2.3 | 152 +-----+------+------+ 153 154 If `presorted` is True, it is assumed that the data are already sorted by 155 the given key, and the `buffersize`, `tempdir` and `cache` arguments are 156 ignored. Otherwise, the data are sorted, see also the discussion of the 157 `buffersize`, `tempdir` and `cache` arguments under the 158 :func:`petl.transform.sorts.sort` function. 159 160 See also :func:`petl.transform.dedup.duplicates` and 161 :func:`petl.transform.dedup.distinct`. 162 163 """ 164 165 return UniqueView(table, key=key, presorted=presorted, 166 buffersize=buffersize, tempdir=tempdir, cache=cache) 167 168 169Table.unique = unique 170 171 172class UniqueView(Table): 173 174 def __init__(self, source, key=None, presorted=False, buffersize=None, 175 tempdir=None, cache=True): 176 if presorted: 177 self.source = source 178 else: 179 self.source = sort(source, key, buffersize=buffersize, 180 tempdir=tempdir, cache=cache) 181 self.key = key 182 183 def __iter__(self): 184 return iterunique(self.source, self.key) 185 186 187def iterunique(source, key): 188 # assume source is sorted 189 # first need to sort the data 190 it = iter(source) 191 192 hdr = next(it) 193 yield tuple(hdr) 194 195 # convert field selection into field indices 196 if key is None: 197 indices = range(len(hdr)) 198 else: 199 indices = asindices(hdr, key) 200 201 # now use field indices to construct a _getkey function 202 # N.B., this may raise an exception on short rows, depending on 203 # the field selection 204 getkey = operator.itemgetter(*indices) 205 206 try: 207 prev = next(it) 208 except StopIteration: 209 return 210 211 prev_key = getkey(prev) 212 prev_comp_ne = True 213 214 for curr in it: 215 curr_key = getkey(curr) 216 curr_comp_ne = (curr_key != prev_key) 217 if prev_comp_ne and curr_comp_ne: 218 yield tuple(prev) 219 prev = curr 220 prev_key = curr_key 221 prev_comp_ne = curr_comp_ne 222 223 # last one? 224 if prev_comp_ne: 225 yield prev 226 227 228def conflicts(table, key, missing=None, include=None, exclude=None, 229 presorted=False, buffersize=None, tempdir=None, cache=True): 230 """ 231 Select rows with the same key value but differing in some other field. 232 E.g.:: 233 234 >>> import petl as etl 235 >>> table1 = [['foo', 'bar', 'baz'], 236 ... ['A', 1, 2.7], 237 ... ['B', 2, None], 238 ... ['D', 3, 9.4], 239 ... ['B', None, 7.8], 240 ... ['E', None], 241 ... ['D', 3, 12.3], 242 ... ['A', 2, None]] 243 >>> table2 = etl.conflicts(table1, 'foo') 244 >>> table2 245 +-----+-----+------+ 246 | foo | bar | baz | 247 +=====+=====+======+ 248 | 'A' | 1 | 2.7 | 249 +-----+-----+------+ 250 | 'A' | 2 | None | 251 +-----+-----+------+ 252 | 'D' | 3 | 9.4 | 253 +-----+-----+------+ 254 | 'D' | 3 | 12.3 | 255 +-----+-----+------+ 256 257 Missing values are not considered conflicts. By default, `None` is treated 258 as the missing value, this can be changed via the `missing` keyword 259 argument. 260 261 One or more fields can be ignored when determining conflicts by providing 262 the `exclude` keyword argument. Alternatively, fields to use when 263 determining conflicts can be specified explicitly with the `include` 264 keyword argument. This provides a simple mechanism for analysing the 265 source of conflicting rows from multiple tables, e.g.:: 266 267 >>> table1 = [['foo', 'bar'], [1, 'a'], [2, 'b']] 268 >>> table2 = [['foo', 'bar'], [1, 'a'], [2, 'c']] 269 >>> table3 = etl.cat(etl.addfield(table1, 'source', 1), 270 ... etl.addfield(table2, 'source', 2)) 271 >>> table4 = etl.conflicts(table3, key='foo', exclude='source') 272 >>> table4 273 +-----+-----+--------+ 274 | foo | bar | source | 275 +=====+=====+========+ 276 | 2 | 'b' | 1 | 277 +-----+-----+--------+ 278 | 2 | 'c' | 2 | 279 +-----+-----+--------+ 280 281 If `presorted` is True, it is assumed that the data are already sorted by 282 the given key, and the `buffersize`, `tempdir` and `cache` arguments are 283 ignored. Otherwise, the data are sorted, see also the discussion of the 284 `buffersize`, `tempdir` and `cache` arguments under the 285 :func:`petl.transform.sorts.sort` function. 286 287 """ 288 289 return ConflictsView(table, key, missing=missing, exclude=exclude, 290 include=include, presorted=presorted, 291 buffersize=buffersize, tempdir=tempdir, cache=cache) 292 293 294Table.conflicts = conflicts 295 296 297class ConflictsView(Table): 298 299 def __init__(self, source, key, missing=None, exclude=None, include=None, 300 presorted=False, buffersize=None, tempdir=None, cache=True): 301 if presorted: 302 self.source = source 303 else: 304 self.source = sort(source, key, buffersize=buffersize, 305 tempdir=tempdir, cache=cache) 306 self.key = key 307 self.missing = missing 308 self.exclude = exclude 309 self.include = include 310 311 def __iter__(self): 312 return iterconflicts(self.source, self.key, self.missing, self.exclude, 313 self.include) 314 315 316def iterconflicts(source, key, missing, exclude, include): 317 318 # normalise arguments 319 if exclude and not isinstance(exclude, (list, tuple)): 320 exclude = (exclude,) 321 if include and not isinstance(include, (list, tuple)): 322 include = (include,) 323 324 # exclude overrides include 325 if include and exclude: 326 include = None 327 328 it = iter(source) 329 hdr = next(it) 330 flds = list(map(text_type, hdr)) 331 yield tuple(hdr) 332 333 # convert field selection into field indices 334 indices = asindices(hdr, key) 335 336 # now use field indices to construct a _getkey function 337 # N.B., this may raise an exception on short rows, depending on 338 # the field selection 339 getkey = operator.itemgetter(*indices) 340 341 previous = None 342 previous_yielded = False 343 344 for row in it: 345 if previous is None: 346 previous = row 347 else: 348 kprev = getkey(previous) 349 kcurr = getkey(row) 350 if kprev == kcurr: 351 # is there a conflict? 352 conflict = False 353 for x, y, f in zip(previous, row, flds): 354 if (exclude and f not in exclude) \ 355 or (include and f in include) \ 356 or (not exclude and not include): 357 if missing not in (x, y) and x != y: 358 conflict = True 359 break 360 if conflict: 361 if not previous_yielded: 362 yield tuple(previous) 363 previous_yielded = True 364 yield tuple(row) 365 else: 366 # reset 367 previous_yielded = False 368 previous = row 369 370 371def distinct(table, key=None, count=None, presorted=False, buffersize=None, 372 tempdir=None, cache=True): 373 """ 374 Return only distinct rows in the table. 375 376 If the `count` argument is not None, it will be used as the name for an 377 additional field, and the values of the field will be the number of 378 duplicate rows. 379 380 If the `key` keyword argument is passed, the comparison is done on the 381 given key instead of the full row. 382 383 See also :func:`petl.transform.dedup.duplicates`, 384 :func:`petl.transform.dedup.unique`, 385 :func:`petl.transform.reductions.groupselectfirst`, 386 :func:`petl.transform.reductions.groupselectlast`. 387 388 """ 389 390 return DistinctView(table, key=key, count=count, presorted=presorted, 391 buffersize=buffersize, tempdir=tempdir, cache=cache) 392 393 394Table.distinct = distinct 395 396 397class DistinctView(Table): 398 def __init__(self, table, key=None, count=None, presorted=False, 399 buffersize=None, tempdir=None, cache=True): 400 if presorted: 401 self.table = table 402 else: 403 self.table = sort(table, key=key, buffersize=buffersize, 404 tempdir=tempdir, cache=cache) 405 self.key = key 406 self.count = count 407 408 def __iter__(self): 409 it = iter(self.table) 410 hdr = next(it) 411 412 # convert field selection into field indices 413 if self.key is None: 414 indices = range(len(hdr)) 415 else: 416 indices = asindices(hdr, self.key) 417 418 # now use field indices to construct a _getkey function 419 # N.B., this may raise an exception on short rows, depending on 420 # the field selection 421 getkey = operator.itemgetter(*indices) 422 423 INIT = object() 424 if self.count: 425 hdr = tuple(hdr) + (self.count,) 426 yield hdr 427 previous = INIT 428 n_dup = 1 429 for row in it: 430 if previous is INIT: 431 previous = row 432 else: 433 kprev = getkey(previous) 434 kcurr = getkey(row) 435 if kprev == kcurr: 436 n_dup += 1 437 else: 438 yield tuple(previous) + (n_dup,) 439 n_dup = 1 440 previous = row 441 # deal with last row 442 yield tuple(previous) + (n_dup,) 443 else: 444 yield tuple(hdr) 445 previous_keys = INIT 446 for row in it: 447 keys = getkey(row) 448 if keys != previous_keys: 449 yield tuple(row) 450 previous_keys = keys 451 452 453def isunique(table, field): 454 """ 455 Return True if there are no duplicate values for the given field(s), 456 otherwise False. E.g.:: 457 458 >>> import petl as etl 459 >>> table1 = [['foo', 'bar'], 460 ... ['a', 1], 461 ... ['b'], 462 ... ['b', 2], 463 ... ['c', 3, True]] 464 >>> etl.isunique(table1, 'foo') 465 False 466 >>> etl.isunique(table1, 'bar') 467 True 468 469 The `field` argument can be a single field name or index (starting from 470 zero) or a tuple of field names and/or indexes. 471 472 """ 473 474 vals = set() 475 for v in itervalues(table, field): 476 if v in vals: 477 return False 478 else: 479 vals.add(v) 480 return True 481 482 483Table.isunique = isunique 484