from __future__ import absolute_import, print_function, division


import operator
from petl.compat import text_type


from petl.util.base import Table, asindices, itervalues
from petl.transform.sorts import sort


def duplicates(table, key=None, presorted=False, buffersize=None, tempdir=None,
               cache=True):
    """
    Select rows with duplicate values under a given key (or duplicate
    rows where no key is given). E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['A', 1, 2.0],
        ...           ['B', 2, 3.4],
        ...           ['D', 6, 9.3],
        ...           ['B', 3, 7.8],
        ...           ['B', 2, 12.3],
        ...           ['E', None, 1.3],
        ...           ['D', 4, 14.5]]
        >>> table2 = etl.duplicates(table1, 'foo')
        >>> table2
        +-----+-----+------+
        | foo | bar | baz  |
        +=====+=====+======+
        | 'B' |   2 |  3.4 |
        +-----+-----+------+
        | 'B' |   3 |  7.8 |
        +-----+-----+------+
        | 'B' |   2 | 12.3 |
        +-----+-----+------+
        | 'D' |   6 |  9.3 |
        +-----+-----+------+
        | 'D' |   4 | 14.5 |
        +-----+-----+------+

        >>> # compound keys are supported
        ... table3 = etl.duplicates(table1, key=['foo', 'bar'])
        >>> table3
        +-----+-----+------+
        | foo | bar | baz  |
        +=====+=====+======+
        | 'B' |   2 |  3.4 |
        +-----+-----+------+
        | 'B' |   2 | 12.3 |
        +-----+-----+------+

    If `presorted` is True, it is assumed that the data are already sorted by
    the given key, and the `buffersize`, `tempdir` and `cache` arguments are
    ignored. Otherwise, the data are sorted; see also the discussion of the
    `buffersize`, `tempdir` and `cache` arguments under the
    :func:`petl.transform.sorts.sort` function.
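
    For example (a usage sketch reusing `table1` from above), the sorting step
    can be skipped when the rows are already ordered by the key::

        >>> table1_sorted = etl.sort(table1, key='foo')
        >>> table4 = etl.duplicates(table1_sorted, 'foo', presorted=True)

    Here `table4` contains the same rows as `table2` above.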

    See also :func:`petl.transform.dedup.unique` and
    :func:`petl.transform.dedup.distinct`.

    """

    return DuplicatesView(table, key=key, presorted=presorted,
                          buffersize=buffersize, tempdir=tempdir, cache=cache)


Table.duplicates = duplicates


class DuplicatesView(Table):

    def __init__(self, source, key=None, presorted=False, buffersize=None,
                 tempdir=None, cache=True):
        if presorted:
            self.source = source
        else:
            self.source = sort(source, key, buffersize=buffersize,
                               tempdir=tempdir, cache=cache)
        self.key = key

    def __iter__(self):
        return iterduplicates(self.source, self.key)


def iterduplicates(source, key):
    # assume the source is already sorted by the key
    # (DuplicatesView sorts it if necessary)
    it = iter(source)

    hdr = next(it)
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    previous = None
    previous_yielded = False

    for row in it:
        if previous is None:
            previous = row
        else:
            kprev = getkey(previous)
            kcurr = getkey(row)
            if kprev == kcurr:
                if not previous_yielded:
                    yield tuple(previous)
                    previous_yielded = True
                yield tuple(row)
            else:
                # reset
                previous_yielded = False
            previous = row


def unique(table, key=None, presorted=False, buffersize=None, tempdir=None,
           cache=True):
    """
    Select rows with unique values under a given key (or unique rows
    if no key is given). E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['A', 1, 2],
        ...           ['B', '2', '3.4'],
        ...           ['D', 'xyz', 9.0],
        ...           ['B', u'3', u'7.8'],
        ...           ['B', '2', 42],
        ...           ['E', None, None],
        ...           ['D', 4, 12.3],
        ...           ['F', 7, 2.3]]
        >>> table2 = etl.unique(table1, 'foo')
        >>> table2
        +-----+------+------+
        | foo | bar  | baz  |
        +=====+======+======+
        | 'A' |    1 |    2 |
        +-----+------+------+
        | 'E' | None | None |
        +-----+------+------+
        | 'F' |    7 |  2.3 |
        +-----+------+------+

    If `presorted` is True, it is assumed that the data are already sorted by
    the given key, and the `buffersize`, `tempdir` and `cache` arguments are
    ignored. Otherwise, the data are sorted; see also the discussion of the
    `buffersize`, `tempdir` and `cache` arguments under the
    :func:`petl.transform.sorts.sort` function.

    See also :func:`petl.transform.dedup.duplicates` and
    :func:`petl.transform.dedup.distinct`.

    """

    return UniqueView(table, key=key, presorted=presorted,
                      buffersize=buffersize, tempdir=tempdir, cache=cache)


Table.unique = unique


class UniqueView(Table):

    def __init__(self, source, key=None, presorted=False, buffersize=None,
                 tempdir=None, cache=True):
        if presorted:
            self.source = source
        else:
            self.source = sort(source, key, buffersize=buffersize,
                               tempdir=tempdir, cache=cache)
        self.key = key

    def __iter__(self):
        return iterunique(self.source, self.key)


def iterunique(source, key):
    # assume the source is already sorted by the key
    # (UniqueView sorts it if necessary)
    it = iter(source)

    hdr = next(it)
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    try:
        prev = next(it)
    except StopIteration:
        return

    prev_key = getkey(prev)
    prev_comp_ne = True

    for curr in it:
        curr_key = getkey(curr)
        curr_comp_ne = (curr_key != prev_key)
        if prev_comp_ne and curr_comp_ne:
            yield tuple(prev)
        prev = curr
        prev_key = curr_key
        prev_comp_ne = curr_comp_ne

    # last one?
    if prev_comp_ne:
        yield tuple(prev)


def conflicts(table, key, missing=None, include=None, exclude=None,
              presorted=False, buffersize=None, tempdir=None, cache=True):
    """
    Select rows with the same key value but differing in some other field.
    E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['A', 1, 2.7],
        ...           ['B', 2, None],
        ...           ['D', 3, 9.4],
        ...           ['B', None, 7.8],
        ...           ['E', None],
        ...           ['D', 3, 12.3],
        ...           ['A', 2, None]]
        >>> table2 = etl.conflicts(table1, 'foo')
        >>> table2
        +-----+-----+------+
        | foo | bar | baz  |
        +=====+=====+======+
        | 'A' |   1 |  2.7 |
        +-----+-----+------+
        | 'A' |   2 | None |
        +-----+-----+------+
        | 'D' |   3 |  9.4 |
        +-----+-----+------+
        | 'D' |   3 | 12.3 |
        +-----+-----+------+

    Missing values are not considered conflicts. By default, `None` is treated
    as the missing value; this can be changed via the `missing` keyword
    argument.

    One or more fields can be ignored when determining conflicts by providing
    the `exclude` keyword argument. Alternatively, fields to use when
    determining conflicts can be specified explicitly with the `include`
    keyword argument (if both are given, `exclude` takes precedence and
    `include` is ignored). This provides a simple mechanism for analysing the
    source of conflicting rows from multiple tables, e.g.::

        >>> table1 = [['foo', 'bar'], [1, 'a'], [2, 'b']]
        >>> table2 = [['foo', 'bar'], [1, 'a'], [2, 'c']]
        >>> table3 = etl.cat(etl.addfield(table1, 'source', 1),
        ...                  etl.addfield(table2, 'source', 2))
        >>> table4 = etl.conflicts(table3, key='foo', exclude='source')
        >>> table4
        +-----+-----+--------+
        | foo | bar | source |
        +=====+=====+========+
        |   2 | 'b' |      1 |
        +-----+-----+--------+
        |   2 | 'c' |      2 |
        +-----+-----+--------+

    If `presorted` is True, it is assumed that the data are already sorted by
    the given key, and the `buffersize`, `tempdir` and `cache` arguments are
    ignored. Otherwise, the data are sorted; see also the discussion of the
    `buffersize`, `tempdir` and `cache` arguments under the
    :func:`petl.transform.sorts.sort` function.

    """

    return ConflictsView(table, key, missing=missing, exclude=exclude,
                         include=include, presorted=presorted,
                         buffersize=buffersize, tempdir=tempdir, cache=cache)


Table.conflicts = conflicts


class ConflictsView(Table):

    def __init__(self, source, key, missing=None, exclude=None, include=None,
                 presorted=False, buffersize=None, tempdir=None, cache=True):
        if presorted:
            self.source = source
        else:
            self.source = sort(source, key, buffersize=buffersize,
                               tempdir=tempdir, cache=cache)
        self.key = key
        self.missing = missing
        self.exclude = exclude
        self.include = include

    def __iter__(self):
        return iterconflicts(self.source, self.key, self.missing, self.exclude,
                             self.include)


def iterconflicts(source, key, missing, exclude, include):

    # normalise arguments
    if exclude and not isinstance(exclude, (list, tuple)):
        exclude = (exclude,)
    if include and not isinstance(include, (list, tuple)):
        include = (include,)

    # exclude overrides include
    if include and exclude:
        include = None

    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    # convert field selection into field indices
    indices = asindices(hdr, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    previous = None
    previous_yielded = False

    for row in it:
        if previous is None:
            previous = row
        else:
            kprev = getkey(previous)
            kcurr = getkey(row)
            if kprev == kcurr:
                # is there a conflict?
                conflict = False
                for x, y, f in zip(previous, row, flds):
                    if (exclude and f not in exclude) \
                            or (include and f in include) \
                            or (not exclude and not include):
                        if missing not in (x, y) and x != y:
                            conflict = True
                            break
                if conflict:
                    if not previous_yielded:
                        yield tuple(previous)
                        previous_yielded = True
                    yield tuple(row)
            else:
                # reset
                previous_yielded = False
            previous = row


def distinct(table, key=None, count=None, presorted=False, buffersize=None,
             tempdir=None, cache=True):
    """
    Return only distinct rows in the table.

    If the `count` argument is not None, it will be used as the name for an
    additional field, and the values of the field will give the number of
    occurrences of each distinct row (or key value) in the table.

    If the `key` keyword argument is passed, the comparison is done on the
    given key instead of the full row.
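
    E.g. (an illustrative sketch; the example data and the count field name
    `'n'` are arbitrary)::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['A', 1],
        ...           ['B', 2],
        ...           ['B', 2],
        ...           ['C', 3]]
        >>> table2 = etl.distinct(table1)
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'A' |   1 |
        +-----+-----+
        | 'B' |   2 |
        +-----+-----+
        | 'C' |   3 |
        +-----+-----+

        >>> # or keep a count of how many times each distinct row occurs
        ... table3 = etl.distinct(table1, count='n')
        >>> table3
        +-----+-----+---+
        | foo | bar | n |
        +=====+=====+===+
        | 'A' |   1 | 1 |
        +-----+-----+---+
        | 'B' |   2 | 2 |
        +-----+-----+---+
        | 'C' |   3 | 1 |
        +-----+-----+---+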

    See also :func:`petl.transform.dedup.duplicates`,
    :func:`petl.transform.dedup.unique`,
    :func:`petl.transform.reductions.groupselectfirst`,
    :func:`petl.transform.reductions.groupselectlast`.

    """

    return DistinctView(table, key=key, count=count, presorted=presorted,
                        buffersize=buffersize, tempdir=tempdir, cache=cache)


Table.distinct = distinct


class DistinctView(Table):
    def __init__(self, table, key=None, count=None, presorted=False,
                 buffersize=None, tempdir=None, cache=True):
        if presorted:
            self.table = table
        else:
            self.table = sort(table, key=key, buffersize=buffersize,
                              tempdir=tempdir, cache=cache)
        self.key = key
        self.count = count

    def __iter__(self):
        it = iter(self.table)
        hdr = next(it)

        # convert field selection into field indices
        if self.key is None:
            indices = range(len(hdr))
        else:
            indices = asindices(hdr, self.key)

        # now use field indices to construct a _getkey function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        getkey = operator.itemgetter(*indices)

        INIT = object()
        if self.count:
            hdr = tuple(hdr) + (self.count,)
            yield hdr
            previous = INIT
            n_dup = 1
            for row in it:
                if previous is INIT:
                    previous = row
                else:
                    kprev = getkey(previous)
                    kcurr = getkey(row)
                    if kprev == kcurr:
                        n_dup += 1
                    else:
                        yield tuple(previous) + (n_dup,)
                        n_dup = 1
                        previous = row
            # deal with the last row (unless the table had no data rows)
            if previous is not INIT:
                yield tuple(previous) + (n_dup,)
        else:
            yield tuple(hdr)
            previous_keys = INIT
            for row in it:
                keys = getkey(row)
                if keys != previous_keys:
                    yield tuple(row)
                previous_keys = keys


def isunique(table, field):
    """
    Return True if there are no duplicate values for the given field(s),
    otherwise False. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b'],
        ...           ['b', 2],
        ...           ['c', 3, True]]
        >>> etl.isunique(table1, 'foo')
        False
        >>> etl.isunique(table1, 'bar')
        True

    The `field` argument can be a single field name or index (starting from
    zero) or a tuple of field names and/or indexes.

    """

    vals = set()
    for v in itervalues(table, field):
        if v in vals:
            return False
        else:
            vals.add(v)
    return True


Table.isunique = isunique
