1from __future__ import absolute_import, print_function, division
2
3
4from operator import itemgetter, attrgetter
5from petl.compat import text_type
6
7
8from petl.util.base import asindices, records, Table, values, rowgroupby
9from petl.errors import DuplicateKeyError
10from petl.transform.basics import addfield
11from petl.transform.sorts import sort
12
13
14def tupletree(table, start='start', stop='stop', value=None):
15    """
16    Construct an interval tree for the given table, where each node in the tree
17    is a row of the table.
18
19    """
20
21    import intervaltree
22    tree = intervaltree.IntervalTree()
23    it = iter(table)
24    hdr = next(it)
25    flds = list(map(text_type, hdr))
26    assert start in flds, 'start field not recognised'
27    assert stop in flds, 'stop field not recognised'
28    getstart = itemgetter(flds.index(start))
29    getstop = itemgetter(flds.index(stop))
30    if value is None:
31        getvalue = tuple
32    else:
33        valueindices = asindices(hdr, value)
34        assert len(valueindices) > 0, 'invalid value field specification'
35        getvalue = itemgetter(*valueindices)
36    for row in it:
37        tree.addi(getstart(row), getstop(row), getvalue(row))
38    return tree
39
40
41def facettupletrees(table, key, start='start', stop='stop', value=None):
42    """
43    Construct faceted interval trees for the given table, where each node in
44    the tree is a row of the table.
45
46    """
47
48    import intervaltree
49    it = iter(table)
50    hdr = next(it)
51    flds = list(map(text_type, hdr))
52    assert start in flds, 'start field not recognised'
53    assert stop in flds, 'stop field not recognised'
54    getstart = itemgetter(flds.index(start))
55    getstop = itemgetter(flds.index(stop))
56    if value is None:
57        getvalue = tuple
58    else:
59        valueindices = asindices(hdr, value)
60        assert len(valueindices) > 0, 'invalid value field specification'
61        getvalue = itemgetter(*valueindices)
62    keyindices = asindices(hdr, key)
63    assert len(keyindices) > 0, 'invalid key'
64    getkey = itemgetter(*keyindices)
65
66    trees = dict()
67    for row in it:
68        k = getkey(row)
69        if k not in trees:
70            trees[k] = intervaltree.IntervalTree()
71        trees[k].addi(getstart(row), getstop(row), getvalue(row))
72    return trees
73
74
75def recordtree(table, start='start', stop='stop'):
76    """
77    Construct an interval tree for the given table, where each node in the
78    tree is a row of the table represented as a record object.
79
80    """
81
82    import intervaltree
83    getstart = attrgetter(start)
84    getstop = attrgetter(stop)
85    tree = intervaltree.IntervalTree()
86    for rec in records(table):
87        tree.addi(getstart(rec), getstop(rec), rec)
88    return tree
89
90
91def facetrecordtrees(table, key, start='start', stop='stop'):
92    """
93    Construct faceted interval trees for the given table, where each node in
94    the tree is a record.
95
96    """
97
98    import intervaltree
99    getstart = attrgetter(start)
100    getstop = attrgetter(stop)
101    getkey = attrgetter(key)
102    trees = dict()
103    for rec in records(table):
104        k = getkey(rec)
105        if k not in trees:
106            trees[k] = intervaltree.IntervalTree()
107        trees[k].addi(getstart(rec), getstop(rec), rec)
108    return trees
109
110
111def intervallookup(table, start='start', stop='stop', value=None,
112                   include_stop=False):
113    """
114    Construct an interval lookup for the given table. E.g.::
115
116        >>> import petl as etl
117        >>> table = [['start', 'stop', 'value'],
118        ...          [1, 4, 'foo'],
119        ...          [3, 7, 'bar'],
120        ...          [4, 9, 'baz']]
121        >>> lkp = etl.intervallookup(table, 'start', 'stop')
122        >>> lkp.search(0, 1)
123        []
124        >>> lkp.search(1, 2)
125        [(1, 4, 'foo')]
126        >>> lkp.search(2, 4)
127        [(1, 4, 'foo'), (3, 7, 'bar')]
128        >>> lkp.search(2, 5)
129        [(1, 4, 'foo'), (3, 7, 'bar'), (4, 9, 'baz')]
130        >>> lkp.search(9, 14)
131        []
132        >>> lkp.search(19, 140)
133        []
134        >>> lkp.search(0)
135        []
136        >>> lkp.search(1)
137        [(1, 4, 'foo')]
138        >>> lkp.search(2)
139        [(1, 4, 'foo')]
140        >>> lkp.search(4)
141        [(3, 7, 'bar'), (4, 9, 'baz')]
142        >>> lkp.search(5)
143        [(3, 7, 'bar'), (4, 9, 'baz')]
144
145    Note start coordinates are included and stop coordinates are excluded
146    from the interval. Use the `include_stop` keyword argument to include the
147    upper bound of the interval when finding overlaps.
148
149    Some examples using the `include_stop` and `value` keyword arguments::
150
151        >>> import petl as etl
152        >>> table = [['start', 'stop', 'value'],
153        ...          [1, 4, 'foo'],
154        ...          [3, 7, 'bar'],
155        ...          [4, 9, 'baz']]
156        >>> lkp = etl.intervallookup(table, 'start', 'stop', include_stop=True,
157        ...                          value='value')
158        >>> lkp.search(0, 1)
159        ['foo']
160        >>> lkp.search(1, 2)
161        ['foo']
162        >>> lkp.search(2, 4)
163        ['foo', 'bar', 'baz']
164        >>> lkp.search(2, 5)
165        ['foo', 'bar', 'baz']
166        >>> lkp.search(9, 14)
167        ['baz']
168        >>> lkp.search(19, 140)
169        []
170        >>> lkp.search(0)
171        []
172        >>> lkp.search(1)
173        ['foo']
174        >>> lkp.search(2)
175        ['foo']
176        >>> lkp.search(4)
177        ['foo', 'bar', 'baz']
178        >>> lkp.search(5)
179        ['bar', 'baz']
180
181    """
182
183    tree = tupletree(table, start=start, stop=stop, value=value)
184    return IntervalTreeLookup(tree, include_stop=include_stop)
185
186
187Table.intervallookup = intervallookup
188
189
190def _search_tree(tree, start, stop, include_stop):
191    if stop is None:
192        if include_stop:
193            stop = start + 1
194            start -= 1
195            args = (start, stop)
196        else:
197            args = (start,)
198    else:
199        if include_stop:
200            stop += 1
201            start -= 1
202        args = (start, stop)
203    if len(args) == 2:
204        results = sorted(tree.overlap(*args))
205    else:
206        results = sorted(tree.at(*args))
207    return results
208
209
210class IntervalTreeLookup(object):
211
212    def __init__(self, tree, include_stop=False):
213        self.tree = tree
214        self.include_stop = include_stop
215
216    def search(self, start, stop=None):
217        results = _search_tree(self.tree, start, stop, self.include_stop)
218        return [r.data for r in results]
219
220    find = search
221
222
223def intervallookupone(table, start='start', stop='stop', value=None,
224                      include_stop=False, strict=True):
225    """
226    Construct an interval lookup for the given table, returning at most one
227    result for each query. E.g.::
228
229        >>> import petl as etl
230        >>> table = [['start', 'stop', 'value'],
231        ...          [1, 4, 'foo'],
232        ...          [3, 7, 'bar'],
233        ...          [4, 9, 'baz']]
234        >>> lkp = etl.intervallookupone(table, 'start', 'stop', strict=False)
235        >>> lkp.search(0, 1)
236        >>> lkp.search(1, 2)
237        (1, 4, 'foo')
238        >>> lkp.search(2, 4)
239        (1, 4, 'foo')
240        >>> lkp.search(2, 5)
241        (1, 4, 'foo')
242        >>> lkp.search(9, 14)
243        >>> lkp.search(19, 140)
244        >>> lkp.search(0)
245        >>> lkp.search(1)
246        (1, 4, 'foo')
247        >>> lkp.search(2)
248        (1, 4, 'foo')
249        >>> lkp.search(4)
250        (3, 7, 'bar')
251        >>> lkp.search(5)
252        (3, 7, 'bar')
253
254    If ``strict=True``, queries returning more than one result will
255    raise a `DuplicateKeyError`. If ``strict=False`` and there is
256    more than one result, the first result is returned.
257
258    Note start coordinates are included and stop coordinates are excluded
259    from the interval. Use the `include_stop` keyword argument to include the
260    upper bound of the interval when finding overlaps.
261
262    """
263
264    tree = tupletree(table, start=start, stop=stop, value=value)
265    return IntervalTreeLookupOne(tree, strict=strict, include_stop=include_stop)
266
267
268Table.intervallookupone = intervallookupone
269
270
271class IntervalTreeLookupOne(object):
272
273    def __init__(self, tree, strict=True, include_stop=False):
274        self.tree = tree
275        self.strict = strict
276        self.include_stop = include_stop
277
278    def search(self, start, stop=None):
279        results = _search_tree(self.tree, start, stop, self.include_stop)
280        if len(results) == 0:
281            return None
282        elif len(results) > 1 and self.strict:
283            raise DuplicateKeyError((start, stop))
284        else:
285            return results[0].data
286
287    find = search
288
289
290def intervalrecordlookup(table, start='start', stop='stop', include_stop=False):
291    """
292    As :func:`petl.transform.intervals.intervallookup` but return records
293    instead of tuples.
294
295    """
296
297    tree = recordtree(table, start=start, stop=stop)
298    return IntervalTreeLookup(tree, include_stop=include_stop)
299
300
301Table.intervalrecordlookup = intervalrecordlookup
302
303
304def intervalrecordlookupone(table, start='start', stop='stop',
305                            include_stop=False, strict=True):
306    """
307    As :func:`petl.transform.intervals.intervallookupone` but return records
308    instead of tuples.
309
310    """
311
312    tree = recordtree(table, start=start, stop=stop)
313    return IntervalTreeLookupOne(tree, include_stop=include_stop, strict=strict)
314
315
316Table.intervalrecordlookupone = intervalrecordlookupone
317
318
319def facetintervallookup(table, key, start='start', stop='stop',
320                        value=None, include_stop=False):
321    """
322
323    Construct a faceted interval lookup for the given table. E.g.::
324
325        >>> import petl as etl
326        >>> table = (('type', 'start', 'stop', 'value'),
327        ...          ('apple', 1, 4, 'foo'),
328        ...          ('apple', 3, 7, 'bar'),
329        ...          ('orange', 4, 9, 'baz'))
330        >>> lkp = etl.facetintervallookup(table, key='type', start='start', stop='stop')
331        >>> lkp['apple'].search(1, 2)
332        [('apple', 1, 4, 'foo')]
333        >>> lkp['apple'].search(2, 4)
334        [('apple', 1, 4, 'foo'), ('apple', 3, 7, 'bar')]
335        >>> lkp['apple'].search(2, 5)
336        [('apple', 1, 4, 'foo'), ('apple', 3, 7, 'bar')]
337        >>> lkp['orange'].search(2, 5)
338        [('orange', 4, 9, 'baz')]
339        >>> lkp['orange'].search(9, 14)
340        []
341        >>> lkp['orange'].search(19, 140)
342        []
343        >>> lkp['apple'].search(1)
344        [('apple', 1, 4, 'foo')]
345        >>> lkp['apple'].search(2)
346        [('apple', 1, 4, 'foo')]
347        >>> lkp['apple'].search(4)
348        [('apple', 3, 7, 'bar')]
349        >>> lkp['apple'].search(5)
350        [('apple', 3, 7, 'bar')]
351        >>> lkp['orange'].search(5)
352        [('orange', 4, 9, 'baz')]
353
354    """
355
356    trees = facettupletrees(table, key, start=start, stop=stop, value=value)
357    out = dict()
358    for k in trees:
359        out[k] = IntervalTreeLookup(trees[k], include_stop=include_stop)
360    return out
361
362
363Table.facetintervallookup = facetintervallookup
364
365
366def facetintervallookupone(table, key, start='start', stop='stop',
367                           value=None, include_stop=False, strict=True):
368    """
369    Construct a faceted interval lookup for the given table, returning at most
370    one result for each query.
371
372    If ``strict=True``, queries returning more than one result will
373    raise a `DuplicateKeyError`. If ``strict=False`` and there is
374    more than one result, the first result is returned.
375
376    """
377
378    trees = facettupletrees(table, key, start=start, stop=stop, value=value)
379    out = dict()
380    for k in trees:
381        out[k] = IntervalTreeLookupOne(trees[k], include_stop=include_stop,
382                                       strict=strict)
383    return out
384
385
386Table.facetintervallookupone = facetintervallookupone
387
388
389def facetintervalrecordlookup(table, key, start='start', stop='stop',
390                              include_stop=False):
391    """
392    As :func:`petl.transform.intervals.facetintervallookup` but return records.
393
394    """
395
396    trees = facetrecordtrees(table, key, start=start, stop=stop)
397    out = dict()
398    for k in trees:
399        out[k] = IntervalTreeLookup(trees[k], include_stop=include_stop)
400    return out
401
402
403Table.facetintervalrecordlookup = facetintervalrecordlookup
404
405
406def facetintervalrecordlookupone(table, key, start, stop, include_stop=False,
407                                 strict=True):
408    """
409    As :func:`petl.transform.intervals.facetintervallookupone` but return
410    records.
411
412    """
413
414    trees = facetrecordtrees(table, key, start=start, stop=stop)
415    out = dict()
416    for k in trees:
417        out[k] = IntervalTreeLookupOne(trees[k], include_stop=include_stop,
418                                       strict=strict)
419    return out
420
421
422Table.facetintervalrecordlookupone = facetintervalrecordlookupone
423
424
425def intervaljoin(left, right, lstart='start', lstop='stop', rstart='start',
426                 rstop='stop', lkey=None, rkey=None, include_stop=False,
427                 lprefix=None, rprefix=None):
428    """
429    Join two tables by overlapping intervals. E.g.::
430
431        >>> import petl as etl
432        >>> left = [['begin', 'end', 'quux'],
433        ...         [1, 2, 'a'],
434        ...         [2, 4, 'b'],
435        ...         [2, 5, 'c'],
436        ...         [9, 14, 'd'],
437        ...         [1, 1, 'e'],
438        ...         [10, 10, 'f']]
439        >>> right = [['start', 'stop', 'value'],
440        ...          [1, 4, 'foo'],
441        ...          [3, 7, 'bar'],
442        ...          [4, 9, 'baz']]
443        >>> table1 = etl.intervaljoin(left, right,
444        ...                           lstart='begin', lstop='end',
445        ...                           rstart='start', rstop='stop')
446        >>> table1.lookall()
447        +-------+-----+------+-------+------+-------+
448        | begin | end | quux | start | stop | value |
449        +=======+=====+======+=======+======+=======+
450        |     1 |   2 | 'a'  |     1 |    4 | 'foo' |
451        +-------+-----+------+-------+------+-------+
452        |     2 |   4 | 'b'  |     1 |    4 | 'foo' |
453        +-------+-----+------+-------+------+-------+
454        |     2 |   4 | 'b'  |     3 |    7 | 'bar' |
455        +-------+-----+------+-------+------+-------+
456        |     2 |   5 | 'c'  |     1 |    4 | 'foo' |
457        +-------+-----+------+-------+------+-------+
458        |     2 |   5 | 'c'  |     3 |    7 | 'bar' |
459        +-------+-----+------+-------+------+-------+
460        |     2 |   5 | 'c'  |     4 |    9 | 'baz' |
461        +-------+-----+------+-------+------+-------+
462
463        >>> # include stop coordinate in intervals
464        ... table2 = etl.intervaljoin(left, right,
465        ...                           lstart='begin', lstop='end',
466        ...                           rstart='start', rstop='stop',
467        ...                           include_stop=True)
468        >>> table2.lookall()
469        +-------+-----+------+-------+------+-------+
470        | begin | end | quux | start | stop | value |
471        +=======+=====+======+=======+======+=======+
472        |     1 |   2 | 'a'  |     1 |    4 | 'foo' |
473        +-------+-----+------+-------+------+-------+
474        |     2 |   4 | 'b'  |     1 |    4 | 'foo' |
475        +-------+-----+------+-------+------+-------+
476        |     2 |   4 | 'b'  |     3 |    7 | 'bar' |
477        +-------+-----+------+-------+------+-------+
478        |     2 |   4 | 'b'  |     4 |    9 | 'baz' |
479        +-------+-----+------+-------+------+-------+
480        |     2 |   5 | 'c'  |     1 |    4 | 'foo' |
481        +-------+-----+------+-------+------+-------+
482        |     2 |   5 | 'c'  |     3 |    7 | 'bar' |
483        +-------+-----+------+-------+------+-------+
484        |     2 |   5 | 'c'  |     4 |    9 | 'baz' |
485        +-------+-----+------+-------+------+-------+
486        |     9 |  14 | 'd'  |     4 |    9 | 'baz' |
487        +-------+-----+------+-------+------+-------+
488        |     1 |   1 | 'e'  |     1 |    4 | 'foo' |
489        +-------+-----+------+-------+------+-------+
490
491    Note start coordinates are included and stop coordinates are excluded
492    from the interval. Use the `include_stop` keyword argument to include the
493    upper bound of the interval when finding overlaps.
494
495    An additional key comparison can be made, e.g.::
496
497        >>> import petl as etl
498        >>> left = (('fruit', 'begin', 'end'),
499        ...         ('apple', 1, 2),
500        ...         ('apple', 2, 4),
501        ...         ('apple', 2, 5),
502        ...         ('orange', 2, 5),
503        ...         ('orange', 9, 14),
504        ...         ('orange', 19, 140),
505        ...         ('apple', 1, 1))
506        >>> right = (('type', 'start', 'stop', 'value'),
507        ...          ('apple', 1, 4, 'foo'),
508        ...          ('apple', 3, 7, 'bar'),
509        ...          ('orange', 4, 9, 'baz'))
510        >>> table3 = etl.intervaljoin(left, right,
511        ...                           lstart='begin', lstop='end', lkey='fruit',
512        ...                           rstart='start', rstop='stop', rkey='type')
513        >>> table3.lookall()
514        +----------+-------+-----+----------+-------+------+-------+
515        | fruit    | begin | end | type     | start | stop | value |
516        +==========+=======+=====+==========+=======+======+=======+
517        | 'apple'  |     1 |   2 | 'apple'  |     1 |    4 | 'foo' |
518        +----------+-------+-----+----------+-------+------+-------+
519        | 'apple'  |     2 |   4 | 'apple'  |     1 |    4 | 'foo' |
520        +----------+-------+-----+----------+-------+------+-------+
521        | 'apple'  |     2 |   4 | 'apple'  |     3 |    7 | 'bar' |
522        +----------+-------+-----+----------+-------+------+-------+
523        | 'apple'  |     2 |   5 | 'apple'  |     1 |    4 | 'foo' |
524        +----------+-------+-----+----------+-------+------+-------+
525        | 'apple'  |     2 |   5 | 'apple'  |     3 |    7 | 'bar' |
526        +----------+-------+-----+----------+-------+------+-------+
527        | 'orange' |     2 |   5 | 'orange' |     4 |    9 | 'baz' |
528        +----------+-------+-----+----------+-------+------+-------+
529
530    """
531
532    assert (lkey is None) == (rkey is None), \
533        'facet key field must be provided for both or neither table'
534    return IntervalJoinView(left, right, lstart=lstart, lstop=lstop,
535                            rstart=rstart, rstop=rstop, lkey=lkey,
536                            rkey=rkey, include_stop=include_stop,
537                            lprefix=lprefix, rprefix=rprefix)
538
539
540Table.intervaljoin = intervaljoin
541
542
543class IntervalJoinView(Table):
544
545    def __init__(self, left, right, lstart='start', lstop='stop',
546                 rstart='start', rstop='stop', lkey=None, rkey=None,
547                 include_stop=False, lprefix=None, rprefix=None):
548        self.left = left
549        self.lstart = lstart
550        self.lstop = lstop
551        self.lkey = lkey
552        self.right = right
553        self.rstart = rstart
554        self.rstop = rstop
555        self.rkey = rkey
556        self.include_stop = include_stop
557        self.lprefix = lprefix
558        self.rprefix = rprefix
559
560    def __iter__(self):
561        return iterintervaljoin(
562            left=self.left,
563            right=self.right,
564            lstart=self.lstart,
565            lstop=self.lstop,
566            rstart=self.rstart,
567            rstop=self.rstop,
568            lkey=self.lkey,
569            rkey=self.rkey,
570            include_stop=self.include_stop,
571            missing=None,
572            lprefix=self.lprefix,
573            rprefix=self.rprefix,
574            leftouter=False
575        )
576
577
578def intervalleftjoin(left, right, lstart='start', lstop='stop', rstart='start',
579                     rstop='stop', lkey=None, rkey=None, include_stop=False,
580                     missing=None, lprefix=None, rprefix=None):
581    """
582    Like :func:`petl.transform.intervals.intervaljoin` but rows from the left
583    table without a match in the right table are also included. E.g.::
584
585        >>> import petl as etl
586        >>> left = [['begin', 'end', 'quux'],
587        ...         [1, 2, 'a'],
588        ...         [2, 4, 'b'],
589        ...         [2, 5, 'c'],
590        ...         [9, 14, 'd'],
591        ...         [1, 1, 'e'],
592        ...         [10, 10, 'f']]
593        >>> right = [['start', 'stop', 'value'],
594        ...          [1, 4, 'foo'],
595        ...          [3, 7, 'bar'],
596        ...          [4, 9, 'baz']]
597        >>> table1 = etl.intervalleftjoin(left, right,
598        ...                               lstart='begin', lstop='end',
599        ...                               rstart='start', rstop='stop')
600        >>> table1.lookall()
601        +-------+-----+------+-------+------+-------+
602        | begin | end | quux | start | stop | value |
603        +=======+=====+======+=======+======+=======+
604        |     1 |   2 | 'a'  |     1 |    4 | 'foo' |
605        +-------+-----+------+-------+------+-------+
606        |     2 |   4 | 'b'  |     1 |    4 | 'foo' |
607        +-------+-----+------+-------+------+-------+
608        |     2 |   4 | 'b'  |     3 |    7 | 'bar' |
609        +-------+-----+------+-------+------+-------+
610        |     2 |   5 | 'c'  |     1 |    4 | 'foo' |
611        +-------+-----+------+-------+------+-------+
612        |     2 |   5 | 'c'  |     3 |    7 | 'bar' |
613        +-------+-----+------+-------+------+-------+
614        |     2 |   5 | 'c'  |     4 |    9 | 'baz' |
615        +-------+-----+------+-------+------+-------+
616        |     9 |  14 | 'd'  | None  | None | None  |
617        +-------+-----+------+-------+------+-------+
618        |     1 |   1 | 'e'  | None  | None | None  |
619        +-------+-----+------+-------+------+-------+
620        |    10 |  10 | 'f'  | None  | None | None  |
621        +-------+-----+------+-------+------+-------+
622
623    Note start coordinates are included and stop coordinates are excluded
624    from the interval. Use the `include_stop` keyword argument to include the
625    upper bound of the interval when finding overlaps.
626
627    """
628
629    assert (lkey is None) == (rkey is None), \
630        'facet key field must be provided for both or neither table'
631    return IntervalLeftJoinView(left, right, lstart=lstart, lstop=lstop,
632                                rstart=rstart, rstop=rstop, lkey=lkey,
633                                rkey=rkey, include_stop=include_stop,
634                                missing=missing, lprefix=lprefix,
635                                rprefix=rprefix)
636
637
638Table.intervalleftjoin = intervalleftjoin
639
640
641class IntervalLeftJoinView(Table):
642
643    def __init__(self, left, right, lstart='start', lstop='stop',
644                 rstart='start', rstop='stop', lkey=None, rkey=None,
645                 missing=None, include_stop=False, lprefix=None, rprefix=None):
646        self.left = left
647        self.lstart = lstart
648        self.lstop = lstop
649        self.lkey = lkey
650        self.right = right
651        self.rstart = rstart
652        self.rstop = rstop
653        self.rkey = rkey
654        self.missing = missing
655        self.include_stop = include_stop
656        self.lprefix = lprefix
657        self.rprefix = rprefix
658
659    def __iter__(self):
660        return iterintervaljoin(
661            left=self.left,
662            right=self.right,
663            lstart=self.lstart,
664            lstop=self.lstop,
665            rstart=self.rstart,
666            rstop=self.rstop,
667            lkey=self.lkey,
668            rkey=self.rkey,
669            include_stop=self.include_stop,
670            missing=self.missing,
671            lprefix=self.lprefix,
672            rprefix=self.rprefix,
673            leftouter=True
674        )
675
676
677def intervalantijoin(left, right, lstart='start', lstop='stop', rstart='start',
678                     rstop='stop', lkey=None, rkey=None, include_stop=False,
679                     missing=None):
680    """
681    Return rows from the `left` table with no overlapping rows from the `right`
682    table.
683
684    Note start coordinates are included and stop coordinates are excluded
685    from the interval. Use the `include_stop` keyword argument to include the
686    upper bound of the interval when finding overlaps.
687
688    """
689
690    assert (lkey is None) == (rkey is None), \
691        'facet key field must be provided for both or neither table'
692    return IntervalAntiJoinView(left, right, lstart=lstart, lstop=lstop,
693                                rstart=rstart, rstop=rstop, lkey=lkey,
694                                rkey=rkey, include_stop=include_stop,
695                                missing=missing)
696
697
698Table.intervalantijoin = intervalantijoin
699
700
701class IntervalAntiJoinView(Table):
702
703    def __init__(self, left, right, lstart='start', lstop='stop',
704                 rstart='start', rstop='stop', lkey=None, rkey=None,
705                 missing=None, include_stop=False):
706        self.left = left
707        self.lstart = lstart
708        self.lstop = lstop
709        self.lkey = lkey
710        self.right = right
711        self.rstart = rstart
712        self.rstop = rstop
713        self.rkey = rkey
714        self.missing = missing
715        self.include_stop = include_stop
716
717    def __iter__(self):
718        return iterintervaljoin(
719            left=self.left,
720            right=self.right,
721            lstart=self.lstart,
722            lstop=self.lstop,
723            rstart=self.rstart,
724            rstop=self.rstop,
725            lkey=self.lkey,
726            rkey=self.rkey,
727            include_stop=self.include_stop,
728            missing=self.missing,
729            lprefix=None,
730            rprefix=None,
731            leftouter=True,
732            anti=True
733        )
734
735
736def iterintervaljoin(left, right, lstart, lstop, rstart, rstop, lkey,
737                     rkey, include_stop, missing, lprefix, rprefix, leftouter,
738                     anti=False):
739
740    # create iterators and obtain fields
741    lit = iter(left)
742    lhdr = next(lit)
743    lflds = list(map(text_type, lhdr))
744    rit = iter(right)
745    rhdr = next(rit)
746    rflds = list(map(text_type, rhdr))
747
748    # check fields via petl.util.asindices (raises FieldSelectionError if spec
749    # is not valid)
750    asindices(lhdr, lstart)
751    asindices(lhdr, lstop)
752    if lkey is not None:
753        asindices(lhdr, lkey)
754    asindices(rhdr, rstart)
755    asindices(rhdr, rstop)
756    if rkey is not None:
757        asindices(rhdr, rkey)
758
759    # determine output fields
760    if lprefix is None:
761        outhdr = list(lflds)
762        if not anti:
763            outhdr.extend(rflds)
764    else:
765        outhdr = list(lprefix + f for f in lflds)
766        if not anti:
767            outhdr.extend(rprefix + f for f in rflds)
768    yield tuple(outhdr)
769
770    # create getters for start and stop positions
771    getlstart = itemgetter(lflds.index(lstart))
772    getlstop = itemgetter(lflds.index(lstop))
773
774    if rkey is None:
775        # build interval lookup for right table
776        lookup = intervallookup(right, rstart, rstop, include_stop=include_stop)
777        search = lookup.search
778        # main loop
779        for lrow in lit:
780            start = getlstart(lrow)
781            stop = getlstop(lrow)
782            rrows = search(start, stop)
783            if rrows:
784                if not anti:
785                    for rrow in rrows:
786                        outrow = list(lrow)
787                        outrow.extend(rrow)
788                        yield tuple(outrow)
789            elif leftouter:
790                outrow = list(lrow)
791                if not anti:
792                    outrow.extend([missing] * len(rflds))
793                yield tuple(outrow)
794
795    else:
796        # build interval lookup for right table
797        lookup = facetintervallookup(right, key=rkey, start=rstart,
798                                     stop=rstop, include_stop=include_stop)
799        search = dict()
800        for f in lookup:
801            search[f] = lookup[f].search
802        # getter for facet key values in left table
803        getlkey = itemgetter(*asindices(lflds, lkey))
804        # main loop
805        for lrow in lit:
806            lkey = getlkey(lrow)
807            start = getlstart(lrow)
808            stop = getlstop(lrow)
809
810            try:
811                rrows = search[lkey](start, stop)
812            except KeyError:
813                rrows = None
814            except AttributeError:
815                rrows = None
816
817            if rrows:
818                if not anti:
819                    for rrow in rrows:
820                        outrow = list(lrow)
821                        outrow.extend(rrow)
822                        yield tuple(outrow)
823            elif leftouter:
824                outrow = list(lrow)
825                if not anti:
826                    outrow.extend([missing] * len(rflds))
827                yield tuple(outrow)
828
829
830def intervaljoinvalues(left, right, value, lstart='start', lstop='stop',
831                       rstart='start', rstop='stop', lkey=None, rkey=None,
832                       include_stop=False):
833    """
834    Convenience function to join the left table with values from a specific
835    field in the right hand table.
836
837    Note start coordinates are included and stop coordinates are excluded
838    from the interval. Use the `include_stop` keyword argument to include the
839    upper bound of the interval when finding overlaps.
840
841    """
842
843    assert (lkey is None) == (rkey is None), \
844        'facet key field must be provided for both or neither table'
845    if lkey is None:
846        lkp = intervallookup(right, start=rstart, stop=rstop, value=value,
847                             include_stop=include_stop)
848        f = lambda row: lkp.search(row[lstart], row[lstop])
849    else:
850        lkp = facetintervallookup(right, rkey, start=rstart, stop=rstop,
851                                  value=value, include_stop=include_stop)
852        f = lambda row: lkp[row[lkey]].search(row[lstart], row[lstop])
853    return addfield(left, value, f)
854
855
856Table.intervaljoinvalues = intervaljoinvalues
857
858
859def intervalsubtract(left, right, lstart='start', lstop='stop', rstart='start',
860                     rstop='stop', lkey=None, rkey=None, include_stop=False):
861    """
862    Subtract intervals in the right hand table from intervals in the left hand
863    table.
864
865    """
866
867    assert (lkey is None) == (rkey is None), \
868        'facet key field must be provided for both or neither table'
869    return IntervalSubtractView(left, right, lstart=lstart, lstop=lstop,
870                                rstart=rstart, rstop=rstop, lkey=lkey,
871                                rkey=rkey, include_stop=include_stop)
872
873
874Table.intervalsubtract = intervalsubtract
875
876
877class IntervalSubtractView(Table):
878
879    def __init__(self, left, right, lstart='start', lstop='stop',
880                 rstart='start', rstop='stop', lkey=None, rkey=None,
881                 include_stop=False):
882        self.left = left
883        self.lstart = lstart
884        self.lstop = lstop
885        self.lkey = lkey
886        self.right = right
887        self.rstart = rstart
888        self.rstop = rstop
889        self.rkey = rkey
890        self.include_stop = include_stop
891
892    def __iter__(self):
893        return iterintervalsubtract(self.left, self.right, self.lstart,
894                                    self.lstop, self.rstart, self.rstop,
895                                    self.lkey, self.rkey, self.include_stop)
896
897
898def iterintervalsubtract(left, right, lstart, lstop, rstart, rstop, lkey, rkey,
899                         include_stop):
900
901    # create iterators and obtain fields
902    lit = iter(left)
903    lhdr = next(lit)
904    lflds = list(map(text_type, lhdr))
905    rit = iter(right)
906    rhdr = next(rit)
907
908    # check fields via petl.util.asindices (raises FieldSelectionError if spec
909    # is not valid)
910    asindices(lhdr, lstart)
911    asindices(lhdr, lstop)
912    if lkey is not None:
913        asindices(lhdr, lkey)
914    asindices(rhdr, rstart)
915    asindices(rhdr, rstop)
916    if rkey is not None:
917        asindices(rhdr, rkey)
918
919    # determine output fields
920    outhdr = list(lflds)
921    yield tuple(outhdr)
922
923    # create getters for start and stop positions
924    lstartidx, lstopidx = asindices(lhdr, (lstart, lstop))
925    getlcoords = itemgetter(lstartidx, lstopidx)
926    getrcoords = itemgetter(*asindices(rhdr, (rstart, rstop)))
927
928    if rkey is None:
929        # build interval lookup for right table
930        lookup = intervallookup(right, rstart, rstop, include_stop=include_stop)
931        search = lookup.search
932        # main loop
933        for lrow in lit:
934            start, stop = getlcoords(lrow)
935            rrows = search(start, stop)
936            if not rrows:
937                yield tuple(lrow)
938            else:
939                rivs = sorted([getrcoords(rrow) for rrow in rrows],
940                              key=itemgetter(0))  # sort by start
941                for x, y in _subtract(start, stop, rivs):
942                    out = list(lrow)
943                    out[lstartidx] = x
944                    out[lstopidx] = y
945                    yield tuple(out)
946
947    else:
948        # build interval lookup for right table
949        lookup = facetintervallookup(right, key=rkey, start=rstart, stop=rstop,
950                                     include_stop=include_stop)
951        # getter for facet key values in left table
952        getlkey = itemgetter(*asindices(lhdr, lkey))
953        # main loop
954        for lrow in lit:
955            lkey = getlkey(lrow)
956            start, stop = getlcoords(lrow)
957            try:
958                rrows = lookup[lkey].search(start, stop)
959            except KeyError:
960                rrows = None
961            except AttributeError:
962                rrows = None
963            if not rrows:
964                yield tuple(lrow)
965            else:
966                rivs = sorted([getrcoords(rrow) for rrow in rrows],
967                              key=itemgetter(0))  # sort by start
968                for x, y in _subtract(start, stop, rivs):
969                    out = list(lrow)
970                    out[lstartidx] = x
971                    out[lstopidx] = y
972                    yield tuple(out)
973
974
975from collections import namedtuple
976_Interval = namedtuple('Interval', 'start stop')
977
978
979def collapsedintervals(table, start='start', stop='stop', key=None):
980    """
981    Utility function to collapse intervals in a table.
982
983    If no facet `key` is given, returns an iterator over `(start, stop)` tuples.
984
985    If facet `key` is given, returns an iterator over `(key, start, stop)`
986    tuples.
987
988    """
989
990    if key is None:
991        table = sort(table, key=start)
992        for iv in _collapse(values(table, (start, stop))):
993            yield iv
994    else:
995        table = sort(table, key=(key, start))
996        for k, g in rowgroupby(table, key=key, value=(start, stop)):
997            for iv in _collapse(g):
998                yield (k,) + iv
999
1000
1001Table.collapsedintervals = collapsedintervals
1002
1003
1004def _collapse(intervals):
1005    """
1006    Collapse an iterable of intervals sorted by start coord.
1007
1008    """
1009    span = None
1010    for start, stop in intervals:
1011        if span is None:
1012            span = _Interval(start, stop)
1013        elif start <= span.stop < stop:
1014            span = _Interval(span.start, stop)
1015        elif start > span.stop:
1016            yield span
1017            span = _Interval(start, stop)
1018    if span is not None:
1019        yield span
1020
1021
1022def _subtract(start, stop, intervals):
1023    """
1024    Subtract intervals from a spanning interval.
1025
1026    """
1027    remainder_start = start
1028    sub_stop = None
1029    for sub_start, sub_stop in _collapse(intervals):
1030        if remainder_start < sub_start:
1031            yield _Interval(remainder_start, sub_start)
1032        remainder_start = sub_stop
1033    if sub_stop is not None and sub_stop < stop:
1034        yield _Interval(sub_stop, stop)
1035