1from __future__ import absolute_import, print_function, division 2 3 4from operator import itemgetter, attrgetter 5from petl.compat import text_type 6 7 8from petl.util.base import asindices, records, Table, values, rowgroupby 9from petl.errors import DuplicateKeyError 10from petl.transform.basics import addfield 11from petl.transform.sorts import sort 12 13 14def tupletree(table, start='start', stop='stop', value=None): 15 """ 16 Construct an interval tree for the given table, where each node in the tree 17 is a row of the table. 18 19 """ 20 21 import intervaltree 22 tree = intervaltree.IntervalTree() 23 it = iter(table) 24 hdr = next(it) 25 flds = list(map(text_type, hdr)) 26 assert start in flds, 'start field not recognised' 27 assert stop in flds, 'stop field not recognised' 28 getstart = itemgetter(flds.index(start)) 29 getstop = itemgetter(flds.index(stop)) 30 if value is None: 31 getvalue = tuple 32 else: 33 valueindices = asindices(hdr, value) 34 assert len(valueindices) > 0, 'invalid value field specification' 35 getvalue = itemgetter(*valueindices) 36 for row in it: 37 tree.addi(getstart(row), getstop(row), getvalue(row)) 38 return tree 39 40 41def facettupletrees(table, key, start='start', stop='stop', value=None): 42 """ 43 Construct faceted interval trees for the given table, where each node in 44 the tree is a row of the table. 45 46 """ 47 48 import intervaltree 49 it = iter(table) 50 hdr = next(it) 51 flds = list(map(text_type, hdr)) 52 assert start in flds, 'start field not recognised' 53 assert stop in flds, 'stop field not recognised' 54 getstart = itemgetter(flds.index(start)) 55 getstop = itemgetter(flds.index(stop)) 56 if value is None: 57 getvalue = tuple 58 else: 59 valueindices = asindices(hdr, value) 60 assert len(valueindices) > 0, 'invalid value field specification' 61 getvalue = itemgetter(*valueindices) 62 keyindices = asindices(hdr, key) 63 assert len(keyindices) > 0, 'invalid key' 64 getkey = itemgetter(*keyindices) 65 66 trees = dict() 67 for row in it: 68 k = getkey(row) 69 if k not in trees: 70 trees[k] = intervaltree.IntervalTree() 71 trees[k].addi(getstart(row), getstop(row), getvalue(row)) 72 return trees 73 74 75def recordtree(table, start='start', stop='stop'): 76 """ 77 Construct an interval tree for the given table, where each node in the 78 tree is a row of the table represented as a record object. 79 80 """ 81 82 import intervaltree 83 getstart = attrgetter(start) 84 getstop = attrgetter(stop) 85 tree = intervaltree.IntervalTree() 86 for rec in records(table): 87 tree.addi(getstart(rec), getstop(rec), rec) 88 return tree 89 90 91def facetrecordtrees(table, key, start='start', stop='stop'): 92 """ 93 Construct faceted interval trees for the given table, where each node in 94 the tree is a record. 95 96 """ 97 98 import intervaltree 99 getstart = attrgetter(start) 100 getstop = attrgetter(stop) 101 getkey = attrgetter(key) 102 trees = dict() 103 for rec in records(table): 104 k = getkey(rec) 105 if k not in trees: 106 trees[k] = intervaltree.IntervalTree() 107 trees[k].addi(getstart(rec), getstop(rec), rec) 108 return trees 109 110 111def intervallookup(table, start='start', stop='stop', value=None, 112 include_stop=False): 113 """ 114 Construct an interval lookup for the given table. E.g.:: 115 116 >>> import petl as etl 117 >>> table = [['start', 'stop', 'value'], 118 ... [1, 4, 'foo'], 119 ... [3, 7, 'bar'], 120 ... [4, 9, 'baz']] 121 >>> lkp = etl.intervallookup(table, 'start', 'stop') 122 >>> lkp.search(0, 1) 123 [] 124 >>> lkp.search(1, 2) 125 [(1, 4, 'foo')] 126 >>> lkp.search(2, 4) 127 [(1, 4, 'foo'), (3, 7, 'bar')] 128 >>> lkp.search(2, 5) 129 [(1, 4, 'foo'), (3, 7, 'bar'), (4, 9, 'baz')] 130 >>> lkp.search(9, 14) 131 [] 132 >>> lkp.search(19, 140) 133 [] 134 >>> lkp.search(0) 135 [] 136 >>> lkp.search(1) 137 [(1, 4, 'foo')] 138 >>> lkp.search(2) 139 [(1, 4, 'foo')] 140 >>> lkp.search(4) 141 [(3, 7, 'bar'), (4, 9, 'baz')] 142 >>> lkp.search(5) 143 [(3, 7, 'bar'), (4, 9, 'baz')] 144 145 Note start coordinates are included and stop coordinates are excluded 146 from the interval. Use the `include_stop` keyword argument to include the 147 upper bound of the interval when finding overlaps. 148 149 Some examples using the `include_stop` and `value` keyword arguments:: 150 151 >>> import petl as etl 152 >>> table = [['start', 'stop', 'value'], 153 ... [1, 4, 'foo'], 154 ... [3, 7, 'bar'], 155 ... [4, 9, 'baz']] 156 >>> lkp = etl.intervallookup(table, 'start', 'stop', include_stop=True, 157 ... value='value') 158 >>> lkp.search(0, 1) 159 ['foo'] 160 >>> lkp.search(1, 2) 161 ['foo'] 162 >>> lkp.search(2, 4) 163 ['foo', 'bar', 'baz'] 164 >>> lkp.search(2, 5) 165 ['foo', 'bar', 'baz'] 166 >>> lkp.search(9, 14) 167 ['baz'] 168 >>> lkp.search(19, 140) 169 [] 170 >>> lkp.search(0) 171 [] 172 >>> lkp.search(1) 173 ['foo'] 174 >>> lkp.search(2) 175 ['foo'] 176 >>> lkp.search(4) 177 ['foo', 'bar', 'baz'] 178 >>> lkp.search(5) 179 ['bar', 'baz'] 180 181 """ 182 183 tree = tupletree(table, start=start, stop=stop, value=value) 184 return IntervalTreeLookup(tree, include_stop=include_stop) 185 186 187Table.intervallookup = intervallookup 188 189 190def _search_tree(tree, start, stop, include_stop): 191 if stop is None: 192 if include_stop: 193 stop = start + 1 194 start -= 1 195 args = (start, stop) 196 else: 197 args = (start,) 198 else: 199 if include_stop: 200 stop += 1 201 start -= 1 202 args = (start, stop) 203 if len(args) == 2: 204 results = sorted(tree.overlap(*args)) 205 else: 206 results = sorted(tree.at(*args)) 207 return results 208 209 210class IntervalTreeLookup(object): 211 212 def __init__(self, tree, include_stop=False): 213 self.tree = tree 214 self.include_stop = include_stop 215 216 def search(self, start, stop=None): 217 results = _search_tree(self.tree, start, stop, self.include_stop) 218 return [r.data for r in results] 219 220 find = search 221 222 223def intervallookupone(table, start='start', stop='stop', value=None, 224 include_stop=False, strict=True): 225 """ 226 Construct an interval lookup for the given table, returning at most one 227 result for each query. E.g.:: 228 229 >>> import petl as etl 230 >>> table = [['start', 'stop', 'value'], 231 ... [1, 4, 'foo'], 232 ... [3, 7, 'bar'], 233 ... [4, 9, 'baz']] 234 >>> lkp = etl.intervallookupone(table, 'start', 'stop', strict=False) 235 >>> lkp.search(0, 1) 236 >>> lkp.search(1, 2) 237 (1, 4, 'foo') 238 >>> lkp.search(2, 4) 239 (1, 4, 'foo') 240 >>> lkp.search(2, 5) 241 (1, 4, 'foo') 242 >>> lkp.search(9, 14) 243 >>> lkp.search(19, 140) 244 >>> lkp.search(0) 245 >>> lkp.search(1) 246 (1, 4, 'foo') 247 >>> lkp.search(2) 248 (1, 4, 'foo') 249 >>> lkp.search(4) 250 (3, 7, 'bar') 251 >>> lkp.search(5) 252 (3, 7, 'bar') 253 254 If ``strict=True``, queries returning more than one result will 255 raise a `DuplicateKeyError`. If ``strict=False`` and there is 256 more than one result, the first result is returned. 257 258 Note start coordinates are included and stop coordinates are excluded 259 from the interval. Use the `include_stop` keyword argument to include the 260 upper bound of the interval when finding overlaps. 261 262 """ 263 264 tree = tupletree(table, start=start, stop=stop, value=value) 265 return IntervalTreeLookupOne(tree, strict=strict, include_stop=include_stop) 266 267 268Table.intervallookupone = intervallookupone 269 270 271class IntervalTreeLookupOne(object): 272 273 def __init__(self, tree, strict=True, include_stop=False): 274 self.tree = tree 275 self.strict = strict 276 self.include_stop = include_stop 277 278 def search(self, start, stop=None): 279 results = _search_tree(self.tree, start, stop, self.include_stop) 280 if len(results) == 0: 281 return None 282 elif len(results) > 1 and self.strict: 283 raise DuplicateKeyError((start, stop)) 284 else: 285 return results[0].data 286 287 find = search 288 289 290def intervalrecordlookup(table, start='start', stop='stop', include_stop=False): 291 """ 292 As :func:`petl.transform.intervals.intervallookup` but return records 293 instead of tuples. 294 295 """ 296 297 tree = recordtree(table, start=start, stop=stop) 298 return IntervalTreeLookup(tree, include_stop=include_stop) 299 300 301Table.intervalrecordlookup = intervalrecordlookup 302 303 304def intervalrecordlookupone(table, start='start', stop='stop', 305 include_stop=False, strict=True): 306 """ 307 As :func:`petl.transform.intervals.intervallookupone` but return records 308 instead of tuples. 309 310 """ 311 312 tree = recordtree(table, start=start, stop=stop) 313 return IntervalTreeLookupOne(tree, include_stop=include_stop, strict=strict) 314 315 316Table.intervalrecordlookupone = intervalrecordlookupone 317 318 319def facetintervallookup(table, key, start='start', stop='stop', 320 value=None, include_stop=False): 321 """ 322 323 Construct a faceted interval lookup for the given table. E.g.:: 324 325 >>> import petl as etl 326 >>> table = (('type', 'start', 'stop', 'value'), 327 ... ('apple', 1, 4, 'foo'), 328 ... ('apple', 3, 7, 'bar'), 329 ... ('orange', 4, 9, 'baz')) 330 >>> lkp = etl.facetintervallookup(table, key='type', start='start', stop='stop') 331 >>> lkp['apple'].search(1, 2) 332 [('apple', 1, 4, 'foo')] 333 >>> lkp['apple'].search(2, 4) 334 [('apple', 1, 4, 'foo'), ('apple', 3, 7, 'bar')] 335 >>> lkp['apple'].search(2, 5) 336 [('apple', 1, 4, 'foo'), ('apple', 3, 7, 'bar')] 337 >>> lkp['orange'].search(2, 5) 338 [('orange', 4, 9, 'baz')] 339 >>> lkp['orange'].search(9, 14) 340 [] 341 >>> lkp['orange'].search(19, 140) 342 [] 343 >>> lkp['apple'].search(1) 344 [('apple', 1, 4, 'foo')] 345 >>> lkp['apple'].search(2) 346 [('apple', 1, 4, 'foo')] 347 >>> lkp['apple'].search(4) 348 [('apple', 3, 7, 'bar')] 349 >>> lkp['apple'].search(5) 350 [('apple', 3, 7, 'bar')] 351 >>> lkp['orange'].search(5) 352 [('orange', 4, 9, 'baz')] 353 354 """ 355 356 trees = facettupletrees(table, key, start=start, stop=stop, value=value) 357 out = dict() 358 for k in trees: 359 out[k] = IntervalTreeLookup(trees[k], include_stop=include_stop) 360 return out 361 362 363Table.facetintervallookup = facetintervallookup 364 365 366def facetintervallookupone(table, key, start='start', stop='stop', 367 value=None, include_stop=False, strict=True): 368 """ 369 Construct a faceted interval lookup for the given table, returning at most 370 one result for each query. 371 372 If ``strict=True``, queries returning more than one result will 373 raise a `DuplicateKeyError`. If ``strict=False`` and there is 374 more than one result, the first result is returned. 375 376 """ 377 378 trees = facettupletrees(table, key, start=start, stop=stop, value=value) 379 out = dict() 380 for k in trees: 381 out[k] = IntervalTreeLookupOne(trees[k], include_stop=include_stop, 382 strict=strict) 383 return out 384 385 386Table.facetintervallookupone = facetintervallookupone 387 388 389def facetintervalrecordlookup(table, key, start='start', stop='stop', 390 include_stop=False): 391 """ 392 As :func:`petl.transform.intervals.facetintervallookup` but return records. 393 394 """ 395 396 trees = facetrecordtrees(table, key, start=start, stop=stop) 397 out = dict() 398 for k in trees: 399 out[k] = IntervalTreeLookup(trees[k], include_stop=include_stop) 400 return out 401 402 403Table.facetintervalrecordlookup = facetintervalrecordlookup 404 405 406def facetintervalrecordlookupone(table, key, start, stop, include_stop=False, 407 strict=True): 408 """ 409 As :func:`petl.transform.intervals.facetintervallookupone` but return 410 records. 411 412 """ 413 414 trees = facetrecordtrees(table, key, start=start, stop=stop) 415 out = dict() 416 for k in trees: 417 out[k] = IntervalTreeLookupOne(trees[k], include_stop=include_stop, 418 strict=strict) 419 return out 420 421 422Table.facetintervalrecordlookupone = facetintervalrecordlookupone 423 424 425def intervaljoin(left, right, lstart='start', lstop='stop', rstart='start', 426 rstop='stop', lkey=None, rkey=None, include_stop=False, 427 lprefix=None, rprefix=None): 428 """ 429 Join two tables by overlapping intervals. E.g.:: 430 431 >>> import petl as etl 432 >>> left = [['begin', 'end', 'quux'], 433 ... [1, 2, 'a'], 434 ... [2, 4, 'b'], 435 ... [2, 5, 'c'], 436 ... [9, 14, 'd'], 437 ... [1, 1, 'e'], 438 ... [10, 10, 'f']] 439 >>> right = [['start', 'stop', 'value'], 440 ... [1, 4, 'foo'], 441 ... [3, 7, 'bar'], 442 ... [4, 9, 'baz']] 443 >>> table1 = etl.intervaljoin(left, right, 444 ... lstart='begin', lstop='end', 445 ... rstart='start', rstop='stop') 446 >>> table1.lookall() 447 +-------+-----+------+-------+------+-------+ 448 | begin | end | quux | start | stop | value | 449 +=======+=====+======+=======+======+=======+ 450 | 1 | 2 | 'a' | 1 | 4 | 'foo' | 451 +-------+-----+------+-------+------+-------+ 452 | 2 | 4 | 'b' | 1 | 4 | 'foo' | 453 +-------+-----+------+-------+------+-------+ 454 | 2 | 4 | 'b' | 3 | 7 | 'bar' | 455 +-------+-----+------+-------+------+-------+ 456 | 2 | 5 | 'c' | 1 | 4 | 'foo' | 457 +-------+-----+------+-------+------+-------+ 458 | 2 | 5 | 'c' | 3 | 7 | 'bar' | 459 +-------+-----+------+-------+------+-------+ 460 | 2 | 5 | 'c' | 4 | 9 | 'baz' | 461 +-------+-----+------+-------+------+-------+ 462 463 >>> # include stop coordinate in intervals 464 ... table2 = etl.intervaljoin(left, right, 465 ... lstart='begin', lstop='end', 466 ... rstart='start', rstop='stop', 467 ... include_stop=True) 468 >>> table2.lookall() 469 +-------+-----+------+-------+------+-------+ 470 | begin | end | quux | start | stop | value | 471 +=======+=====+======+=======+======+=======+ 472 | 1 | 2 | 'a' | 1 | 4 | 'foo' | 473 +-------+-----+------+-------+------+-------+ 474 | 2 | 4 | 'b' | 1 | 4 | 'foo' | 475 +-------+-----+------+-------+------+-------+ 476 | 2 | 4 | 'b' | 3 | 7 | 'bar' | 477 +-------+-----+------+-------+------+-------+ 478 | 2 | 4 | 'b' | 4 | 9 | 'baz' | 479 +-------+-----+------+-------+------+-------+ 480 | 2 | 5 | 'c' | 1 | 4 | 'foo' | 481 +-------+-----+------+-------+------+-------+ 482 | 2 | 5 | 'c' | 3 | 7 | 'bar' | 483 +-------+-----+------+-------+------+-------+ 484 | 2 | 5 | 'c' | 4 | 9 | 'baz' | 485 +-------+-----+------+-------+------+-------+ 486 | 9 | 14 | 'd' | 4 | 9 | 'baz' | 487 +-------+-----+------+-------+------+-------+ 488 | 1 | 1 | 'e' | 1 | 4 | 'foo' | 489 +-------+-----+------+-------+------+-------+ 490 491 Note start coordinates are included and stop coordinates are excluded 492 from the interval. Use the `include_stop` keyword argument to include the 493 upper bound of the interval when finding overlaps. 494 495 An additional key comparison can be made, e.g.:: 496 497 >>> import petl as etl 498 >>> left = (('fruit', 'begin', 'end'), 499 ... ('apple', 1, 2), 500 ... ('apple', 2, 4), 501 ... ('apple', 2, 5), 502 ... ('orange', 2, 5), 503 ... ('orange', 9, 14), 504 ... ('orange', 19, 140), 505 ... ('apple', 1, 1)) 506 >>> right = (('type', 'start', 'stop', 'value'), 507 ... ('apple', 1, 4, 'foo'), 508 ... ('apple', 3, 7, 'bar'), 509 ... ('orange', 4, 9, 'baz')) 510 >>> table3 = etl.intervaljoin(left, right, 511 ... lstart='begin', lstop='end', lkey='fruit', 512 ... rstart='start', rstop='stop', rkey='type') 513 >>> table3.lookall() 514 +----------+-------+-----+----------+-------+------+-------+ 515 | fruit | begin | end | type | start | stop | value | 516 +==========+=======+=====+==========+=======+======+=======+ 517 | 'apple' | 1 | 2 | 'apple' | 1 | 4 | 'foo' | 518 +----------+-------+-----+----------+-------+------+-------+ 519 | 'apple' | 2 | 4 | 'apple' | 1 | 4 | 'foo' | 520 +----------+-------+-----+----------+-------+------+-------+ 521 | 'apple' | 2 | 4 | 'apple' | 3 | 7 | 'bar' | 522 +----------+-------+-----+----------+-------+------+-------+ 523 | 'apple' | 2 | 5 | 'apple' | 1 | 4 | 'foo' | 524 +----------+-------+-----+----------+-------+------+-------+ 525 | 'apple' | 2 | 5 | 'apple' | 3 | 7 | 'bar' | 526 +----------+-------+-----+----------+-------+------+-------+ 527 | 'orange' | 2 | 5 | 'orange' | 4 | 9 | 'baz' | 528 +----------+-------+-----+----------+-------+------+-------+ 529 530 """ 531 532 assert (lkey is None) == (rkey is None), \ 533 'facet key field must be provided for both or neither table' 534 return IntervalJoinView(left, right, lstart=lstart, lstop=lstop, 535 rstart=rstart, rstop=rstop, lkey=lkey, 536 rkey=rkey, include_stop=include_stop, 537 lprefix=lprefix, rprefix=rprefix) 538 539 540Table.intervaljoin = intervaljoin 541 542 543class IntervalJoinView(Table): 544 545 def __init__(self, left, right, lstart='start', lstop='stop', 546 rstart='start', rstop='stop', lkey=None, rkey=None, 547 include_stop=False, lprefix=None, rprefix=None): 548 self.left = left 549 self.lstart = lstart 550 self.lstop = lstop 551 self.lkey = lkey 552 self.right = right 553 self.rstart = rstart 554 self.rstop = rstop 555 self.rkey = rkey 556 self.include_stop = include_stop 557 self.lprefix = lprefix 558 self.rprefix = rprefix 559 560 def __iter__(self): 561 return iterintervaljoin( 562 left=self.left, 563 right=self.right, 564 lstart=self.lstart, 565 lstop=self.lstop, 566 rstart=self.rstart, 567 rstop=self.rstop, 568 lkey=self.lkey, 569 rkey=self.rkey, 570 include_stop=self.include_stop, 571 missing=None, 572 lprefix=self.lprefix, 573 rprefix=self.rprefix, 574 leftouter=False 575 ) 576 577 578def intervalleftjoin(left, right, lstart='start', lstop='stop', rstart='start', 579 rstop='stop', lkey=None, rkey=None, include_stop=False, 580 missing=None, lprefix=None, rprefix=None): 581 """ 582 Like :func:`petl.transform.intervals.intervaljoin` but rows from the left 583 table without a match in the right table are also included. E.g.:: 584 585 >>> import petl as etl 586 >>> left = [['begin', 'end', 'quux'], 587 ... [1, 2, 'a'], 588 ... [2, 4, 'b'], 589 ... [2, 5, 'c'], 590 ... [9, 14, 'd'], 591 ... [1, 1, 'e'], 592 ... [10, 10, 'f']] 593 >>> right = [['start', 'stop', 'value'], 594 ... [1, 4, 'foo'], 595 ... [3, 7, 'bar'], 596 ... [4, 9, 'baz']] 597 >>> table1 = etl.intervalleftjoin(left, right, 598 ... lstart='begin', lstop='end', 599 ... rstart='start', rstop='stop') 600 >>> table1.lookall() 601 +-------+-----+------+-------+------+-------+ 602 | begin | end | quux | start | stop | value | 603 +=======+=====+======+=======+======+=======+ 604 | 1 | 2 | 'a' | 1 | 4 | 'foo' | 605 +-------+-----+------+-------+------+-------+ 606 | 2 | 4 | 'b' | 1 | 4 | 'foo' | 607 +-------+-----+------+-------+------+-------+ 608 | 2 | 4 | 'b' | 3 | 7 | 'bar' | 609 +-------+-----+------+-------+------+-------+ 610 | 2 | 5 | 'c' | 1 | 4 | 'foo' | 611 +-------+-----+------+-------+------+-------+ 612 | 2 | 5 | 'c' | 3 | 7 | 'bar' | 613 +-------+-----+------+-------+------+-------+ 614 | 2 | 5 | 'c' | 4 | 9 | 'baz' | 615 +-------+-----+------+-------+------+-------+ 616 | 9 | 14 | 'd' | None | None | None | 617 +-------+-----+------+-------+------+-------+ 618 | 1 | 1 | 'e' | None | None | None | 619 +-------+-----+------+-------+------+-------+ 620 | 10 | 10 | 'f' | None | None | None | 621 +-------+-----+------+-------+------+-------+ 622 623 Note start coordinates are included and stop coordinates are excluded 624 from the interval. Use the `include_stop` keyword argument to include the 625 upper bound of the interval when finding overlaps. 626 627 """ 628 629 assert (lkey is None) == (rkey is None), \ 630 'facet key field must be provided for both or neither table' 631 return IntervalLeftJoinView(left, right, lstart=lstart, lstop=lstop, 632 rstart=rstart, rstop=rstop, lkey=lkey, 633 rkey=rkey, include_stop=include_stop, 634 missing=missing, lprefix=lprefix, 635 rprefix=rprefix) 636 637 638Table.intervalleftjoin = intervalleftjoin 639 640 641class IntervalLeftJoinView(Table): 642 643 def __init__(self, left, right, lstart='start', lstop='stop', 644 rstart='start', rstop='stop', lkey=None, rkey=None, 645 missing=None, include_stop=False, lprefix=None, rprefix=None): 646 self.left = left 647 self.lstart = lstart 648 self.lstop = lstop 649 self.lkey = lkey 650 self.right = right 651 self.rstart = rstart 652 self.rstop = rstop 653 self.rkey = rkey 654 self.missing = missing 655 self.include_stop = include_stop 656 self.lprefix = lprefix 657 self.rprefix = rprefix 658 659 def __iter__(self): 660 return iterintervaljoin( 661 left=self.left, 662 right=self.right, 663 lstart=self.lstart, 664 lstop=self.lstop, 665 rstart=self.rstart, 666 rstop=self.rstop, 667 lkey=self.lkey, 668 rkey=self.rkey, 669 include_stop=self.include_stop, 670 missing=self.missing, 671 lprefix=self.lprefix, 672 rprefix=self.rprefix, 673 leftouter=True 674 ) 675 676 677def intervalantijoin(left, right, lstart='start', lstop='stop', rstart='start', 678 rstop='stop', lkey=None, rkey=None, include_stop=False, 679 missing=None): 680 """ 681 Return rows from the `left` table with no overlapping rows from the `right` 682 table. 683 684 Note start coordinates are included and stop coordinates are excluded 685 from the interval. Use the `include_stop` keyword argument to include the 686 upper bound of the interval when finding overlaps. 687 688 """ 689 690 assert (lkey is None) == (rkey is None), \ 691 'facet key field must be provided for both or neither table' 692 return IntervalAntiJoinView(left, right, lstart=lstart, lstop=lstop, 693 rstart=rstart, rstop=rstop, lkey=lkey, 694 rkey=rkey, include_stop=include_stop, 695 missing=missing) 696 697 698Table.intervalantijoin = intervalantijoin 699 700 701class IntervalAntiJoinView(Table): 702 703 def __init__(self, left, right, lstart='start', lstop='stop', 704 rstart='start', rstop='stop', lkey=None, rkey=None, 705 missing=None, include_stop=False): 706 self.left = left 707 self.lstart = lstart 708 self.lstop = lstop 709 self.lkey = lkey 710 self.right = right 711 self.rstart = rstart 712 self.rstop = rstop 713 self.rkey = rkey 714 self.missing = missing 715 self.include_stop = include_stop 716 717 def __iter__(self): 718 return iterintervaljoin( 719 left=self.left, 720 right=self.right, 721 lstart=self.lstart, 722 lstop=self.lstop, 723 rstart=self.rstart, 724 rstop=self.rstop, 725 lkey=self.lkey, 726 rkey=self.rkey, 727 include_stop=self.include_stop, 728 missing=self.missing, 729 lprefix=None, 730 rprefix=None, 731 leftouter=True, 732 anti=True 733 ) 734 735 736def iterintervaljoin(left, right, lstart, lstop, rstart, rstop, lkey, 737 rkey, include_stop, missing, lprefix, rprefix, leftouter, 738 anti=False): 739 740 # create iterators and obtain fields 741 lit = iter(left) 742 lhdr = next(lit) 743 lflds = list(map(text_type, lhdr)) 744 rit = iter(right) 745 rhdr = next(rit) 746 rflds = list(map(text_type, rhdr)) 747 748 # check fields via petl.util.asindices (raises FieldSelectionError if spec 749 # is not valid) 750 asindices(lhdr, lstart) 751 asindices(lhdr, lstop) 752 if lkey is not None: 753 asindices(lhdr, lkey) 754 asindices(rhdr, rstart) 755 asindices(rhdr, rstop) 756 if rkey is not None: 757 asindices(rhdr, rkey) 758 759 # determine output fields 760 if lprefix is None: 761 outhdr = list(lflds) 762 if not anti: 763 outhdr.extend(rflds) 764 else: 765 outhdr = list(lprefix + f for f in lflds) 766 if not anti: 767 outhdr.extend(rprefix + f for f in rflds) 768 yield tuple(outhdr) 769 770 # create getters for start and stop positions 771 getlstart = itemgetter(lflds.index(lstart)) 772 getlstop = itemgetter(lflds.index(lstop)) 773 774 if rkey is None: 775 # build interval lookup for right table 776 lookup = intervallookup(right, rstart, rstop, include_stop=include_stop) 777 search = lookup.search 778 # main loop 779 for lrow in lit: 780 start = getlstart(lrow) 781 stop = getlstop(lrow) 782 rrows = search(start, stop) 783 if rrows: 784 if not anti: 785 for rrow in rrows: 786 outrow = list(lrow) 787 outrow.extend(rrow) 788 yield tuple(outrow) 789 elif leftouter: 790 outrow = list(lrow) 791 if not anti: 792 outrow.extend([missing] * len(rflds)) 793 yield tuple(outrow) 794 795 else: 796 # build interval lookup for right table 797 lookup = facetintervallookup(right, key=rkey, start=rstart, 798 stop=rstop, include_stop=include_stop) 799 search = dict() 800 for f in lookup: 801 search[f] = lookup[f].search 802 # getter for facet key values in left table 803 getlkey = itemgetter(*asindices(lflds, lkey)) 804 # main loop 805 for lrow in lit: 806 lkey = getlkey(lrow) 807 start = getlstart(lrow) 808 stop = getlstop(lrow) 809 810 try: 811 rrows = search[lkey](start, stop) 812 except KeyError: 813 rrows = None 814 except AttributeError: 815 rrows = None 816 817 if rrows: 818 if not anti: 819 for rrow in rrows: 820 outrow = list(lrow) 821 outrow.extend(rrow) 822 yield tuple(outrow) 823 elif leftouter: 824 outrow = list(lrow) 825 if not anti: 826 outrow.extend([missing] * len(rflds)) 827 yield tuple(outrow) 828 829 830def intervaljoinvalues(left, right, value, lstart='start', lstop='stop', 831 rstart='start', rstop='stop', lkey=None, rkey=None, 832 include_stop=False): 833 """ 834 Convenience function to join the left table with values from a specific 835 field in the right hand table. 836 837 Note start coordinates are included and stop coordinates are excluded 838 from the interval. Use the `include_stop` keyword argument to include the 839 upper bound of the interval when finding overlaps. 840 841 """ 842 843 assert (lkey is None) == (rkey is None), \ 844 'facet key field must be provided for both or neither table' 845 if lkey is None: 846 lkp = intervallookup(right, start=rstart, stop=rstop, value=value, 847 include_stop=include_stop) 848 f = lambda row: lkp.search(row[lstart], row[lstop]) 849 else: 850 lkp = facetintervallookup(right, rkey, start=rstart, stop=rstop, 851 value=value, include_stop=include_stop) 852 f = lambda row: lkp[row[lkey]].search(row[lstart], row[lstop]) 853 return addfield(left, value, f) 854 855 856Table.intervaljoinvalues = intervaljoinvalues 857 858 859def intervalsubtract(left, right, lstart='start', lstop='stop', rstart='start', 860 rstop='stop', lkey=None, rkey=None, include_stop=False): 861 """ 862 Subtract intervals in the right hand table from intervals in the left hand 863 table. 864 865 """ 866 867 assert (lkey is None) == (rkey is None), \ 868 'facet key field must be provided for both or neither table' 869 return IntervalSubtractView(left, right, lstart=lstart, lstop=lstop, 870 rstart=rstart, rstop=rstop, lkey=lkey, 871 rkey=rkey, include_stop=include_stop) 872 873 874Table.intervalsubtract = intervalsubtract 875 876 877class IntervalSubtractView(Table): 878 879 def __init__(self, left, right, lstart='start', lstop='stop', 880 rstart='start', rstop='stop', lkey=None, rkey=None, 881 include_stop=False): 882 self.left = left 883 self.lstart = lstart 884 self.lstop = lstop 885 self.lkey = lkey 886 self.right = right 887 self.rstart = rstart 888 self.rstop = rstop 889 self.rkey = rkey 890 self.include_stop = include_stop 891 892 def __iter__(self): 893 return iterintervalsubtract(self.left, self.right, self.lstart, 894 self.lstop, self.rstart, self.rstop, 895 self.lkey, self.rkey, self.include_stop) 896 897 898def iterintervalsubtract(left, right, lstart, lstop, rstart, rstop, lkey, rkey, 899 include_stop): 900 901 # create iterators and obtain fields 902 lit = iter(left) 903 lhdr = next(lit) 904 lflds = list(map(text_type, lhdr)) 905 rit = iter(right) 906 rhdr = next(rit) 907 908 # check fields via petl.util.asindices (raises FieldSelectionError if spec 909 # is not valid) 910 asindices(lhdr, lstart) 911 asindices(lhdr, lstop) 912 if lkey is not None: 913 asindices(lhdr, lkey) 914 asindices(rhdr, rstart) 915 asindices(rhdr, rstop) 916 if rkey is not None: 917 asindices(rhdr, rkey) 918 919 # determine output fields 920 outhdr = list(lflds) 921 yield tuple(outhdr) 922 923 # create getters for start and stop positions 924 lstartidx, lstopidx = asindices(lhdr, (lstart, lstop)) 925 getlcoords = itemgetter(lstartidx, lstopidx) 926 getrcoords = itemgetter(*asindices(rhdr, (rstart, rstop))) 927 928 if rkey is None: 929 # build interval lookup for right table 930 lookup = intervallookup(right, rstart, rstop, include_stop=include_stop) 931 search = lookup.search 932 # main loop 933 for lrow in lit: 934 start, stop = getlcoords(lrow) 935 rrows = search(start, stop) 936 if not rrows: 937 yield tuple(lrow) 938 else: 939 rivs = sorted([getrcoords(rrow) for rrow in rrows], 940 key=itemgetter(0)) # sort by start 941 for x, y in _subtract(start, stop, rivs): 942 out = list(lrow) 943 out[lstartidx] = x 944 out[lstopidx] = y 945 yield tuple(out) 946 947 else: 948 # build interval lookup for right table 949 lookup = facetintervallookup(right, key=rkey, start=rstart, stop=rstop, 950 include_stop=include_stop) 951 # getter for facet key values in left table 952 getlkey = itemgetter(*asindices(lhdr, lkey)) 953 # main loop 954 for lrow in lit: 955 lkey = getlkey(lrow) 956 start, stop = getlcoords(lrow) 957 try: 958 rrows = lookup[lkey].search(start, stop) 959 except KeyError: 960 rrows = None 961 except AttributeError: 962 rrows = None 963 if not rrows: 964 yield tuple(lrow) 965 else: 966 rivs = sorted([getrcoords(rrow) for rrow in rrows], 967 key=itemgetter(0)) # sort by start 968 for x, y in _subtract(start, stop, rivs): 969 out = list(lrow) 970 out[lstartidx] = x 971 out[lstopidx] = y 972 yield tuple(out) 973 974 975from collections import namedtuple 976_Interval = namedtuple('Interval', 'start stop') 977 978 979def collapsedintervals(table, start='start', stop='stop', key=None): 980 """ 981 Utility function to collapse intervals in a table. 982 983 If no facet `key` is given, returns an iterator over `(start, stop)` tuples. 984 985 If facet `key` is given, returns an iterator over `(key, start, stop)` 986 tuples. 987 988 """ 989 990 if key is None: 991 table = sort(table, key=start) 992 for iv in _collapse(values(table, (start, stop))): 993 yield iv 994 else: 995 table = sort(table, key=(key, start)) 996 for k, g in rowgroupby(table, key=key, value=(start, stop)): 997 for iv in _collapse(g): 998 yield (k,) + iv 999 1000 1001Table.collapsedintervals = collapsedintervals 1002 1003 1004def _collapse(intervals): 1005 """ 1006 Collapse an iterable of intervals sorted by start coord. 1007 1008 """ 1009 span = None 1010 for start, stop in intervals: 1011 if span is None: 1012 span = _Interval(start, stop) 1013 elif start <= span.stop < stop: 1014 span = _Interval(span.start, stop) 1015 elif start > span.stop: 1016 yield span 1017 span = _Interval(start, stop) 1018 if span is not None: 1019 yield span 1020 1021 1022def _subtract(start, stop, intervals): 1023 """ 1024 Subtract intervals from a spanning interval. 1025 1026 """ 1027 remainder_start = start 1028 sub_stop = None 1029 for sub_start, sub_stop in _collapse(intervals): 1030 if remainder_start < sub_start: 1031 yield _Interval(remainder_start, sub_start) 1032 remainder_start = sub_stop 1033 if sub_stop is not None and sub_stop < stop: 1034 yield _Interval(sub_stop, stop) 1035