Source code for petl.transform.intervals

from __future__ import absolute_import, print_function, division


from operator import itemgetter, attrgetter
from petl.compat import text_type


from petl.util.base import asindices, records, Table, values, rowgroupby
from petl.errors import DuplicateKeyError
from petl.transform.basics import addfield
from petl.transform.sorts import sort


def tupletree(table, start='start', stop='stop', value=None):
    """
    Build an interval tree from the data rows of `table`.

    `start` and `stop` name the fields holding each row's interval bounds.
    One node is added per data row. The node's payload is the whole row as
    a tuple when `value` is None; otherwise it is whatever the `value` field
    selection extracts from the row.

    """

    import intervaltree

    rows = iter(table)
    header = next(rows)
    names = [text_type(f) for f in header]
    assert start in names, 'start field not recognised'
    assert stop in names, 'stop field not recognised'
    startidx = names.index(start)
    stopidx = names.index(stop)

    if value is not None:
        indices = asindices(header, value)
        assert len(indices) > 0, 'invalid value field specification'
        payload = itemgetter(*indices)
    else:
        # no value selection: store the complete row
        payload = tuple

    tree = intervaltree.IntervalTree()
    for row in rows:
        tree.addi(row[startidx], row[stopidx], payload(row))
    return tree


def facettupletrees(table, key, start='start', stop='stop', value=None):
    """
    Build one interval tree per distinct `key` value found in `table`.

    Returns a dict mapping each facet key value to an interval tree holding
    the rows with that key. Node payloads follow the same rule as
    :func:`tupletree`: the whole row as a tuple when `value` is None,
    otherwise the selected value field(s).

    """

    import intervaltree

    rows = iter(table)
    header = next(rows)
    names = [text_type(f) for f in header]
    assert start in names, 'start field not recognised'
    assert stop in names, 'stop field not recognised'
    startidx = names.index(start)
    stopidx = names.index(stop)

    if value is not None:
        indices = asindices(header, value)
        assert len(indices) > 0, 'invalid value field specification'
        payload = itemgetter(*indices)
    else:
        payload = tuple

    keyindices = asindices(header, key)
    assert len(keyindices) > 0, 'invalid key'
    facetof = itemgetter(*keyindices)

    trees = {}
    for row in rows:
        facet = facetof(row)
        tree = trees.get(facet)
        if tree is None:
            # first row seen for this facet: start a new tree
            tree = trees[facet] = intervaltree.IntervalTree()
        tree.addi(row[startidx], row[stopidx], payload(row))
    return trees


def recordtree(table, start='start', stop='stop'):
    """
    Build an interval tree whose nodes carry the rows of `table` as record
    objects.

    Interval bounds are read from each record via the attributes named by
    `start` and `stop`.

    """

    import intervaltree

    bounds = attrgetter(start, stop)
    tree = intervaltree.IntervalTree()
    for rec in records(table):
        begin, end = bounds(rec)
        tree.addi(begin, end, rec)
    return tree


def facetrecordtrees(table, key, start='start', stop='stop'):
    """
    Build one interval tree of records per distinct `key` value in `table`.

    Returns a dict mapping each facet key value to an interval tree whose
    node payloads are the record objects sharing that key.

    """

    import intervaltree

    bounds = attrgetter(start, stop)
    facetof = attrgetter(key)
    trees = {}
    for rec in records(table):
        facet = facetof(rec)
        tree = trees.get(facet)
        if tree is None:
            # first record seen for this facet: start a new tree
            tree = trees[facet] = intervaltree.IntervalTree()
        begin, end = bounds(rec)
        tree.addi(begin, end, rec)
    return trees


def intervallookup(table, start='start', stop='stop', value=None,
                   include_stop=False):
    """
    Construct an interval lookup for the given table. E.g.::

        >>> import petl as etl
        >>> table = [['start', 'stop', 'value'],
        ...          [1, 4, 'foo'],
        ...          [3, 7, 'bar'],
        ...          [4, 9, 'baz']]
        >>> lkp = etl.intervallookup(table, 'start', 'stop')
        >>> lkp.search(0, 1)
        []
        >>> lkp.search(1, 2)
        [(1, 4, 'foo')]
        >>> lkp.search(2, 4)
        [(1, 4, 'foo'), (3, 7, 'bar')]
        >>> lkp.search(2, 5)
        [(1, 4, 'foo'), (3, 7, 'bar'), (4, 9, 'baz')]
        >>> lkp.search(9, 14)
        []
        >>> lkp.search(19, 140)
        []
        >>> lkp.search(0)
        []
        >>> lkp.search(1)
        [(1, 4, 'foo')]
        >>> lkp.search(2)
        [(1, 4, 'foo')]
        >>> lkp.search(4)
        [(3, 7, 'bar'), (4, 9, 'baz')]
        >>> lkp.search(5)
        [(3, 7, 'bar'), (4, 9, 'baz')]

    Note start coordinates are included and stop coordinates are excluded
    from the interval. Use the `include_stop` keyword argument to include the
    upper bound of the interval when finding overlaps.

    Some examples using the `include_stop` and `value` keyword arguments::

        >>> import petl as etl
        >>> table = [['start', 'stop', 'value'],
        ...          [1, 4, 'foo'],
        ...          [3, 7, 'bar'],
        ...          [4, 9, 'baz']]
        >>> lkp = etl.intervallookup(table, 'start', 'stop', include_stop=True,
        ...                          value='value')
        >>> lkp.search(0, 1)
        ['foo']
        >>> lkp.search(1, 2)
        ['foo']
        >>> lkp.search(2, 4)
        ['foo', 'bar', 'baz']
        >>> lkp.search(2, 5)
        ['foo', 'bar', 'baz']
        >>> lkp.search(9, 14)
        ['baz']
        >>> lkp.search(19, 140)
        []
        >>> lkp.search(0)
        []
        >>> lkp.search(1)
        ['foo']
        >>> lkp.search(2)
        ['foo']
        >>> lkp.search(4)
        ['foo', 'bar', 'baz']
        >>> lkp.search(5)
        ['bar', 'baz']

    """

    return IntervalTreeLookup(
        tupletree(table, start=start, stop=stop, value=value),
        include_stop=include_stop
    )
Table.intervallookup = intervallookup


def _search_tree(tree, start, stop, include_stop):
    """
    Query `tree` and return the matching intervals as a sorted list.

    With `stop` of None and `include_stop` false this is a point query at
    `start`; every other combination is an overlap query on a range. When
    `include_stop` is true the queried range is widened by one on both
    sides (a point query becomes the range ``start - 1`` to ``start + 1``).

    """

    if stop is None and not include_stop:
        # plain point query
        return sorted(tree.at(start))

    if include_stop:
        # NOTE(review): widening by exactly 1 presumably assumes integer
        # coordinates - confirm before using with floats
        if stop is None:
            stop = start + 1
        else:
            stop += 1
        start -= 1

    return sorted(tree.overlap(start, stop))


class IntervalTreeLookup(object):
    """
    Wrap an interval tree and return the payloads of all matching nodes.

    """

    def __init__(self, tree, include_stop=False):
        self.include_stop = include_stop
        self.tree = tree

    def search(self, start, stop=None):
        """
        Return a list with the data of every node matching the query, in
        the sorted order produced by `_search_tree`.

        """
        hits = _search_tree(self.tree, start, stop, self.include_stop)
        return [hit.data for hit in hits]

    find = search
def intervallookupone(table, start='start', stop='stop', value=None,
                      include_stop=False, strict=True):
    """
    Construct an interval lookup for the given table, returning at most one
    result for each query. E.g.::

        >>> import petl as etl
        >>> table = [['start', 'stop', 'value'],
        ...          [1, 4, 'foo'],
        ...          [3, 7, 'bar'],
        ...          [4, 9, 'baz']]
        >>> lkp = etl.intervallookupone(table, 'start', 'stop', strict=False)
        >>> lkp.search(0, 1)
        >>> lkp.search(1, 2)
        (1, 4, 'foo')
        >>> lkp.search(2, 4)
        (1, 4, 'foo')
        >>> lkp.search(2, 5)
        (1, 4, 'foo')
        >>> lkp.search(9, 14)
        >>> lkp.search(19, 140)
        >>> lkp.search(0)
        >>> lkp.search(1)
        (1, 4, 'foo')
        >>> lkp.search(2)
        (1, 4, 'foo')
        >>> lkp.search(4)
        (3, 7, 'bar')
        >>> lkp.search(5)
        (3, 7, 'bar')

    If ``strict=True``, queries returning more than one result will raise a
    `DuplicateKeyError`. If ``strict=False`` and there is more than one
    result, the first result is returned.

    Note start coordinates are included and stop coordinates are excluded
    from the interval. Use the `include_stop` keyword argument to include the
    upper bound of the interval when finding overlaps.

    """

    return IntervalTreeLookupOne(
        tupletree(table, start=start, stop=stop, value=value),
        strict=strict,
        include_stop=include_stop
    )
Table.intervallookupone = intervallookupone


class IntervalTreeLookupOne(object):
    """
    Wrap an interval tree and return the payload of at most one matching
    node per query.

    """

    def __init__(self, tree, strict=True, include_stop=False):
        self.include_stop = include_stop
        self.strict = strict
        self.tree = tree

    def search(self, start, stop=None):
        """
        Return the data of the first matching node, or None if nothing
        matches. Raises `DuplicateKeyError` when more than one node matches
        and the lookup is strict.

        """
        hits = _search_tree(self.tree, start, stop, self.include_stop)
        if not hits:
            return None
        if self.strict and len(hits) > 1:
            raise DuplicateKeyError((start, stop))
        return hits[0].data

    find = search
def intervalrecordlookup(table, start='start', stop='stop',
                         include_stop=False):
    """
    As :func:`petl.transform.intervals.intervallookup` but return records
    instead of tuples.

    """

    return IntervalTreeLookup(
        recordtree(table, start=start, stop=stop),
        include_stop=include_stop
    )


Table.intervalrecordlookup = intervalrecordlookup
def intervalrecordlookupone(table, start='start', stop='stop',
                            include_stop=False, strict=True):
    """
    As :func:`petl.transform.intervals.intervallookupone` but return records
    instead of tuples.

    """

    return IntervalTreeLookupOne(
        recordtree(table, start=start, stop=stop),
        strict=strict,
        include_stop=include_stop
    )


Table.intervalrecordlookupone = intervalrecordlookupone
def facetintervallookup(table, key, start='start', stop='stop',
                        value=None, include_stop=False):
    """
    Construct a faceted interval lookup for the given table. E.g.::

        >>> import petl as etl
        >>> table = (('type', 'start', 'stop', 'value'),
        ...          ('apple', 1, 4, 'foo'),
        ...          ('apple', 3, 7, 'bar'),
        ...          ('orange', 4, 9, 'baz'))
        >>> lkp = etl.facetintervallookup(table, key='type', start='start', stop='stop')
        >>> lkp['apple'].search(1, 2)
        [('apple', 1, 4, 'foo')]
        >>> lkp['apple'].search(2, 4)
        [('apple', 1, 4, 'foo'), ('apple', 3, 7, 'bar')]
        >>> lkp['apple'].search(2, 5)
        [('apple', 1, 4, 'foo'), ('apple', 3, 7, 'bar')]
        >>> lkp['orange'].search(2, 5)
        [('orange', 4, 9, 'baz')]
        >>> lkp['orange'].search(9, 14)
        []
        >>> lkp['orange'].search(19, 140)
        []
        >>> lkp['apple'].search(1)
        [('apple', 1, 4, 'foo')]
        >>> lkp['apple'].search(2)
        [('apple', 1, 4, 'foo')]
        >>> lkp['apple'].search(4)
        [('apple', 3, 7, 'bar')]
        >>> lkp['apple'].search(5)
        [('apple', 3, 7, 'bar')]
        >>> lkp['orange'].search(5)
        [('orange', 4, 9, 'baz')]

    """

    trees = facettupletrees(table, key, start=start, stop=stop, value=value)
    # one lookup object per facet key
    return {
        facet: IntervalTreeLookup(tree, include_stop=include_stop)
        for facet, tree in trees.items()
    }


Table.facetintervallookup = facetintervallookup
def facetintervallookupone(table, key, start='start', stop='stop',
                           value=None, include_stop=False, strict=True):
    """
    Construct a faceted interval lookup for the given table, returning at
    most one result for each query.

    If ``strict=True``, queries returning more than one result will raise a
    `DuplicateKeyError`. If ``strict=False`` and there is more than one
    result, the first result is returned.

    """

    trees = facettupletrees(table, key, start=start, stop=stop, value=value)
    # one single-result lookup object per facet key
    return {
        facet: IntervalTreeLookupOne(tree, include_stop=include_stop,
                                     strict=strict)
        for facet, tree in trees.items()
    }


Table.facetintervallookupone = facetintervallookupone
def facetintervalrecordlookup(table, key, start='start', stop='stop',
                              include_stop=False):
    """
    As :func:`petl.transform.intervals.facetintervallookup` but return
    records.

    """

    trees = facetrecordtrees(table, key, start=start, stop=stop)
    # one lookup object per facet key
    return {
        facet: IntervalTreeLookup(tree, include_stop=include_stop)
        for facet, tree in trees.items()
    }


Table.facetintervalrecordlookup = facetintervalrecordlookup
def facetintervalrecordlookupone(table, key, start='start', stop='stop',
                                 include_stop=False, strict=True):
    """
    As :func:`petl.transform.intervals.facetintervallookupone` but return
    records.

    `start` and `stop` default to ``'start'`` and ``'stop'``, matching the
    other interval lookup constructors in this module.

    Returns a dict mapping each facet key value to a lookup object that
    yields at most one record per query. If ``strict=True``, queries
    matching more than one record raise a `DuplicateKeyError`.

    """

    trees = facetrecordtrees(table, key, start=start, stop=stop)
    out = dict()
    for k in trees:
        out[k] = IntervalTreeLookupOne(trees[k], include_stop=include_stop,
                                       strict=strict)
    return out


Table.facetintervalrecordlookupone = facetintervalrecordlookupone
def intervaljoin(left, right, lstart='start', lstop='stop', rstart='start',
                 rstop='stop', lkey=None, rkey=None, include_stop=False,
                 lprefix=None, rprefix=None):
    """
    Join two tables by overlapping intervals. E.g.::

        >>> import petl as etl
        >>> left = [['begin', 'end', 'quux'],
        ...         [1, 2, 'a'],
        ...         [2, 4, 'b'],
        ...         [2, 5, 'c'],
        ...         [9, 14, 'd'],
        ...         [1, 1, 'e'],
        ...         [10, 10, 'f']]
        >>> right = [['start', 'stop', 'value'],
        ...          [1, 4, 'foo'],
        ...          [3, 7, 'bar'],
        ...          [4, 9, 'baz']]
        >>> table1 = etl.intervaljoin(left, right,
        ...                           lstart='begin', lstop='end',
        ...                           rstart='start', rstop='stop')
        >>> table1.lookall()
        +-------+-----+------+-------+------+-------+
        | begin | end | quux | start | stop | value |
        +=======+=====+======+=======+======+=======+
        |     1 |   2 | 'a'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     3 |    7 | 'bar' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     3 |    7 | 'bar' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     4 |    9 | 'baz' |
        +-------+-----+------+-------+------+-------+

        >>> # include stop coordinate in intervals
        ... table2 = etl.intervaljoin(left, right,
        ...                           lstart='begin', lstop='end',
        ...                           rstart='start', rstop='stop',
        ...                           include_stop=True)
        >>> table2.lookall()
        +-------+-----+------+-------+------+-------+
        | begin | end | quux | start | stop | value |
        +=======+=====+======+=======+======+=======+
        |     1 |   2 | 'a'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     3 |    7 | 'bar' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     4 |    9 | 'baz' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     3 |    7 | 'bar' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     4 |    9 | 'baz' |
        +-------+-----+------+-------+------+-------+
        |     9 |  14 | 'd'  |     4 |    9 | 'baz' |
        +-------+-----+------+-------+------+-------+
        |     1 |   1 | 'e'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+

    Note start coordinates are included and stop coordinates are excluded
    from the interval. Use the `include_stop` keyword argument to include the
    upper bound of the interval when finding overlaps.

    An additional key comparison can be made, e.g.::

        >>> import petl as etl
        >>> left = (('fruit', 'begin', 'end'),
        ...         ('apple', 1, 2),
        ...         ('apple', 2, 4),
        ...         ('apple', 2, 5),
        ...         ('orange', 2, 5),
        ...         ('orange', 9, 14),
        ...         ('orange', 19, 140),
        ...         ('apple', 1, 1))
        >>> right = (('type', 'start', 'stop', 'value'),
        ...          ('apple', 1, 4, 'foo'),
        ...          ('apple', 3, 7, 'bar'),
        ...          ('orange', 4, 9, 'baz'))
        >>> table3 = etl.intervaljoin(left, right,
        ...                           lstart='begin', lstop='end', lkey='fruit',
        ...                           rstart='start', rstop='stop', rkey='type')
        >>> table3.lookall()
        +----------+-------+-----+----------+-------+------+-------+
        | fruit    | begin | end | type     | start | stop | value |
        +==========+=======+=====+==========+=======+======+=======+
        | 'apple'  |     1 |   2 | 'apple'  |     1 |    4 | 'foo' |
        +----------+-------+-----+----------+-------+------+-------+
        | 'apple'  |     2 |   4 | 'apple'  |     1 |    4 | 'foo' |
        +----------+-------+-----+----------+-------+------+-------+
        | 'apple'  |     2 |   4 | 'apple'  |     3 |    7 | 'bar' |
        +----------+-------+-----+----------+-------+------+-------+
        | 'apple'  |     2 |   5 | 'apple'  |     1 |    4 | 'foo' |
        +----------+-------+-----+----------+-------+------+-------+
        | 'apple'  |     2 |   5 | 'apple'  |     3 |    7 | 'bar' |
        +----------+-------+-----+----------+-------+------+-------+
        | 'orange' |     2 |   5 | 'orange' |     4 |    9 | 'baz' |
        +----------+-------+-----+----------+-------+------+-------+

    """

    # facet keys only make sense when given for both sides
    assert (lkey is None) == (rkey is None), \
        'facet key field must be provided for both or neither table'
    return IntervalJoinView(
        left, right,
        lstart=lstart, lstop=lstop, lkey=lkey,
        rstart=rstart, rstop=rstop, rkey=rkey,
        include_stop=include_stop,
        lprefix=lprefix, rprefix=rprefix
    )
Table.intervaljoin = intervaljoin


class IntervalJoinView(Table):
    """
    Lazy table view over the inner interval join of `left` and `right`.

    """

    def __init__(self, left, right, lstart='start', lstop='stop',
                 rstart='start', rstop='stop', lkey=None, rkey=None,
                 include_stop=False, lprefix=None, rprefix=None):
        # left-hand table and its field names
        self.left = left
        self.lstart, self.lstop, self.lkey = lstart, lstop, lkey
        # right-hand table and its field names
        self.right = right
        self.rstart, self.rstop, self.rkey = rstart, rstop, rkey
        # join options
        self.include_stop = include_stop
        self.lprefix, self.rprefix = lprefix, rprefix

    def __iter__(self):
        # inner join: unmatched left rows are dropped, so `missing` is unused
        return iterintervaljoin(
            left=self.left, lstart=self.lstart, lstop=self.lstop,
            lkey=self.lkey, lprefix=self.lprefix,
            right=self.right, rstart=self.rstart, rstop=self.rstop,
            rkey=self.rkey, rprefix=self.rprefix,
            include_stop=self.include_stop,
            missing=None,
            leftouter=False
        )
def intervalleftjoin(left, right, lstart='start', lstop='stop',
                     rstart='start', rstop='stop', lkey=None, rkey=None,
                     include_stop=False, missing=None, lprefix=None,
                     rprefix=None):
    """
    Like :func:`petl.transform.intervals.intervaljoin` but rows from the left
    table without a match in the right table are also included. E.g.::

        >>> import petl as etl
        >>> left = [['begin', 'end', 'quux'],
        ...         [1, 2, 'a'],
        ...         [2, 4, 'b'],
        ...         [2, 5, 'c'],
        ...         [9, 14, 'd'],
        ...         [1, 1, 'e'],
        ...         [10, 10, 'f']]
        >>> right = [['start', 'stop', 'value'],
        ...          [1, 4, 'foo'],
        ...          [3, 7, 'bar'],
        ...          [4, 9, 'baz']]
        >>> table1 = etl.intervalleftjoin(left, right,
        ...                               lstart='begin', lstop='end',
        ...                               rstart='start', rstop='stop')
        >>> table1.lookall()
        +-------+-----+------+-------+------+-------+
        | begin | end | quux | start | stop | value |
        +=======+=====+======+=======+======+=======+
        |     1 |   2 | 'a'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   4 | 'b'  |     3 |    7 | 'bar' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     1 |    4 | 'foo' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     3 |    7 | 'bar' |
        +-------+-----+------+-------+------+-------+
        |     2 |   5 | 'c'  |     4 |    9 | 'baz' |
        +-------+-----+------+-------+------+-------+
        |     9 |  14 | 'd'  | None  | None | None  |
        +-------+-----+------+-------+------+-------+
        |     1 |   1 | 'e'  | None  | None | None  |
        +-------+-----+------+-------+------+-------+
        |    10 |  10 | 'f'  | None  | None | None  |
        +-------+-----+------+-------+------+-------+

    Note start coordinates are included and stop coordinates are excluded
    from the interval. Use the `include_stop` keyword argument to include the
    upper bound of the interval when finding overlaps.

    """

    # facet keys only make sense when given for both sides
    assert (lkey is None) == (rkey is None), \
        'facet key field must be provided for both or neither table'
    return IntervalLeftJoinView(
        left, right,
        lstart=lstart, lstop=lstop, lkey=lkey,
        rstart=rstart, rstop=rstop, rkey=rkey,
        include_stop=include_stop, missing=missing,
        lprefix=lprefix, rprefix=rprefix
    )
Table.intervalleftjoin = intervalleftjoin


class IntervalLeftJoinView(Table):
    """
    Lazy table view over the left outer interval join of `left` and `right`.

    """

    def __init__(self, left, right, lstart='start', lstop='stop',
                 rstart='start', rstop='stop', lkey=None, rkey=None,
                 missing=None, include_stop=False, lprefix=None,
                 rprefix=None):
        # left-hand table and its field names
        self.left = left
        self.lstart, self.lstop, self.lkey = lstart, lstop, lkey
        # right-hand table and its field names
        self.right = right
        self.rstart, self.rstop, self.rkey = rstart, rstop, rkey
        # join options
        self.missing = missing
        self.include_stop = include_stop
        self.lprefix, self.rprefix = lprefix, rprefix

    def __iter__(self):
        # left outer join: unmatched left rows are padded with `missing`
        return iterintervaljoin(
            left=self.left, lstart=self.lstart, lstop=self.lstop,
            lkey=self.lkey, lprefix=self.lprefix,
            right=self.right, rstart=self.rstart, rstop=self.rstop,
            rkey=self.rkey, rprefix=self.rprefix,
            include_stop=self.include_stop,
            missing=self.missing,
            leftouter=True
        )
def intervalantijoin(left, right, lstart='start', lstop='stop',
                     rstart='start', rstop='stop', lkey=None, rkey=None,
                     include_stop=False, missing=None):
    """
    Return rows from the `left` table with no overlapping rows from the
    `right` table.

    Note start coordinates are included and stop coordinates are excluded
    from the interval. Use the `include_stop` keyword argument to include the
    upper bound of the interval when finding overlaps.

    """

    # facet keys only make sense when given for both sides
    assert (lkey is None) == (rkey is None), \
        'facet key field must be provided for both or neither table'
    return IntervalAntiJoinView(
        left, right,
        lstart=lstart, lstop=lstop, lkey=lkey,
        rstart=rstart, rstop=rstop, rkey=rkey,
        include_stop=include_stop, missing=missing
    )
Table.intervalantijoin = intervalantijoin


class IntervalAntiJoinView(Table):
    """
    Lazy table view over the rows of `left` that overlap no row of `right`.

    """

    def __init__(self, left, right, lstart='start', lstop='stop',
                 rstart='start', rstop='stop', lkey=None, rkey=None,
                 missing=None, include_stop=False):
        self.left = left
        self.lstart = lstart
        self.lstop = lstop
        self.lkey = lkey
        self.right = right
        self.rstart = rstart
        self.rstop = rstop
        self.rkey = rkey
        self.missing = missing
        self.include_stop = include_stop

    def __iter__(self):
        # anti join is run as a left outer join that emits only the
        # unmatched left rows, without any right-hand fields
        return iterintervaljoin(
            left=self.left,
            right=self.right,
            lstart=self.lstart,
            lstop=self.lstop,
            rstart=self.rstart,
            rstop=self.rstop,
            lkey=self.lkey,
            rkey=self.rkey,
            include_stop=self.include_stop,
            missing=self.missing,
            lprefix=None,
            rprefix=None,
            leftouter=True,
            anti=True
        )


def iterintervaljoin(left, right, lstart, lstop, rstart, rstop, lkey,
                     rkey, include_stop, missing, lprefix, rprefix, leftouter,
                     anti=False):
    """
    Generator behind the interval join, left join and anti join views.

    Yields the output header first, then the data rows:

    * a left row with overlapping right rows yields one combined row per
      overlap (nothing when `anti` is true);
    * a left row without overlaps is yielded only when `leftouter` is true,
      padded with `missing` for every right-hand field unless `anti` is true.

    `lprefix` and `rprefix` are applied to the left and right field names
    independently of each other; None means no prefix for that side. When
    `rkey` is not None, overlaps are only searched among right rows sharing
    the left row's `lkey` value.

    """

    # create iterators and obtain fields
    lit = iter(left)
    lhdr = next(lit)
    lflds = list(map(text_type, lhdr))
    rit = iter(right)
    rhdr = next(rit)
    rflds = list(map(text_type, rhdr))

    # check fields via petl.util.asindices (raises FieldSelectionError if spec
    # is not valid)
    asindices(lhdr, lstart)
    asindices(lhdr, lstop)
    if lkey is not None:
        asindices(lhdr, lkey)
    asindices(rhdr, rstart)
    asindices(rhdr, rstop)
    if rkey is not None:
        asindices(rhdr, rkey)

    # determine output fields; each prefix is optional on its own, so giving
    # only one of them neither fails nor gets silently ignored
    if lprefix is None:
        outhdr = list(lflds)
    else:
        outhdr = [lprefix + f for f in lflds]
    if not anti:
        if rprefix is None:
            outhdr.extend(rflds)
        else:
            outhdr.extend(rprefix + f for f in rflds)
    yield tuple(outhdr)

    # create getters for start and stop positions
    getlstart = itemgetter(lflds.index(lstart))
    getlstop = itemgetter(lflds.index(lstop))

    if rkey is None:

        # build interval lookup for right table
        lookup = intervallookup(right, rstart, rstop,
                                include_stop=include_stop)
        search = lookup.search

        def find(lrow):
            return search(getlstart(lrow), getlstop(lrow))

    else:

        # build interval lookup for right table, one search per facet
        lookup = facetintervallookup(right, key=rkey, start=rstart,
                                     stop=rstop, include_stop=include_stop)
        search = dict((f, lookup[f].search) for f in lookup)

        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lflds, lkey))

        def find(lrow):
            facet = getlkey(lrow)
            start = getlstart(lrow)
            stop = getlstop(lrow)
            try:
                return search[facet](start, stop)
            except (KeyError, AttributeError):
                # facet absent from the right table (or the search failed
                # with one of these errors): treat as no overlap
                return None

    # values appended to unmatched left rows in a left outer join
    padding = [missing] * len(rflds)

    # main loop
    for lrow in lit:
        rrows = find(lrow)
        if rrows:
            if not anti:
                for rrow in rrows:
                    outrow = list(lrow)
                    outrow.extend(rrow)
                    yield tuple(outrow)
        elif leftouter:
            outrow = list(lrow)
            if not anti:
                outrow.extend(padding)
            yield tuple(outrow)
def intervaljoinvalues(left, right, value, lstart='start', lstop='stop',
                       rstart='start', rstop='stop', lkey=None, rkey=None,
                       include_stop=False):
    """
    Convenience function to join the left table with values from a specific
    field in the right hand table.

    A field named `value` is added to `left`. For each row it holds the list
    of `value` entries from the rows of `right` found by the interval
    lookup, which is an empty list when nothing is found. With facet keys,
    a left row whose `lkey` value does not occur in the right table also
    gets an empty list.

    Note start coordinates are included and stop coordinates are excluded
    from the interval. Use the `include_stop` keyword argument to include the
    upper bound of the interval when finding overlaps.

    """

    assert (lkey is None) == (rkey is None), \
        'facet key field must be provided for both or neither table'
    if lkey is None:
        lkp = intervallookup(right, start=rstart, stop=rstop, value=value,
                             include_stop=include_stop)

        def f(row):
            return lkp.search(row[lstart], row[lstop])

    else:
        lkp = facetintervallookup(right, rkey, start=rstart, stop=rstop,
                                  value=value, include_stop=include_stop)

        def f(row):
            # read the key outside the lookup so a bad field name still fails
            facet = lkp.get(row[lkey])
            if facet is None:
                # facet never seen in the right table: nothing can overlap
                return []
            return facet.search(row[lstart], row[lstop])

    return addfield(left, value, f)


Table.intervaljoinvalues = intervaljoinvalues
def intervalsubtract(left, right, lstart='start', lstop='stop',
                     rstart='start', rstop='stop', lkey=None, rkey=None,
                     include_stop=False):
    """
    Subtract intervals in the right hand table from intervals in the left
    hand table.

    """

    # facet keys only make sense when given for both sides
    assert (lkey is None) == (rkey is None), \
        'facet key field must be provided for both or neither table'
    return IntervalSubtractView(
        left, right,
        lstart=lstart, lstop=lstop, lkey=lkey,
        rstart=rstart, rstop=rstop, rkey=rkey,
        include_stop=include_stop
    )
Table.intervalsubtract = intervalsubtract


class IntervalSubtractView(Table):
    """
    Lazy table view over `left` with the intervals of `right` subtracted.

    """

    def __init__(self, left, right, lstart='start', lstop='stop',
                 rstart='start', rstop='stop', lkey=None, rkey=None,
                 include_stop=False):
        # left-hand table and its field names
        self.left = left
        self.lstart, self.lstop, self.lkey = lstart, lstop, lkey
        # right-hand table and its field names
        self.right = right
        self.rstart, self.rstop, self.rkey = rstart, rstop, rkey
        self.include_stop = include_stop

    def __iter__(self):
        return iterintervalsubtract(self.left, self.right, self.lstart,
                                    self.lstop, self.rstart, self.rstop,
                                    self.lkey, self.rkey, self.include_stop)


def iterintervalsubtract(left, right, lstart, lstop, rstart, rstop, lkey,
                         rkey, include_stop):
    """
    Generator behind :class:`IntervalSubtractView`.

    Yields the left header, then for each left row either the row unchanged
    (no overlapping right rows found) or one copy of the row per interval
    remaining after subtraction, with the start and stop fields replaced.

    """

    # create iterators and obtain fields
    lit = iter(left)
    lhdr = next(lit)
    lflds = list(map(text_type, lhdr))
    rit = iter(right)
    rhdr = next(rit)

    # check fields via petl.util.asindices (raises FieldSelectionError if spec
    # is not valid)
    asindices(lhdr, lstart)
    asindices(lhdr, lstop)
    if lkey is not None:
        asindices(lhdr, lkey)
    asindices(rhdr, rstart)
    asindices(rhdr, rstop)
    if rkey is not None:
        asindices(rhdr, rkey)

    # output fields are exactly the left fields
    yield tuple(lflds)

    # create getters for start and stop positions
    lstartidx, lstopidx = asindices(lhdr, (lstart, lstop))
    getlcoords = itemgetter(lstartidx, lstopidx)
    getrcoords = itemgetter(*asindices(rhdr, (rstart, rstop)))

    if rkey is None:

        # build interval lookup for right table
        lookup = intervallookup(right, rstart, rstop,
                                include_stop=include_stop)
        search = lookup.search

        def overlaps(lrow):
            begin, end = getlcoords(lrow)
            return begin, end, search(begin, end)

    else:

        # build interval lookup for right table
        lookup = facetintervallookup(right, key=rkey, start=rstart,
                                     stop=rstop, include_stop=include_stop)

        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lhdr, lkey))

        def overlaps(lrow):
            facet = getlkey(lrow)
            begin, end = getlcoords(lrow)
            try:
                found = lookup[facet].search(begin, end)
            except (KeyError, AttributeError):
                found = None
            return begin, end, found

    # main loop
    for lrow in lit:
        start, stop, rrows = overlaps(lrow)
        if rrows:
            # sort by start
            rivs = sorted((getrcoords(rrow) for rrow in rrows),
                          key=itemgetter(0))
            for x, y in _subtract(start, stop, rivs):
                out = list(lrow)
                out[lstartidx] = x
                out[lstopidx] = y
                yield tuple(out)
        else:
            yield tuple(lrow)


from collections import namedtuple
_Interval = namedtuple('Interval', 'start stop')
def collapsedintervals(table, start='start', stop='stop', key=None):
    """
    Utility function to collapse intervals in a table.

    If no facet `key` is given, returns an iterator over `(start, stop)`
    tuples.

    If facet `key` is given, returns an iterator over `(key, start, stop)`
    tuples.

    """

    if key is not None:
        # collapse within each facet, after sorting by facet then start
        ordered = sort(table, key=(key, start))
        groups = rowgroupby(ordered, key=key, value=(start, stop))
        for facet, coords in groups:
            for span in _collapse(coords):
                yield (facet,) + span
        return

    ordered = sort(table, key=start)
    for span in _collapse(values(ordered, (start, stop))):
        yield span
Table.collapsedintervals = collapsedintervals


def _collapse(intervals):
    """
    Collapse an iterable of intervals sorted by start coord.

    Yields `_Interval` tuples. An interval whose start is at or before the
    stop of the current span is merged into it (touching intervals are
    merged too); an interval starting after the current span closes that
    span and opens a new one.

    """
    span = None
    for start, stop in intervals:
        if span is None:
            span = _Interval(start, stop)
        elif start <= span.stop < stop:
            # overlaps or touches the span and reaches further: extend it
            span = _Interval(span.start, stop)
        elif start > span.stop:
            # gap before this interval: emit the span and start afresh
            yield span
            span = _Interval(start, stop)
        # otherwise the interval lies inside the span and adds nothing
    if span is not None:
        yield span


def _subtract(start, stop, intervals):
    """
    Subtract intervals from a spanning interval.

    `intervals` must be sorted by start coord; they are collapsed first.
    Yields the remaining pieces of ``(start, stop)`` as `_Interval` tuples.
    When `intervals` is empty the spanning interval itself is yielded
    (provided ``start < stop``), since there is nothing to remove.

    """
    remainder_start = start
    for sub_start, sub_stop in _collapse(intervals):
        if remainder_start < sub_start:
            # gap between the previous subtracted interval and this one
            yield _Interval(remainder_start, sub_start)
        remainder_start = sub_stop
    # whatever is left after the last subtracted interval; with no
    # intervals at all this is the whole spanning interval
    if remainder_start < stop:
        yield _Interval(remainder_start, stop)