Source code for petl.transform.joins

from __future__ import absolute_import, print_function, division


import itertools
import operator
from petl.compat import next, text_type


from petl.errors import ArgumentError
from petl.comparison import comparable_itemgetter, Comparable
from petl.util.base import Table, asindices, rowgetter, rowgroupby, \
    header, data
from petl.transform.sorts import sort
from petl.transform.basics import cut, cutout
from petl.transform.dedup import distinct


def natural_key(left, right):
    # determine key field or fields
    lhdr = header(left)
    lflds = list(map(str, lhdr))
    rhdr = header(right)
    rflds = list(map(str, rhdr))
    key = [f for f in lflds if f in rflds]
    assert len(key) > 0, 'no fields in common'
    if len(key) == 1:
        key = key[0]  # deal with singletons
    return key


def keys_from_args(left, right, key, lkey, rkey):

    if key is lkey is rkey is None:
        # no keys specified, attempt natural join
        lkey = rkey = natural_key(left, right)
    elif key is not None and lkey is rkey is None:
        # common key specified
        lkey = rkey = key
    elif key is None and lkey is not None and rkey is not None:
        # left and right keys specified
        pass
    else:
        raise ArgumentError(
            'bad key arguments: either specify key, or specify both lkey and '
            'rkey, or provide no key/lkey/rkey arguments at all (natural join)'
        )
    return lkey, rkey


[docs]def join(left, right, key=None, lkey=None, rkey=None, presorted=False, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): """ Perform an equi-join on the given tables. E.g.:: >>> import petl as etl >>> table1 = [['id', 'colour'], ... [1, 'blue'], ... [2, 'red'], ... [3, 'purple']] >>> table2 = [['id', 'shape'], ... [1, 'circle'], ... [3, 'square'], ... [4, 'ellipse']] >>> table3 = etl.join(table1, table2, key='id') >>> table3 +----+----------+----------+ | id | colour | shape | +====+==========+==========+ | 1 | 'blue' | 'circle' | +----+----------+----------+ | 3 | 'purple' | 'square' | +----+----------+----------+ >>> # if no key is given, a natural join is tried ... table4 = etl.join(table1, table2) >>> table4 +----+----------+----------+ | id | colour | shape | +====+==========+==========+ | 1 | 'blue' | 'circle' | +----+----------+----------+ | 3 | 'purple' | 'square' | +----+----------+----------+ >>> # note behaviour if the key is not unique in either or both tables ... table5 = [['id', 'colour'], ... [1, 'blue'], ... [1, 'red'], ... [2, 'purple']] >>> table6 = [['id', 'shape'], ... [1, 'circle'], ... [1, 'square'], ... [2, 'ellipse']] >>> table7 = etl.join(table5, table6, key='id') >>> table7 +----+----------+-----------+ | id | colour | shape | +====+==========+===========+ | 1 | 'blue' | 'circle' | +----+----------+-----------+ | 1 | 'blue' | 'square' | +----+----------+-----------+ | 1 | 'red' | 'circle' | +----+----------+-----------+ | 1 | 'red' | 'square' | +----+----------+-----------+ | 2 | 'purple' | 'ellipse' | +----+----------+-----------+ >>> # compound keys are supported ... table8 = [['id', 'time', 'height'], ... [1, 1, 12.3], ... [1, 2, 34.5], ... [2, 1, 56.7]] >>> table9 = [['id', 'time', 'weight'], ... [1, 2, 4.5], ... [2, 1, 6.7], ... [2, 2, 8.9]] >>> table10 = etl.join(table8, table9, key=['id', 'time']) >>> table10 +----+------+--------+--------+ | id | time | height | weight | +====+======+========+========+ | 1 | 2 | 34.5 | 4.5 | +----+------+--------+--------+ | 2 | 1 | 56.7 | 6.7 | +----+------+--------+--------+ If `presorted` is True, it is assumed that the data are already sorted by the given key, and the `buffersize`, `tempdir` and `cache` arguments are ignored. Otherwise, the data are sorted, see also the discussion of the `buffersize`, `tempdir` and `cache` arguments under the :func:`petl.transform.sorts.sort` function. Left and right tables with different key fields can be handled via the `lkey` and `rkey` arguments. """ # TODO don't read data twice (occurs if using natural key) lkey, rkey = keys_from_args(left, right, key, lkey, rkey) return JoinView(left, right, lkey=lkey, rkey=rkey, presorted=presorted, buffersize=buffersize, tempdir=tempdir, cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.join = join class JoinView(Table): def __init__(self, left, right, lkey, rkey, presorted=False, leftouter=False, rightouter=False, missing=None, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): self.lkey = lkey self.rkey = rkey if presorted: self.left = left self.right = right else: self.left = sort(left, lkey, buffersize=buffersize, tempdir=tempdir, cache=cache) self.right = sort(right, rkey, buffersize=buffersize, tempdir=tempdir, cache=cache) self.leftouter = leftouter self.rightouter = rightouter self.missing = missing self.lprefix = lprefix self.rprefix = rprefix def __iter__(self): return iterjoin(self.left, self.right, self.lkey, self.rkey, leftouter=self.leftouter, rightouter=self.rightouter, missing=self.missing, lprefix=self.lprefix, rprefix=self.rprefix)
[docs]def leftjoin(left, right, key=None, lkey=None, rkey=None, missing=None, presorted=False, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): """ Perform a left outer join on the given tables. E.g.:: >>> import petl as etl >>> table1 = [['id', 'colour'], ... [1, 'blue'], ... [2, 'red'], ... [3, 'purple']] >>> table2 = [['id', 'shape'], ... [1, 'circle'], ... [3, 'square'], ... [4, 'ellipse']] >>> table3 = etl.leftjoin(table1, table2, key='id') >>> table3 +----+----------+----------+ | id | colour | shape | +====+==========+==========+ | 1 | 'blue' | 'circle' | +----+----------+----------+ | 2 | 'red' | None | +----+----------+----------+ | 3 | 'purple' | 'square' | +----+----------+----------+ If `presorted` is True, it is assumed that the data are already sorted by the given key, and the `buffersize`, `tempdir` and `cache` arguments are ignored. Otherwise, the data are sorted, see also the discussion of the `buffersize`, `tempdir` and `cache` arguments under the :func:`petl.transform.sorts.sort` function. Left and right tables with different key fields can be handled via the `lkey` and `rkey` arguments. """ # TODO don't read data twice (occurs if using natural key) lkey, rkey = keys_from_args(left, right, key, lkey, rkey) return JoinView(left, right, lkey=lkey, rkey=rkey, presorted=presorted, leftouter=True, rightouter=False, missing=missing, buffersize=buffersize, tempdir=tempdir, cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.leftjoin = leftjoin
[docs]def rightjoin(left, right, key=None, lkey=None, rkey=None, missing=None, presorted=False, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): """ Perform a right outer join on the given tables. E.g.:: >>> import petl as etl >>> table1 = [['id', 'colour'], ... [1, 'blue'], ... [2, 'red'], ... [3, 'purple']] >>> table2 = [['id', 'shape'], ... [1, 'circle'], ... [3, 'square'], ... [4, 'ellipse']] >>> table3 = etl.rightjoin(table1, table2, key='id') >>> table3 +----+----------+-----------+ | id | colour | shape | +====+==========+===========+ | 1 | 'blue' | 'circle' | +----+----------+-----------+ | 3 | 'purple' | 'square' | +----+----------+-----------+ | 4 | None | 'ellipse' | +----+----------+-----------+ If `presorted` is True, it is assumed that the data are already sorted by the given key, and the `buffersize`, `tempdir` and `cache` arguments are ignored. Otherwise, the data are sorted, see also the discussion of the `buffersize`, `tempdir` and `cache` arguments under the :func:`petl.transform.sorts.sort` function. Left and right tables with different key fields can be handled via the `lkey` and `rkey` arguments. """ # TODO don't read data twice (occurs if using natural key) lkey, rkey = keys_from_args(left, right, key, lkey, rkey) return JoinView(left, right, lkey=lkey, rkey=rkey, presorted=presorted, leftouter=False, rightouter=True, missing=missing, buffersize=buffersize, tempdir=tempdir, cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.rightjoin = rightjoin
[docs]def outerjoin(left, right, key=None, lkey=None, rkey=None, missing=None, presorted=False, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): """ Perform a full outer join on the given tables. E.g.:: >>> import petl as etl >>> table1 = [['id', 'colour'], ... [1, 'blue'], ... [2, 'red'], ... [3, 'purple']] >>> table2 = [['id', 'shape'], ... [1, 'circle'], ... [3, 'square'], ... [4, 'ellipse']] >>> table3 = etl.outerjoin(table1, table2, key='id') >>> table3 +----+----------+-----------+ | id | colour | shape | +====+==========+===========+ | 1 | 'blue' | 'circle' | +----+----------+-----------+ | 2 | 'red' | None | +----+----------+-----------+ | 3 | 'purple' | 'square' | +----+----------+-----------+ | 4 | None | 'ellipse' | +----+----------+-----------+ If `presorted` is True, it is assumed that the data are already sorted by the given key, and the `buffersize`, `tempdir` and `cache` arguments are ignored. Otherwise, the data are sorted, see also the discussion of the `buffersize`, `tempdir` and `cache` arguments under the :func:`petl.transform.sorts.sort` function. Left and right tables with different key fields can be handled via the `lkey` and `rkey` arguments. """ # TODO don't read data twice (occurs if using natural key) lkey, rkey = keys_from_args(left, right, key, lkey, rkey) return JoinView(left, right, lkey=lkey, rkey=rkey, presorted=presorted, leftouter=True, rightouter=True, missing=missing, buffersize=buffersize, tempdir=tempdir, cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.outerjoin = outerjoin def iterjoin(left, right, lkey, rkey, leftouter=False, rightouter=False, missing=None, lprefix=None, rprefix=None): lit = iter(left) rit = iter(right) lhdr = next(lit) rhdr = next(rit) # determine indices of the key fields in left and right tables lkind = asindices(lhdr, lkey) rkind = asindices(rhdr, rkey) # construct functions to extract key values from both tables lgetk = comparable_itemgetter(*lkind) rgetk = comparable_itemgetter(*rkind) # determine indices of non-key fields in the right table # (in the output, we only include key fields from the left table - we # don't want to duplicate fields) rvind = [i for i in range(len(rhdr)) if i not in rkind] rgetv = rowgetter(*rvind) # determine the output fields if lprefix is None: outhdr = list(lhdr) else: outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr] if rprefix is None: outhdr.extend(rgetv(rhdr)) else: outhdr.extend([(text_type(rprefix) + text_type(f)) for f in rgetv(rhdr)]) yield tuple(outhdr) # define a function to join two groups of rows def joinrows(_lrowgrp, _rrowgrp): if _rrowgrp is None: for lrow in _lrowgrp: outrow = list(lrow) # start with the left row # extend with missing values in place of the right row outrow.extend([missing] * len(rvind)) yield tuple(outrow) elif _lrowgrp is None: for rrow in _rrowgrp: # start with missing values in place of the left row outrow = [missing] * len(lhdr) # set key values for li, ri in zip(lkind, rkind): outrow[li] = rrow[ri] # extend with non-key values from the right row outrow.extend(rgetv(rrow)) yield tuple(outrow) else: _rrowgrp = list(_rrowgrp) # may need to iterate more than once for lrow in _lrowgrp: for rrow in _rrowgrp: # start with the left row outrow = list(lrow) # extend with non-key values from the right row outrow.extend(rgetv(rrow)) yield tuple(outrow) # construct group iterators for both tables lgit = itertools.groupby(lit, key=lgetk) rgit = itertools.groupby(rit, key=rgetk) lrowgrp = [] rrowgrp = [] # loop until *either* of the iterators is exhausted # initialise here to handle empty tables lkval, rkval = Comparable(None), Comparable(None) try: # pick off initial row groups lkval, lrowgrp = next(lgit) rkval, rrowgrp = next(rgit) while True: if lkval < rkval: if leftouter: for row in joinrows(lrowgrp, None): yield tuple(row) # advance left lkval, lrowgrp = next(lgit) elif lkval > rkval: if rightouter: for row in joinrows(None, rrowgrp): yield tuple(row) # advance right rkval, rrowgrp = next(rgit) else: for row in joinrows(lrowgrp, rrowgrp): yield tuple(row) # advance both lkval, lrowgrp = next(lgit) rkval, rrowgrp = next(rgit) except StopIteration: pass # make sure any left rows remaining are yielded if leftouter: if lkval > rkval: # yield anything that got left hanging for row in joinrows(lrowgrp, None): yield tuple(row) # yield the rest for lkval, lrowgrp in lgit: for row in joinrows(lrowgrp, None): yield tuple(row) # make sure any right rows remaining are yielded if rightouter: if lkval < rkval: # yield anything that got left hanging for row in joinrows(None, rrowgrp): yield tuple(row) # yield the rest for rkval, rrowgrp in rgit: for row in joinrows(None, rrowgrp): yield tuple(row)
[docs]def crossjoin(*tables, **kwargs): """ Form the cartesian product of the given tables. E.g.:: >>> import petl as etl >>> table1 = [['id', 'colour'], ... [1, 'blue'], ... [2, 'red']] >>> table2 = [['id', 'shape'], ... [1, 'circle'], ... [3, 'square']] >>> table3 = etl.crossjoin(table1, table2) >>> table3 +----+--------+----+----------+ | id | colour | id | shape | +====+========+====+==========+ | 1 | 'blue' | 1 | 'circle' | +----+--------+----+----------+ | 1 | 'blue' | 3 | 'square' | +----+--------+----+----------+ | 2 | 'red' | 1 | 'circle' | +----+--------+----+----------+ | 2 | 'red' | 3 | 'square' | +----+--------+----+----------+ If `prefix` is `True` then field names in the output table header will be prefixed by the index of the input table. """ return CrossJoinView(*tables, **kwargs)
Table.crossjoin = crossjoin class CrossJoinView(Table): def __init__(self, *sources, **kwargs): self.sources = sources self.prefix = kwargs.get('prefix', False) def __iter__(self): return itercrossjoin(self.sources, self.prefix) def itercrossjoin(sources, prefix): # construct fields outhdr = list() for i, s in enumerate(sources): if prefix: # use one-based numbering outhdr.extend([text_type(i+1) + '_' + text_type(f) for f in header(s)]) else: outhdr.extend(header(s)) yield tuple(outhdr) datasrcs = [data(src) for src in sources] for prod in itertools.product(*datasrcs): outrow = list() for row in prod: outrow.extend(row) yield tuple(outrow)
[docs]def antijoin(left, right, key=None, lkey=None, rkey=None, presorted=False, buffersize=None, tempdir=None, cache=True): """ Return rows from the `left` table where the key value does not occur in the `right` table. E.g.:: >>> import petl as etl >>> table1 = [['id', 'colour'], ... [0, 'black'], ... [1, 'blue'], ... [2, 'red'], ... [4, 'yellow'], ... [5, 'white']] >>> table2 = [['id', 'shape'], ... [1, 'circle'], ... [3, 'square']] >>> table3 = etl.antijoin(table1, table2, key='id') >>> table3 +----+----------+ | id | colour | +====+==========+ | 0 | 'black' | +----+----------+ | 2 | 'red' | +----+----------+ | 4 | 'yellow' | +----+----------+ | 5 | 'white' | +----+----------+ If `presorted` is True, it is assumed that the data are already sorted by the given key, and the `buffersize`, `tempdir` and `cache` arguments are ignored. Otherwise, the data are sorted, see also the discussion of the `buffersize`, `tempdir` and `cache` arguments under the :func:`petl.transform.sorts.sort` function. Left and right tables with different key fields can be handled via the `lkey` and `rkey` arguments. """ lkey, rkey = keys_from_args(left, right, key, lkey, rkey) return AntiJoinView(left=left, right=right, lkey=lkey, rkey=rkey, presorted=presorted, buffersize=buffersize, tempdir=tempdir, cache=cache)
Table.antijoin = antijoin class AntiJoinView(Table): def __init__(self, left, right, lkey, rkey, presorted=False, buffersize=None, tempdir=None, cache=True): if presorted: self.left = left self.right = right else: self.left = sort(left, lkey, buffersize=buffersize, tempdir=tempdir, cache=cache) self.right = sort(right, rkey, buffersize=buffersize, tempdir=tempdir, cache=cache) self.lkey = lkey self.rkey = rkey def __iter__(self): return iterantijoin(self.left, self.right, self.lkey, self.rkey) def iterantijoin(left, right, lkey, rkey): lit = iter(left) rit = iter(right) lhdr = next(lit) rhdr = next(rit) yield tuple(lhdr) # determine indices of the key fields in left and right tables lkind = asindices(lhdr, lkey) rkind = asindices(rhdr, rkey) # construct functions to extract key values from both tables lgetk = comparable_itemgetter(*lkind) rgetk = comparable_itemgetter(*rkind) # construct group iterators for both tables lgit = itertools.groupby(lit, key=lgetk) rgit = itertools.groupby(rit, key=rgetk) lrowgrp = [] # loop until *either* of the iterators is exhausted lkval, rkval = Comparable(None), Comparable(None) try: # pick off initial row groups lkval, lrowgrp = next(lgit) rkval, _ = next(rgit) while True: if lkval < rkval: for row in lrowgrp: yield tuple(row) # advance left lkval, lrowgrp = next(lgit) elif lkval > rkval: # advance right rkval, _ = next(rgit) else: # advance both lkval, lrowgrp = next(lgit) rkval, _ = next(rgit) except StopIteration: pass # any left over? if lkval > rkval: # yield anything that got left hanging for row in lrowgrp: yield tuple(row) # and the rest... for lkval, lrowgrp in lgit: for row in lrowgrp: yield tuple(row)
[docs]def lookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None, presorted=False, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): """ Perform a left join, but where the key is not unique in the right-hand table, arbitrarily choose the first row and ignore others. E.g.:: >>> import petl as etl >>> table1 = [['id', 'color', 'cost'], ... [1, 'blue', 12], ... [2, 'red', 8], ... [3, 'purple', 4]] >>> table2 = [['id', 'shape', 'size'], ... [1, 'circle', 'big'], ... [1, 'circle', 'small'], ... [2, 'square', 'tiny'], ... [2, 'square', 'big'], ... [3, 'ellipse', 'small'], ... [3, 'ellipse', 'tiny']] >>> table3 = etl.lookupjoin(table1, table2, key='id') >>> table3 +----+----------+------+-----------+---------+ | id | color | cost | shape | size | +====+==========+======+===========+=========+ | 1 | 'blue' | 12 | 'circle' | 'big' | +----+----------+------+-----------+---------+ | 2 | 'red' | 8 | 'square' | 'tiny' | +----+----------+------+-----------+---------+ | 3 | 'purple' | 4 | 'ellipse' | 'small' | +----+----------+------+-----------+---------+ See also :func:`petl.transform.joins.leftjoin`. """ lkey, rkey = keys_from_args(left, right, key, lkey, rkey) return LookupJoinView(left, right, lkey, rkey, presorted=presorted, missing=missing, buffersize=buffersize, tempdir=tempdir, cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.lookupjoin = lookupjoin class LookupJoinView(Table): def __init__(self, left, right, lkey, rkey, presorted=False, missing=None, buffersize=None, tempdir=None, cache=True, lprefix=None, rprefix=None): if presorted: self.left = left self.right = right else: self.left = sort(left, lkey, buffersize=buffersize, tempdir=tempdir, cache=cache) self.right = sort(right, rkey, buffersize=buffersize, tempdir=tempdir, cache=cache) self.lkey = lkey self.rkey = rkey self.missing = missing self.lprefix = lprefix self.rprefix = rprefix def __iter__(self): return iterlookupjoin(self.left, self.right, self.lkey, self.rkey, missing=self.missing, lprefix=self.lprefix, rprefix=self.rprefix) def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None, rprefix=None): lit = iter(left) rit = iter(right) lhdr = next(lit) rhdr = next(rit) # determine indices of the key fields in left and right tables lkind = asindices(lhdr, lkey) rkind = asindices(rhdr, rkey) # construct functions to extract key values from both tables lgetk = operator.itemgetter(*lkind) rgetk = operator.itemgetter(*rkind) # determine indices of non-key fields in the right table # (in the output, we only include key fields from the left table - we # don't want to duplicate fields) rvind = [i for i in range(len(rhdr)) if i not in rkind] rgetv = rowgetter(*rvind) # determine the output fields if lprefix is None: outhdr = list(lhdr) else: outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr] if rprefix is None: outhdr.extend(rgetv(rhdr)) else: outhdr.extend([(text_type(rprefix) + text_type(f)) for f in rgetv(rhdr)]) yield tuple(outhdr) # define a function to join two groups of rows def joinrows(_lrowgrp, _rrowgrp): if _rrowgrp is None: for lrow in _lrowgrp: outrow = list(lrow) # start with the left row # extend with missing values in place of the right row outrow.extend([missing] * len(rvind)) yield tuple(outrow) else: rrow = next(iter(_rrowgrp)) # pick first arbitrarily for lrow in _lrowgrp: # start with the left row outrow = list(lrow) # extend with non-key values from the right row outrow.extend(rgetv(rrow)) yield tuple(outrow) # construct group iterators for both tables lgit = itertools.groupby(lit, key=lgetk) rgit = itertools.groupby(rit, key=rgetk) lrowgrp = [] # loop until *either* of the iterators is exhausted lkval, rkval = None, None # initialise here to handle empty tables try: # pick off initial row groups lkval, lrowgrp = next(lgit) rkval, rrowgrp = next(rgit) while True: if lkval < rkval: for row in joinrows(lrowgrp, None): yield tuple(row) # advance left lkval, lrowgrp = next(lgit) elif lkval > rkval: # advance right rkval, rrowgrp = next(rgit) else: for row in joinrows(lrowgrp, rrowgrp): yield tuple(row) # advance both lkval, lrowgrp = next(lgit) rkval, rrowgrp = next(rgit) except StopIteration: pass # make sure any left rows remaining are yielded if lkval > rkval: # yield anything that got left hanging for row in joinrows(lrowgrp, None): yield tuple(row) # yield the rest for lkval, lrowgrp in lgit: for row in joinrows(lrowgrp, None): yield tuple(row)
[docs]def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False, buffersize=None, tempdir=None, cache=True): """ Split a table into two tables by reversing an inner join. E.g.:: >>> import petl as etl >>> # join key is present in the table ... table1 = (('foo', 'bar', 'baz'), ... ('A', 1, 'apple'), ... ('B', 1, 'apple'), ... ('C', 2, 'orange')) >>> table2, table3 = etl.unjoin(table1, 'baz', key='bar') >>> table2 +-----+-----+ | foo | bar | +=====+=====+ | 'A' | 1 | +-----+-----+ | 'B' | 1 | +-----+-----+ | 'C' | 2 | +-----+-----+ >>> table3 +-----+----------+ | bar | baz | +=====+==========+ | 1 | 'apple' | +-----+----------+ | 2 | 'orange' | +-----+----------+ >>> # an integer join key can also be reconstructed ... table4 = (('foo', 'bar'), ... ('A', 'apple'), ... ('B', 'apple'), ... ('C', 'orange')) >>> table5, table6 = etl.unjoin(table4, 'bar') >>> table5 +-----+--------+ | foo | bar_id | +=====+========+ | 'A' | 1 | +-----+--------+ | 'B' | 1 | +-----+--------+ | 'C' | 2 | +-----+--------+ >>> table6 +----+----------+ | id | bar | +====+==========+ | 1 | 'apple' | +----+----------+ | 2 | 'orange' | +----+----------+ The `autoincrement` parameter controls how an integer join key is reconstructed, and should be a tuple of (`start`, `step`). """ if key is None: # first sort the table by the value field if presorted: tbl_sorted = table else: tbl_sorted = sort(table, value, buffersize=buffersize, tempdir=tempdir, cache=cache) # on the left, return the original table but with the value field # replaced by an incrementing integer left = ConvertToIncrementingCounterView(tbl_sorted, value, autoincrement) # on the right, return a new table with distinct values from the # given field right = EnumerateDistinctView(tbl_sorted, value, autoincrement) else: # on the left, return distinct rows from the original table # with the value field cut out left = distinct(cutout(table, value)) # on the right, return distinct rows from the original table # with all fields but the key and value cut out right = distinct(cut(table, key, value)) return left, right
class ConvertToIncrementingCounterView(Table): def __init__(self, tbl, value, autoincrement): self.table = tbl self.value = value self.autoincrement = autoincrement def __iter__(self): it = iter(self.table) hdr = next(it) table = itertools.chain([hdr], it) value = self.value vidx = hdr.index(value) outhdr = list(hdr) outhdr[vidx] = '%s_id' % value yield tuple(outhdr) offset, multiplier = self.autoincrement for n, (_, group) in enumerate(rowgroupby(table, value)): for row in group: outrow = list(row) outrow[vidx] = (n * multiplier) + offset yield tuple(outrow) Table.unjoin = unjoin class EnumerateDistinctView(Table): def __init__(self, tbl, value, autoincrement): self.table = tbl self.value = value self.autoincrement = autoincrement def __iter__(self): offset, multiplier = self.autoincrement yield ('id', self.value) for n, (v, _) in enumerate(rowgroupby(self.table, self.value)): yield ((n * multiplier) + offset, v)