from __future__ import absolute_import, print_function, division
import operator
from petl.compat import next, text_type
from petl.util.base import Table, asindices, rowgetter, iterpeek
from petl.util.lookups import lookup, lookupone
from petl.transform.joins import keys_from_args
[docs]def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True,
lprefix=None, rprefix=None):
"""Alternative implementation of :func:`petl.transform.joins.join`, where
the join is executed by constructing an in-memory lookup for the right
hand table, then iterating over rows from the left hand table.
May be faster and/or more resource efficient where the right table is small
and the left table is large.
By default data from right hand table is cached to improve performance
(only available when `key` is given).
Left and right tables with different key fields can be handled via the
`lkey` and `rkey` arguments.
"""
lkey, rkey = keys_from_args(left, right, key, lkey, rkey)
return HashJoinView(left, right, lkey=lkey, rkey=rkey, cache=cache,
lprefix=lprefix, rprefix=rprefix)
Table.hashjoin = hashjoin
class HashJoinView(Table):
def __init__(self, left, right, lkey, rkey, cache=True, lprefix=None,
rprefix=None):
self.left = left
self.right = right
self.lkey = lkey
self.rkey = rkey
self.cache = cache
self.rlookup = None
self.lprefix = lprefix
self.rprefix = rprefix
def __iter__(self):
if not self.cache or self.rlookup is None:
self.rlookup = lookup(self.right, self.rkey)
return iterhashjoin(self.left, self.right, self.lkey, self.rkey,
self.rlookup, self.lprefix, self.rprefix)
def iterhashjoin(left, right, lkey, rkey, rlookup, lprefix, rprefix):
lit = iter(left)
rit = iter(right)
lhdr = next(lit)
rhdr = next(rit)
# determine indices of the key fields in left and right tables
lkind = asindices(lhdr, lkey)
rkind = asindices(rhdr, rkey)
# construct functions to extract key values from left table
lgetk = operator.itemgetter(*lkind)
# determine indices of non-key fields in the right table
# (in the output, we only include key fields from the left table - we
# don't want to duplicate fields)
rvind = [i for i in range(len(rhdr)) if i not in rkind]
rgetv = rowgetter(*rvind)
# determine the output fields
if lprefix is None:
outhdr = list(lhdr)
else:
outhdr = [(text_type(lprefix) + text_type(f))
for f in lhdr]
if rprefix is None:
outhdr.extend(rgetv(rhdr))
else:
outhdr.extend([(text_type(rprefix) + text_type(f)) for f in rgetv(rhdr)])
yield tuple(outhdr)
# define a function to join rows
def joinrows(_lrow, _rrows):
for rrow in _rrows:
# start with the left row
_outrow = list(_lrow)
# extend with non-key values from the right row
_outrow.extend(rgetv(rrow))
yield tuple(_outrow)
for lrow in lit:
k = lgetk(lrow)
if k in rlookup:
rrows = rlookup[k]
for outrow in joinrows(lrow, rrows):
yield outrow
[docs]def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
cache=True, lprefix=None, rprefix=None):
"""Alternative implementation of :func:`petl.transform.joins.leftjoin`,
where the join is executed by constructing an in-memory lookup for the
right hand table, then iterating over rows from the left hand table.
May be faster and/or more resource efficient where the right table is small
and the left table is large.
By default data from right hand table is cached to improve performance
(only available when `key` is given).
Left and right tables with different key fields can be handled via the
`lkey` and `rkey` arguments.
"""
lkey, rkey = keys_from_args(left, right, key, lkey, rkey)
return HashLeftJoinView(left, right, lkey, rkey, missing=missing,
cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.hashleftjoin = hashleftjoin
class HashLeftJoinView(Table):
def __init__(self, left, right, lkey, rkey, missing=None, cache=True,
lprefix=None, rprefix=None):
self.left = left
self.right = right
self.lkey = lkey
self.rkey = rkey
self.missing = missing
self.cache = cache
self.rlookup = None
self.lprefix = lprefix
self.rprefix = rprefix
def __iter__(self):
if not self.cache or self.rlookup is None:
self.rlookup = lookup(self.right, self.rkey)
return iterhashleftjoin(self.left, self.right, self.lkey, self.rkey,
self.missing, self.rlookup, self.lprefix,
self.rprefix)
def iterhashleftjoin(left, right, lkey, rkey, missing, rlookup, lprefix,
rprefix):
lit = iter(left)
rit = iter(right)
lhdr = next(lit)
rhdr = next(rit)
# determine indices of the key fields in left and right tables
lkind = asindices(lhdr, lkey)
rkind = asindices(rhdr, rkey)
# construct functions to extract key values from left table
lgetk = operator.itemgetter(*lkind)
# determine indices of non-key fields in the right table
# (in the output, we only include key fields from the left table - we
# don't want to duplicate fields)
rvind = [i for i in range(len(rhdr)) if i not in rkind]
rgetv = rowgetter(*rvind)
# determine the output fields
if lprefix is None:
outhdr = list(lhdr)
else:
outhdr = [(text_type(lprefix) + text_type(f))
for f in lhdr]
if rprefix is None:
outhdr.extend(rgetv(rhdr))
else:
outhdr.extend([(text_type(rprefix) + text_type(f)) for f in rgetv(rhdr)])
yield tuple(outhdr)
# define a function to join rows
def joinrows(_lrow, _rrows):
for rrow in _rrows:
# start with the left row
_outrow = list(_lrow)
# extend with non-key values from the right row
_outrow.extend(rgetv(rrow))
yield tuple(_outrow)
for lrow in lit:
k = lgetk(lrow)
if k in rlookup:
rrows = rlookup[k]
for outrow in joinrows(lrow, rrows):
yield outrow
else:
outrow = list(lrow) # start with the left row
# extend with missing values in place of the right row
outrow.extend([missing] * len(rvind))
yield tuple(outrow)
[docs]def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
cache=True, lprefix=None, rprefix=None):
"""Alternative implementation of :func:`petl.transform.joins.rightjoin`,
where the join is executed by constructing an in-memory lookup for the
left hand table, then iterating over rows from the right hand table.
May be faster and/or more resource efficient where the left table is small
and the right table is large.
By default data from right hand table is cached to improve performance
(only available when `key` is given).
Left and right tables with different key fields can be handled via the
`lkey` and `rkey` arguments.
"""
lkey, rkey = keys_from_args(left, right, key, lkey, rkey)
return HashRightJoinView(left, right, lkey, rkey, missing=missing,
cache=cache, lprefix=lprefix, rprefix=rprefix)
Table.hashrightjoin = hashrightjoin
class HashRightJoinView(Table):
def __init__(self, left, right, lkey, rkey, missing=None, cache=True,
lprefix=None, rprefix=None):
self.left = left
self.right = right
self.lkey = lkey
self.rkey = rkey
self.missing = missing
self.cache = cache
self.llookup = None
self.lprefix = lprefix
self.rprefix = rprefix
def __iter__(self):
if not self.cache or self.llookup is None:
self.llookup = lookup(self.left, self.lkey)
return iterhashrightjoin(self.left, self.right, self.lkey, self.rkey,
self.missing, self.llookup, self.lprefix,
self.rprefix)
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
rprefix):
lit = iter(left)
rit = iter(right)
lhdr = next(lit)
rhdr = next(rit)
# determine indices of the key fields in left and right tables
lkind = asindices(lhdr, lkey)
rkind = asindices(rhdr, rkey)
# construct functions to extract key values from left table
rgetk = operator.itemgetter(*rkind)
# determine indices of non-key fields in the right table
# (in the output, we only include key fields from the left table - we
# don't want to duplicate fields)
rvind = [i for i in range(len(rhdr)) if i not in rkind]
rgetv = rowgetter(*rvind)
# determine the output fields
if lprefix is None:
outhdr = list(lhdr)
else:
outhdr = [(text_type(lprefix) + text_type(f))
for f in lhdr]
if rprefix is None:
outhdr.extend(rgetv(rhdr))
else:
outhdr.extend([(text_type(rprefix) + text_type(f))
for f in rgetv(rhdr)])
yield tuple(outhdr)
# define a function to join rows
def joinrows(_rrow, _lrows):
for lrow in _lrows:
# start with the left row
_outrow = list(lrow)
# extend with non-key values from the right row
_outrow.extend(rgetv(_rrow))
yield tuple(_outrow)
for rrow in rit:
k = rgetk(rrow)
if k in llookup:
lrows = llookup[k]
for outrow in joinrows(rrow, lrows):
yield outrow
else:
# start with missing values in place of the left row
outrow = [missing] * len(lhdr)
# set key values
for li, ri in zip(lkind, rkind):
outrow[li] = rrow[ri]
# extend with non-key values from the right row
outrow.extend(rgetv(rrow))
yield tuple(outrow)
[docs]def hashantijoin(left, right, key=None, lkey=None, rkey=None):
"""Alternative implementation of :func:`petl.transform.joins.antijoin`,
where the join is executed by constructing an in-memory set for all keys
found in the right hand table, then iterating over rows from the left
hand table.
May be faster and/or more resource efficient where the right table is small
and the left table is large.
Left and right tables with different key fields can be handled via the
`lkey` and `rkey` arguments.
"""
lkey, rkey = keys_from_args(left, right, key, lkey, rkey)
return HashAntiJoinView(left, right, lkey, rkey)
Table.hashantijoin = hashantijoin
class HashAntiJoinView(Table):
def __init__(self, left, right, lkey, rkey):
self.left = left
self.right = right
self.lkey = lkey
self.rkey = rkey
def __iter__(self):
return iterhashantijoin(self.left, self.right, self.lkey, self.rkey)
def iterhashantijoin(left, right, lkey, rkey):
lit = iter(left)
rit = iter(right)
lhdr = next(lit)
rhdr = next(rit)
yield tuple(lhdr)
# determine indices of the key fields in left and right tables
lkind = asindices(lhdr, lkey)
rkind = asindices(rhdr, rkey)
# construct functions to extract key values from both tables
lgetk = operator.itemgetter(*lkind)
rgetk = operator.itemgetter(*rkind)
rkeys = set()
for rrow in rit:
rk = rgetk(rrow)
rkeys.add(rk)
for lrow in lit:
lk = lgetk(lrow)
if lk not in rkeys:
yield tuple(lrow)
[docs]def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
lprefix=None, rprefix=None):
"""Alternative implementation of :func:`petl.transform.joins.lookupjoin`,
where the join is executed by constructing an in-memory lookup for the
right hand table, then iterating over rows from the left hand table.
May be faster and/or more resource efficient where the right table is small
and the left table is large.
Left and right tables with different key fields can be handled via the
`lkey` and `rkey` arguments.
"""
lkey, rkey = keys_from_args(left, right, key, lkey, rkey)
return HashLookupJoinView(left, right, lkey, rkey, missing=missing,
lprefix=lprefix, rprefix=rprefix)
Table.hashlookupjoin = hashlookupjoin
class HashLookupJoinView(Table):
def __init__(self, left, right, lkey, rkey, missing=None, lprefix=None,
rprefix=None):
self.left = left
self.right = right
self.lkey = lkey
self.rkey = rkey
self.missing = missing
self.lprefix = lprefix
self.rprefix = rprefix
def __iter__(self):
return iterhashlookupjoin(self.left, self.right, self.lkey, self.rkey,
self.missing, self.lprefix, self.rprefix)
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
lit = iter(left)
lhdr = next(lit)
rhdr, rit = iterpeek(right) # need the whole lot to pass to lookup
rlookup = lookupone(rit, rkey, strict=False)
# determine indices of the key fields in left and right tables
lkind = asindices(lhdr, lkey)
rkind = asindices(rhdr, rkey)
# construct functions to extract key values from left table
lgetk = operator.itemgetter(*lkind)
# determine indices of non-key fields in the right table
# (in the output, we only include key fields from the left table - we
# don't want to duplicate fields)
rvind = [i for i in range(len(rhdr)) if i not in rkind]
rgetv = rowgetter(*rvind)
# determine the output fields
if lprefix is None:
outhdr = list(lhdr)
else:
outhdr = [(text_type(lprefix) + text_type(f))
for f in lhdr]
if rprefix is None:
outhdr.extend(rgetv(rhdr))
else:
outhdr.extend([(text_type(rprefix) + text_type(f))
for f in rgetv(rhdr)])
yield tuple(outhdr)
# define a function to join rows
def joinrows(_lrow, _rrow):
# start with the left row
_outrow = list(_lrow)
# extend with non-key values from the right row
_outrow.extend(rgetv(_rrow))
return tuple(_outrow)
for lrow in lit:
k = lgetk(lrow)
if k in rlookup:
rrow = rlookup[k]
yield joinrows(lrow, rrow)
else:
outrow = list(lrow) # start with the left row
# extend with missing values in place of the right row
outrow.extend([missing] * len(rvind))
yield tuple(outrow)