Source code for petl.transform.hashjoins

from __future__ import absolute_import, print_function, division


import operator
from petl.compat import next, text_type


from petl.util.base import Table, asindices, rowgetter, iterpeek
from petl.util.lookups import lookup, lookupone
from petl.transform.joins import keys_from_args


def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True,
             lprefix=None, rprefix=None):
    """Hash-based alternative to :func:`petl.transform.joins.join`.

    The join is carried out by building an in-memory lookup over the right
    hand table and then streaming the rows of the left hand table past it.
    This may be faster and/or more resource efficient when the right table
    is small and the left table is large.

    When `cache` is true (the default) the lookup built from the right hand
    table is kept on the returned view and reused by later iterations of
    that view; otherwise it is rebuilt every time the view is iterated.

    Tables whose key fields are named differently can be joined by passing
    `lkey` and `rkey` instead of `key`. If given, `lprefix` and `rprefix`
    are prepended to the output field names coming from the left and right
    table respectively.

    """

    left_key, right_key = keys_from_args(left, right, key, lkey, rkey)
    options = dict(lkey=left_key, rkey=right_key, cache=cache,
                   lprefix=lprefix, rprefix=rprefix)
    return HashJoinView(left, right, **options)
Table.hashjoin = hashjoin


class HashJoinView(Table):
    """Table view yielding the inner hash join of `left` and `right`.

    A lookup keyed on `rkey` is built from the right table when the view is
    iterated. With `cache` true the lookup is built once and reused by later
    iterations; with `cache` false it is rebuilt on every iteration.

    """

    def __init__(self, left, right, lkey, rkey, cache=True, lprefix=None,
                 rprefix=None):
        self.left = left
        self.right = right
        self.lkey = lkey
        self.rkey = rkey
        self.cache = cache
        # lookup over the right table; populated lazily by __iter__
        self.rlookup = None
        self.lprefix = lprefix
        self.rprefix = rprefix

    def __iter__(self):
        # build the lookup on first use, or every time if caching is off
        if not self.cache or self.rlookup is None:
            self.rlookup = lookup(self.right, self.rkey)
        return iterhashjoin(self.left, self.right, self.lkey, self.rkey,
                            self.rlookup, self.lprefix, self.rprefix)


def iterhashjoin(left, right, lkey, rkey, rlookup, lprefix, rprefix):
    """Generate the rows of an inner join driven by the left table.

    The first row yielded is the output header: all left fields followed by
    the non-key right fields, each optionally prefixed with `lprefix` /
    `rprefix`. After that, every left row whose key is present in `rlookup`
    is yielded once per matching right row. Left rows without a match are
    dropped.

    If either input yields no header row the generator finishes without
    yielding anything.

    """

    lit = iter(left)
    rit = iter(right)

    # A bare next() on an exhausted iterator raises StopIteration, which
    # PEP 479 turns into RuntimeError when it escapes a generator body.
    # End the generator explicitly instead.
    try:
        lhdr = next(lit)
        rhdr = next(rit)
    except StopIteration:
        return

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct function to extract key values from left table rows
    lgetk = operator.itemgetter(*lkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            for rrow in rlookup[k]:
                # start with the left row ...
                outrow = list(lrow)
                # ... and extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
                 cache=True, lprefix=None, rprefix=None):
    """Hash-based alternative to :func:`petl.transform.joins.leftjoin`.

    An in-memory lookup is built over the right hand table and the rows of
    the left hand table are then streamed past it. This may be faster and/or
    more resource efficient when the right table is small and the left table
    is large.

    Left rows that have no match in the right table are kept, with
    `missing` filling the positions of the right hand fields.

    When `cache` is true (the default) the lookup built from the right hand
    table is kept on the returned view and reused by later iterations of
    that view; otherwise it is rebuilt every time the view is iterated.

    Tables whose key fields are named differently can be joined by passing
    `lkey` and `rkey` instead of `key`. If given, `lprefix` and `rprefix`
    are prepended to the output field names coming from the left and right
    table respectively.

    """

    left_key, right_key = keys_from_args(left, right, key, lkey, rkey)
    view = HashLeftJoinView(
        left, right, left_key, right_key,
        missing=missing,
        cache=cache,
        lprefix=lprefix,
        rprefix=rprefix,
    )
    return view
Table.hashleftjoin = hashleftjoin


class HashLeftJoinView(Table):
    """Table view yielding the left outer hash join of `left` and `right`.

    A lookup keyed on `rkey` is built from the right table when the view is
    iterated. With `cache` true the lookup is built once and reused by later
    iterations; with `cache` false it is rebuilt on every iteration.

    """

    def __init__(self, left, right, lkey, rkey, missing=None, cache=True,
                 lprefix=None, rprefix=None):
        self.left = left
        self.right = right
        self.lkey = lkey
        self.rkey = rkey
        self.missing = missing
        self.cache = cache
        # lookup over the right table; populated lazily by __iter__
        self.rlookup = None
        self.lprefix = lprefix
        self.rprefix = rprefix

    def __iter__(self):
        # build the lookup on first use, or every time if caching is off
        if not self.cache or self.rlookup is None:
            self.rlookup = lookup(self.right, self.rkey)
        return iterhashleftjoin(self.left, self.right, self.lkey, self.rkey,
                                self.missing, self.rlookup, self.lprefix,
                                self.rprefix)


def iterhashleftjoin(left, right, lkey, rkey, missing, rlookup, lprefix,
                     rprefix):
    """Generate the rows of a left outer join driven by the left table.

    The first row yielded is the output header: all left fields followed by
    the non-key right fields, each optionally prefixed with `lprefix` /
    `rprefix`. After that, every left row whose key is present in `rlookup`
    is yielded once per matching right row. A left row without a match is
    yielded once, padded with `missing` in place of the right hand values.

    If either input yields no header row the generator finishes without
    yielding anything.

    """

    lit = iter(left)
    rit = iter(right)

    # A bare next() on an exhausted iterator raises StopIteration, which
    # PEP 479 turns into RuntimeError when it escapes a generator body.
    # End the generator explicitly instead.
    try:
        lhdr = next(lit)
        rhdr = next(rit)
    except StopIteration:
        return

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct function to extract key values from left table rows
    lgetk = operator.itemgetter(*lkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # padding used for left rows that have no partner in the right table;
    # it does not change between rows, so build it once
    padding = [missing] * len(rvind)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            for rrow in rlookup[k]:
                # start with the left row ...
                outrow = list(lrow)
                # ... and extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # start with the left row ...
            outrow = list(lrow)
            # ... and extend with missing values in place of the right row
            outrow.extend(padding)
            yield tuple(outrow)
def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
                  cache=True, lprefix=None, rprefix=None):
    """Hash-based alternative to :func:`petl.transform.joins.rightjoin`.

    An in-memory lookup is built over the left hand table and the rows of
    the right hand table are then streamed past it. This may be faster
    and/or more resource efficient when the left table is small and the
    right table is large.

    Right rows that have no match in the left table are kept, with
    `missing` filling the positions of the non-key left hand fields.

    When `cache` is true (the default) the lookup built from the left hand
    table is kept on the returned view and reused by later iterations of
    that view; otherwise it is rebuilt every time the view is iterated.

    Tables whose key fields are named differently can be joined by passing
    `lkey` and `rkey` instead of `key`. If given, `lprefix` and `rprefix`
    are prepended to the output field names coming from the left and right
    table respectively.

    """

    left_key, right_key = keys_from_args(left, right, key, lkey, rkey)
    view = HashRightJoinView(
        left, right, left_key, right_key,
        missing=missing,
        cache=cache,
        lprefix=lprefix,
        rprefix=rprefix,
    )
    return view
Table.hashrightjoin = hashrightjoin


class HashRightJoinView(Table):
    """Table view yielding the right outer hash join of `left` and `right`.

    A lookup keyed on `lkey` is built from the left table when the view is
    iterated. With `cache` true the lookup is built once and reused by later
    iterations; with `cache` false it is rebuilt on every iteration.

    """

    def __init__(self, left, right, lkey, rkey, missing=None, cache=True,
                 lprefix=None, rprefix=None):
        self.left = left
        self.right = right
        self.lkey = lkey
        self.rkey = rkey
        self.missing = missing
        self.cache = cache
        # lookup over the left table; populated lazily by __iter__
        self.llookup = None
        self.lprefix = lprefix
        self.rprefix = rprefix

    def __iter__(self):
        # build the lookup on first use, or every time if caching is off
        if not self.cache or self.llookup is None:
            self.llookup = lookup(self.left, self.lkey)
        return iterhashrightjoin(self.left, self.right, self.lkey, self.rkey,
                                 self.missing, self.llookup, self.lprefix,
                                 self.rprefix)


def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Generate the rows of a right outer join driven by the right table.

    The first row yielded is the output header: all left fields followed by
    the non-key right fields, each optionally prefixed with `lprefix` /
    `rprefix`. After that, every right row whose key is present in `llookup`
    is yielded once per matching left row. A right row without a match is
    yielded once: the left hand positions hold `missing`, except the key
    positions, which receive the key values of the right row.

    If either input yields no header row the generator finishes without
    yielding anything.

    """

    lit = iter(left)
    rit = iter(right)

    # A bare next() on an exhausted iterator raises StopIteration, which
    # PEP 479 turns into RuntimeError when it escapes a generator body.
    # End the generator explicitly instead.
    try:
        lhdr = next(lit)
        rhdr = next(rit)
    except StopIteration:
        return

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct function to extract key values from right table rows
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    for rrow in rit:
        k = rgetk(rrow)
        if k in llookup:
            for lrow in llookup[k]:
                # start with the left row ...
                outrow = list(lrow)
                # ... and extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # start with missing values in place of the left row
            outrow = [missing] * len(lhdr)
            # set key values, copied across from the right row
            for li, ri in zip(lkind, rkind):
                outrow[li] = rrow[ri]
            # extend with non-key values from the right row
            outrow.extend(rgetv(rrow))
            yield tuple(outrow)
def hashantijoin(left, right, key=None, lkey=None, rkey=None):
    """Hash-based alternative to :func:`petl.transform.joins.antijoin`.

    The set of all key values occurring in the right hand table is collected
    in memory, after which the rows of the left hand table are streamed and
    only those whose key is absent from that set are kept. This may be
    faster and/or more resource efficient when the right table is small and
    the left table is large.

    Tables whose key fields are named differently can be handled by passing
    `lkey` and `rkey` instead of `key`.

    """

    left_key, right_key = keys_from_args(left, right, key, lkey, rkey)
    view = HashAntiJoinView(left, right, left_key, right_key)
    return view
Table.hashantijoin = hashantijoin


class HashAntiJoinView(Table):
    """Table view yielding the rows of `left` whose key is not in `right`.

    Nothing is cached on the view: the set of right hand keys is collected
    afresh each time the view is iterated.

    """

    def __init__(self, left, right, lkey, rkey):
        self.left = left
        self.right = right
        self.lkey = lkey
        self.rkey = rkey

    def __iter__(self):
        return iterhashantijoin(self.left, self.right, self.lkey, self.rkey)


def iterhashantijoin(left, right, lkey, rkey):
    """Generate the left header, then left rows with no key match in right.

    The header of the left table is yielded unchanged. All key values of the
    right table are then gathered into a set, and each left row is yielded
    only if its key is not a member of that set.

    If either input yields no header row the generator finishes without
    yielding anything.

    """

    lit = iter(left)
    rit = iter(right)

    # A bare next() on an exhausted iterator raises StopIteration, which
    # PEP 479 turns into RuntimeError when it escapes a generator body.
    # End the generator explicitly instead.
    try:
        lhdr = next(lit)
        rhdr = next(rit)
    except StopIteration:
        return
    yield tuple(lhdr)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # collect every key that occurs in the right table
    rkeys = {rgetk(rrow) for rrow in rit}

    for lrow in lit:
        if lgetk(lrow) not in rkeys:
            yield tuple(lrow)
def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
                   lprefix=None, rprefix=None):
    """Hash-based alternative to :func:`petl.transform.joins.lookupjoin`.

    An in-memory lookup is built over the right hand table and the rows of
    the left hand table are then streamed past it. This may be faster and/or
    more resource efficient when the right table is small and the left table
    is large.

    Every left row produces exactly one output row: it is joined to the
    single right row held in the lookup for its key, or padded with
    `missing` when the key is not present in the right table.

    Tables whose key fields are named differently can be joined by passing
    `lkey` and `rkey` instead of `key`. If given, `lprefix` and `rprefix`
    are prepended to the output field names coming from the left and right
    table respectively.

    """

    left_key, right_key = keys_from_args(left, right, key, lkey, rkey)
    view = HashLookupJoinView(
        left, right, left_key, right_key,
        missing=missing,
        lprefix=lprefix,
        rprefix=rprefix,
    )
    return view
Table.hashlookupjoin = hashlookupjoin


class HashLookupJoinView(Table):
    """Table view yielding the lookup join of `left` against `right`.

    Nothing is cached on the view: the lookup over the right table is built
    afresh each time the view is iterated.

    """

    def __init__(self, left, right, lkey, rkey, missing=None, lprefix=None,
                 rprefix=None):
        self.left = left
        self.right = right
        self.lkey = lkey
        self.rkey = rkey
        self.missing = missing
        self.lprefix = lprefix
        self.rprefix = rprefix

    def __iter__(self):
        return iterhashlookupjoin(self.left, self.right, self.lkey,
                                  self.rkey, self.missing, self.lprefix,
                                  self.rprefix)


def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Generate the rows of a lookup join driven by the left table.

    The first row yielded is the output header: all left fields followed by
    the non-key right fields, each optionally prefixed with `lprefix` /
    `rprefix`. After that, each left row is yielded exactly once, extended
    either with the non-key values of the one right row stored in the lookup
    for its key, or with `missing` values when the key is not in the lookup.

    If an input yields no header row the generator finishes without yielding
    anything.

    """

    lit = iter(left)

    # A bare next() on an exhausted iterator raises StopIteration, which
    # PEP 479 turns into RuntimeError when it escapes a generator body.
    # End the generator explicitly instead. The iterpeek call is guarded as
    # well in case it lets StopIteration through for an empty right table
    # (NOTE(review): iterpeek is defined elsewhere - confirm).
    try:
        lhdr = next(lit)
        rhdr, rit = iterpeek(right)  # need the whole lot to pass to lookup
    except StopIteration:
        return

    # one right row per key; strict=False is presumably what makes duplicate
    # keys acceptable - verify against lookupone
    rlookup = lookupone(rit, rkey, strict=False)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct function to extract key values from left table rows
    lgetk = operator.itemgetter(*lkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # padding used for left rows that have no partner in the right table;
    # it does not change between rows, so build it once
    padding = [missing] * len(rvind)

    for lrow in lit:
        k = lgetk(lrow)
        # start with the left row
        outrow = list(lrow)
        if k in rlookup:
            # extend with non-key values from the right row
            outrow.extend(rgetv(rlookup[k]))
        else:
            # extend with missing values in place of the right row
            outrow.extend(padding)
        yield tuple(outrow)