# Source code for petl.transform.regex

from __future__ import absolute_import, print_function, division


import re
import operator
from petl.compat import next, text_type


from petl.errors import ArgumentError
from petl.util.base import Table, asindices
from petl.transform.basics import TransformError
from petl.transform.conversions import convert


def capture(table, field, pattern, newfields=None, include_original=False,
            flags=0, fill=None):
    """
    Extend a table with new fields holding the groups captured by applying
    a regular expression to the values of an existing field. E.g.::

        >>> import petl as etl
        >>> table1 = [['id', 'variable', 'value'],
        ...           ['1', 'A1', '12'],
        ...           ['2', 'A2', '15'],
        ...           ['3', 'B1', '18'],
        ...           ['4', 'C12', '19']]
        >>> table2 = etl.capture(table1, 'variable', '(\\w)(\\d+)',
        ...                      ['treat', 'time'])
        >>> table2
        +-----+-------+-------+------+
        | id  | value | treat | time |
        +=====+=======+=======+======+
        | '1' | '12'  | 'A'   | '1'  |
        +-----+-------+-------+------+
        | '2' | '15'  | 'A'   | '2'  |
        +-----+-------+-------+------+
        | '3' | '18'  | 'B'   | '1'  |
        +-----+-------+-------+------+
        | '4' | '19'  | 'C'   | '12' |
        +-----+-------+-------+------+

        >>> # using the include_original argument
        ... table3 = etl.capture(table1, 'variable', '(\\w)(\\d+)',
        ...                      ['treat', 'time'],
        ...                      include_original=True)
        >>> table3
        +-----+----------+-------+-------+------+
        | id  | variable | value | treat | time |
        +=====+==========+=======+=======+======+
        | '1' | 'A1'     | '12'  | 'A'   | '1'  |
        +-----+----------+-------+-------+------+
        | '2' | 'A2'     | '15'  | 'A'   | '2'  |
        +-----+----------+-------+-------+------+
        | '3' | 'B1'     | '18'  | 'B'   | '1'  |
        +-----+----------+-------+-------+------+
        | '4' | 'C12'    | '19'  | 'C'   | '12' |
        +-----+----------+-------+-------+------+

    The searched field is left out of the result unless
    ``include_original`` is true.

    ``fill`` may be a list or tuple of values to append when the pattern
    does not match a value; it should hold one value per capturing group
    in the pattern. When ``fill`` is ``None`` (the default) a
    ``petl.transform.TransformError`` is raised on the first value that
    does not match.

    """

    # Forward every option by keyword to the lazy view.
    options = dict(
        newfields=newfields,
        include_original=include_original,
        flags=flags,
        fill=fill,
    )
    return CaptureView(table, field, pattern, **options)
Table.capture = capture


class CaptureView(Table):
    """Table view that stores the :func:`capture` arguments and delegates
    row generation to :func:`itercapture` each time it is iterated."""

    def __init__(self, source, field, pattern, newfields=None,
                 include_original=False, flags=0, fill=None):
        self.source = source
        self.field = field
        self.pattern = pattern
        self.newfields = newfields
        self.include_original = include_original
        self.flags = flags
        self.fill = fill

    def __iter__(self):
        return itercapture(self.source, self.field, self.pattern,
                           self.newfields, self.include_original,
                           self.flags, self.fill)


def itercapture(source, field, pattern, newfields, include_original, flags,
                fill):
    """
    Generate the rows of a capture view.

    The first item yielded is the output header: the source header (as
    text), minus the searched field unless `include_original` is true,
    followed by `newfields` if given. Each following item is a data row
    with the searched value removed (unless `include_original`) and the
    regex groups, or `fill` when there is no match, appended.

    `field` may be a field name or an integer index; negative indices
    count from the end of the header. An empty source yields nothing.

    Raises `ArgumentError` if `field` cannot be resolved against the
    header, and `TransformError` if a value does not match `pattern` and
    `fill` is None.

    """

    it = iter(source)
    prog = re.compile(pattern, flags)

    # A bare next() here would leak StopIteration out of the generator,
    # which Python 3.7+ turns into RuntimeError (PEP 479).
    try:
        hdr = next(it)
    except StopIteration:
        return
    flds = list(map(text_type, hdr))

    # Resolve the field to a non-negative column index so that the
    # positional comparisons below work for negative indices too.
    ncols = len(hdr)
    if isinstance(field, int) and -ncols <= field < ncols:
        field_index = field + ncols if field < 0 else field
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        # Delete by position: list.remove(field) fails when `field` is an
        # integer index because the header list holds text.
        del outhdr[field_index]
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        match = prog.search(value)
        if match is not None:
            out_row.extend(match.groups())
        elif fill is not None:
            out_row.extend(fill)
        else:
            raise TransformError('value %r did not match pattern %r'
                                 % (value, pattern))
        yield tuple(out_row)
def split(table, field, pattern, newfields=None, include_original=False,
          maxsplit=0, flags=0):
    """
    Extend a table with new fields whose values come from splitting an
    existing value around matches of a regular expression. E.g.::

        >>> import petl as etl
        >>> table1 = [['id', 'variable', 'value'],
        ...           ['1', 'parad1', '12'],
        ...           ['2', 'parad2', '15'],
        ...           ['3', 'tempd1', '18'],
        ...           ['4', 'tempd2', '19']]
        >>> table2 = etl.split(table1, 'variable', 'd', ['variable', 'day'])
        >>> table2
        +-----+-------+----------+-----+
        | id  | value | variable | day |
        +=====+=======+==========+=====+
        | '1' | '12'  | 'para'   | '1' |
        +-----+-------+----------+-----+
        | '2' | '15'  | 'para'   | '2' |
        +-----+-------+----------+-----+
        | '3' | '18'  | 'temp'   | '1' |
        +-----+-------+----------+-----+
        | '4' | '19'  | 'temp'   | '2' |
        +-----+-------+----------+-----+

    The field being split is left out of the result unless
    ``include_original`` is true.

    """

    # Arguments are forwarded by name to the lazy view.
    return SplitView(
        table,
        field,
        pattern,
        newfields=newfields,
        include_original=include_original,
        maxsplit=maxsplit,
        flags=flags,
    )
Table.split = split


class SplitView(Table):
    """Table view that stores the :func:`split` arguments and delegates
    row generation to :func:`itersplit` each time it is iterated."""

    def __init__(self, source, field, pattern, newfields=None,
                 include_original=False, maxsplit=0, flags=0):
        self.source = source
        self.field = field
        self.pattern = pattern
        self.newfields = newfields
        self.include_original = include_original
        self.maxsplit = maxsplit
        self.flags = flags

    def __iter__(self):
        return itersplit(self.source, self.field, self.pattern,
                         self.newfields, self.include_original,
                         self.maxsplit, self.flags)


def itersplit(source, field, pattern, newfields, include_original, maxsplit,
              flags):
    """
    Generate the rows of a split view.

    The first item yielded is the output header: the source header (as
    text), minus the split field unless `include_original` is true,
    followed by `newfields` if given. Each following item is a data row
    with the split value removed (unless `include_original`) and the
    pieces returned by the compiled pattern's ``split(value, maxsplit)``
    appended.

    `field` may be a field name or an integer index; negative indices
    count from the end of the header. An empty source yields nothing.

    Raises `ArgumentError` if `field` cannot be resolved against the
    header.

    """

    it = iter(source)
    prog = re.compile(pattern, flags)

    # A bare next() here would leak StopIteration out of the generator,
    # which Python 3.7+ turns into RuntimeError (PEP 479).
    try:
        hdr = next(it)
    except StopIteration:
        return
    flds = list(map(text_type, hdr))

    # Resolve the field to a non-negative column index so that the
    # positional comparisons below work for negative indices too.
    ncols = len(hdr)
    if isinstance(field, int) and -ncols <= field < ncols:
        field_index = field + ncols if field < 0 else field
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        # Delete by position rather than by value: the raw header value may
        # not be text (so it is absent from the stringified list), and with
        # duplicate names list.remove() could drop the wrong column.
        del outhdr[field_index]
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        out_row.extend(prog.split(value, maxsplit))
        yield tuple(out_row)
def sub(table, field, pattern, repl, count=0, flags=0):
    """
    Convenience function that converts the values under the given field by
    applying a regular expression substitution. See also :func:`re.sub`.

    """

    compiled = re.compile(pattern, flags)

    def substitute(value):
        # Replace up to `count` matches (0 means all) with `repl`.
        return compiled.sub(repl, value, count=count)

    return convert(table, field, substitute)
Table.sub = sub


def search(table, *args, **kwargs):
    """
    Perform a regular expression search, returning rows that match a given
    pattern, either anywhere in the row or within a specific field.

    Call as ``search(table, pattern)`` to search every value in each row,
    or as ``search(table, field, pattern)`` to search only the given
    field(s). Keyword arguments (``flags``, ``complement``) are passed to
    :class:`SearchView`.

    Raises `ArgumentError` if neither one nor two positional arguments
    follow `table`.

    """

    # NOTE(review): this definition was absent from the file although
    # `Table.search = search` and `searchcomplement` both depend on it; it
    # is reconstructed from the SearchView signature and the call forms
    # documented on `searchcomplement` -- confirm against upstream.
    if len(args) == 1:
        field = None
        pattern = args[0]
    elif len(args) == 2:
        field, pattern = args
    else:
        raise ArgumentError('expected 1 or 2 positional arguments')
    return SearchView(table, pattern, field=field, **kwargs)


Table.search = search


class SearchView(Table):
    """Table view that stores the search arguments and delegates row
    generation to :func:`itersearch` each time it is iterated."""

    def __init__(self, table, pattern, field=None, flags=0, complement=False):
        self.table = table
        self.pattern = pattern
        self.field = field
        self.flags = flags
        self.complement = complement

    def __iter__(self):
        return itersearch(self.table, self.pattern, self.field, self.flags,
                          self.complement)


def itersearch(table, pattern, field, flags, complement):
    """
    Generate the rows of a search view.

    Yields the header as a tuple, then every data row (as a tuple) for
    which the pattern is found -- or, when `complement` is true, every row
    for which it is not found. Values are converted to text before being
    searched. With `field` set to None all values in the row are searched;
    otherwise only the columns selected by ``asindices(hdr, field)``.

    An empty source yields nothing.

    """

    prog = re.compile(pattern, flags)
    it = iter(table)

    # A bare next() here would leak StopIteration out of the generator,
    # which Python 3.7+ turns into RuntimeError (PEP 479).
    try:
        hdr = next(it)
    except StopIteration:
        return
    yield tuple(hdr)

    if field is None:
        # search whole row
        def test(r):
            return any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            index = indices[0]

            def test(r):
                return prog.search(text_type(r[index]))
        else:
            # itemgetter with several indices returns a tuple of values
            getvals = operator.itemgetter(*indices)

            def test(r):
                return any(prog.search(text_type(v)) for v in getvals(r))

    # complement false -> keep matching rows; true -> keep the others
    want_match = not complement
    for row in it:
        if bool(test(row)) == want_match:
            yield tuple(row)
def searchcomplement(table, *args, **kwargs):
    """
    Run a regular expression search and keep the rows that **do not**
    match the pattern, either anywhere in the row or within a specific
    field. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['orange', 12, 'oranges are nice fruit'],
        ...           ['mango', 42, 'I like them'],
        ...           ['banana', 74, 'lovely too'],
        ...           ['cucumber', 41, 'better than mango']]
        >>> # search any field
        ... table2 = etl.searchcomplement(table1, '.g.')
        >>> table2
        +----------+-----+--------------+
        | foo      | bar | baz          |
        +==========+=====+==============+
        | 'banana' |  74 | 'lovely too' |
        +----------+-----+--------------+

        >>> # search a specific field
        ... table3 = etl.searchcomplement(table1, 'foo', '.g.')
        >>> table3
        +------------+-----+---------------------+
        | foo        | bar | baz                 |
        +============+=====+=====================+
        | 'banana'   |  74 | 'lovely too'        |
        +------------+-----+---------------------+
        | 'cucumber' |  41 | 'better than mango' |
        +------------+-----+---------------------+

    The result is the complement of :func:`petl.transform.regex.search`.

    """

    # Same call as search(), with the complement switch forced on.
    nonmatching = search(table, *args, complement=True, **kwargs)
    return nonmatching


Table.searchcomplement = searchcomplement
def splitdown(table, field, pattern, maxsplit=0, flags=0):
    """
    Turn one row into several by splitting a field with a regular
    expression; each piece gets its own row. E.g.::

        >>> import petl as etl
        >>> table1 = [['name', 'roles'],
        ...           ['Jane Doe', 'president,engineer,tailor,lawyer'],
        ...           ['John Doe', 'rocket scientist,optometrist,chef,knight,sailor']]
        >>> table2 = etl.splitdown(table1, 'roles', ',')
        >>> table2.lookall()
        +------------+--------------------+
        | name       | roles              |
        +============+====================+
        | 'Jane Doe' | 'president'        |
        +------------+--------------------+
        | 'Jane Doe' | 'engineer'         |
        +------------+--------------------+
        | 'Jane Doe' | 'tailor'           |
        +------------+--------------------+
        | 'Jane Doe' | 'lawyer'           |
        +------------+--------------------+
        | 'John Doe' | 'rocket scientist' |
        +------------+--------------------+
        | 'John Doe' | 'optometrist'      |
        +------------+--------------------+
        | 'John Doe' | 'chef'             |
        +------------+--------------------+
        | 'John Doe' | 'knight'           |
        +------------+--------------------+
        | 'John Doe' | 'sailor'           |
        +------------+--------------------+

    """

    # Arguments are forwarded by name to the lazy view.
    return SplitDownView(table, field, pattern, maxsplit=maxsplit,
                         flags=flags)
Table.splitdown = splitdown


class SplitDownView(Table):
    """Table view that stores the :func:`splitdown` arguments and delegates
    row generation to :func:`itersplitdown` each time it is iterated."""

    def __init__(self, table, field, pattern, maxsplit=0, flags=0):
        self.table = table
        self.field = field
        self.pattern = pattern
        self.maxsplit = maxsplit
        self.flags = flags

    def __iter__(self):
        return itersplitdown(self.table, self.field, self.pattern,
                             self.maxsplit, self.flags)


def itersplitdown(table, field, pattern, maxsplit, flags):
    """
    Generate the rows of a split-down view.

    Yields the header unchanged (as a tuple). For every data row, the value
    under `field` is split with the compiled pattern and one output row is
    yielded per piece: the piece in the split column, and the row's own
    values in every other header column.

    `field` may be a field name or an integer index; negative indices
    count from the end of the header. An empty source yields nothing.

    Raises `ArgumentError` if `field` cannot be resolved against the
    header.

    """

    prog = re.compile(pattern, flags)
    it = iter(table)

    # A bare next() here would leak StopIteration out of the generator,
    # which Python 3.7+ turns into RuntimeError (PEP 479).
    try:
        hdr = next(it)
    except StopIteration:
        return
    flds = list(map(text_type, hdr))

    # Resolve to a non-negative index: the `i == field_index` test below
    # compares against positions from range(), which are never negative.
    ncols = len(hdr)
    if isinstance(field, int) and -ncols <= field < ncols:
        field_index = field + ncols if field < 0 else field
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    yield tuple(hdr)

    for row in it:
        value = row[field_index]
        for piece in prog.split(value, maxsplit):
            # Output rows are as wide as the header.
            yield tuple(piece if i == field_index else row[i]
                        for i in range(ncols))