# Source code for petl.io.xml

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


# standard library dependencies
try:
    # prefer lxml as it supports XPath
    from lxml import etree
except ImportError:
    import xml.etree.ElementTree as etree

from operator import attrgetter
import itertools
from petl.compat import string_types, text_type


# internal dependencies
from petl.util.base import Table, fieldnames, iterpeek
from petl.io.sources import read_source_from_arg
from petl.io.text import totext


def fromxml(source, *args, **kwargs):
    """
    Extract data from an XML file. E.g.::

        >>> import petl as etl
        >>> # setup a file to demonstrate with
        ... d = '''<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>'''
        >>> with open('example.file1.xml', 'w') as f:
        ...     f.write(d)
        ...
        212
        >>> table1 = etl.fromxml('example.file1.xml', 'tr', 'td')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    If the data values are stored in an attribute, provide the attribute
    name as an extra positional argument::

        >>> d = '''<table>
        ...     <tr>
        ...         <td v='foo'/><td v='bar'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='a'/><td v='1'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='b'/><td v='2'/>
        ...     </tr>
        ...     <tr>
        ...         <td v='c'/><td v='2'/>
        ...     </tr>
        ... </table>'''
        >>> with open('example.file2.xml', 'w') as f:
        ...     f.write(d)
        ...
        220
        >>> table2 = etl.fromxml('example.file2.xml', 'tr', 'td', 'v')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    Data values can also be extracted by providing a mapping of field
    names to element paths::

        >>> d = '''<table>
        ...     <row>
        ...         <foo>a</foo><baz><bar v='1'/><bar v='3'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>b</foo><baz><bar v='2'/></baz>
        ...     </row>
        ...     <row>
        ...         <foo>c</foo><baz><bar v='2'/></baz>
        ...     </row>
        ... </table>'''
        >>> with open('example.file3.xml', 'w') as f:
        ...     f.write(d)
        ...
        223
        >>> table3 = etl.fromxml('example.file3.xml', 'row',
        ...                      {'foo': 'foo', 'bar': ('baz/bar', 'v')})
        >>> table3
        +------------+-----+
        | bar        | foo |
        +============+=====+
        | ('1', '3') | 'a' |
        +------------+-----+
        | '2'        | 'b' |
        +------------+-----+
        | '2'        | 'c' |
        +------------+-----+

    If `lxml <http://lxml.de/>`_ is installed, full XPath expressions can be
    used.

    Note that the implementation is currently **not** streaming, i.e.,
    the whole document is loaded into memory.

    If multiple elements match a given field, all values are reported as a
    tuple.

    If there is more than one element name used for row values, a tuple
    or list of paths can be provided, e.g.,
    ``fromxml('example.file.html', './/tr', ('th', 'td'))``.

    Optionally a custom parser can be provided, e.g.::

        >>> from lxml import etree # doctest: +SKIP
        ... my_parser = etree.XMLParser(resolve_entities=False) # doctest: +SKIP
        ... table4 = etl.fromxml('example.file1.xml', 'tr', 'td', parser=my_parser) # doctest: +SKIP

    The positional and keyword arguments after `source` are handed on
    unchanged to the table view that performs the extraction.

    """

    # normalise whatever the caller gave us (file name, source object, ...)
    # and hand everything else straight to the view
    return XmlView(read_source_from_arg(source), *args, **kwargs)


class XmlView(Table):
    """Table view that extracts rows from an XML document.

    Three call shapes are accepted for the positional arguments:

    * ``(rmatch, vmatch)`` where `vmatch` is a path string or a tuple/list
      of paths: every element matched by `rmatch` is a row and the text of
      the elements matched by `vmatch` inside it are the values;
    * ``(rmatch, vdict)`` where `vdict` maps field names to either a path
      or a ``(path, attribute)`` pair: a header of the sorted field names
      is emitted first, then one row per element matched by `rmatch`;
    * ``(rmatch, vmatch, attr)``: as the first form, but values are read
      from the attribute `attr` instead of the element text.

    Keyword arguments: `missing` (value used in the mapping form when a
    path matches nothing, default ``None``) and `parser` (a parser object
    passed to ``etree.parse``).
    """

    def __init__(self, source, *args, **kwargs):
        self.source = source
        self.args = args
        if len(args) == 2 and isinstance(args[1],
                                         (string_types, tuple, list)):
            self.rmatch, self.vmatch = args
            self.vdict = None
            self.attr = None
        elif len(args) == 2 and isinstance(args[1], dict):
            self.rmatch, self.vdict = args
            self.vmatch = None
            self.attr = None
        elif len(args) == 3:
            self.rmatch, self.vmatch, self.attr = args
            self.vdict = None
        else:
            # Raised explicitly (rather than via ``assert False``) so the
            # check is not stripped under ``python -O``; the exception type
            # and message are the same as before.
            raise AssertionError('bad parameters')
        self.missing = kwargs.get('missing', None)
        self.user_parser = kwargs.get('parser', None)

    def __iter__(self):
        vmatch = self.vmatch
        vdict = self.vdict

        # The whole document is parsed up front, so the file only needs to
        # stay open for the parse itself and not for as long as the caller
        # keeps the iterator alive.
        with self.source.open('rb') as xmlf:
            parser2 = _create_xml_parser(self.user_parser)
            tree = etree.parse(xmlf, parser=parser2)

        if not hasattr(tree, 'iterfind'):
            # Python 2.6 compatibility
            tree.iterfind = tree.findall

        if vmatch is not None:
            # simple case, all value paths are the same

            # the value getter does not depend on the row, build it once
            if self.attr is None:
                getv = attrgetter('text')
            else:
                attr = self.attr

                def getv(e):
                    return e.get(attr)

            for rowelm in tree.iterfind(self.rmatch):
                if isinstance(vmatch, string_types):
                    # match only one path
                    velms = rowelm.findall(vmatch)
                else:
                    # match multiple paths
                    velms = itertools.chain(*[rowelm.findall(enm)
                                              for enm in vmatch])
                yield tuple(getv(velm) for velm in velms)

        else:
            # difficult case, deal with different paths for each field

            # determine output header
            flds = tuple(sorted(map(text_type, vdict.keys())))
            yield flds

            # setup value getters
            vmatches = dict()
            vgetters = dict()
            for f in flds:
                vmatch = self.vdict[f]
                if isinstance(vmatch, string_types):
                    # match element path
                    vmatches[f] = vmatch
                    vgetters[f] = element_text_getter(self.missing)
                else:
                    # match element path and attribute name
                    vmatches[f] = vmatch[0]
                    attr = vmatch[1]
                    vgetters[f] = attribute_text_getter(attr, self.missing)

            # determine data rows
            for rowelm in tree.iterfind(self.rmatch):
                yield tuple(vgetters[f](rowelm.findall(vmatches[f]))
                            for f in flds)


def _create_xml_parser(user_parser):
    """Return the parser to hand to ``etree.parse``.

    A parser supplied by the caller always wins. Otherwise a parser with
    ``resolve_entities=False`` is attempted; if the active `etree` module
    rejects that keyword, ``None`` is returned so that ``etree.parse``
    falls back to its own default parser.
    """
    if user_parser is not None:
        return user_parser
    try:
        # Default lxml parser.
        # This throws if lxml could not be imported, because Python's
        # built-in XML parser doesn't accept the `resolve_entities` kwarg.
        return etree.XMLParser(resolve_entities=False)
    except TypeError:
        # lxml not available
        return None


def _collapse_matches(values, missing):
    """Reduce extracted values: many -> tuple, one -> itself, none -> missing."""
    if len(values) > 1:
        return tuple(values)
    if len(values) == 1:
        return values[0]
    return missing


def element_text_getter(missing):
    """Build a getter returning the text of a list of matched elements.

    The getter returns a tuple of texts when several elements matched, the
    single text when exactly one matched, and `missing` when none did.
    """
    def _get(v):
        return _collapse_matches([e.text for e in v], missing)
    return _get


def attribute_text_getter(attr, missing):
    """Build a getter returning attribute `attr` of a list of matched elements.

    The getter returns a tuple of attribute values when several elements
    matched, the single value when exactly one matched, and `missing` when
    none did.
    """
    def _get(v):
        return _collapse_matches([e.get(attr) for e in v], missing)
    return _get


def toxml(table, target=None, root=None, head=None, rows=None,
          prologue=None, epilogue=None, style='tag', encoding='utf-8'):
    """
    Write the table into a new xml file according to elements defined in the
    function arguments.

    The `root`, `head` and `rows` (string, optional) arguments define the tags
    and the nesting of the xml file. Each one defines xml elements with tags
    separated by slashes (`/`) like in `root/level/tag`. They can have an
    arbitrary number of tags that will reflect in more nesting levels for the
    header or record/row written in the xml file.

    For details on tag naming and nesting rules check xml `specification`_ or
    xml `references`_.

    The `rows` argument defines the elements for each row of data to be
    written in the xml file. When specified, it must have at least 2 tags for
    defining the tags for `row/column`. Additional tags will add nesting
    enclosing all records/rows/lines.

    The `head` argument is similar to `rows`, but applies only to the one
    line/row of header with fieldnames. When specified, it must have at least
    2 tags for `fields/name` and the remaining will increase nesting.

    The `root` argument defines the elements enclosing `head` and `rows` and
    is required when using `head` for specifying valid xml documents.

    When none of these arguments are specified, they will default to tags that
    generate output similar to a html table:
    `root='table', head='thead/tr/th', rows='tbody/tr/td'`.

    The `prologue` argument (string, optional) could be a snippet of valid xml
    that will be inserted before other elements in the xml. It can optionally
    specify the `XML Prolog` of the file.

    The `epilogue` argument (string, optional) could be a snippet of valid xml
    that will be inserted after all other xml elements except the root closing
    tag. It must specify a closing tag if the `root` argument is not
    specified.

    The `style` argument selects the format of the elements in the xml file.
    It can be `tag` (default), `name`, `attribute` or a custom string to
    format each row via
    `str.format <http://docs.python.org/library/stdtypes.html#str.format>`_.

    Example usage for writing files::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2]]
        >>> etl.toxml(table1, 'example.file4.xml')
        >>> # see what we did is similar a html table:
        >>> print(open('example.file4.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <table><thead>
         <tr><th>foo</th><th>bar</th></tr>
        </thead><tbody>
         <tr><td>a</td><td>1</td></tr>
         <tr><td>b</td><td>2</td></tr>
        </tbody></table>
        >>> # define the nesting in xml file:
        >>> etl.toxml(table1, 'example.file5.xml', rows='plan/line/cell')
        >>> print(open('example.file5.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <plan>
         <line><cell>a</cell><cell>1</cell></line>
         <line><cell>b</cell><cell>2</cell></line>
        </plan>
        >>> # choose other style:
        >>> etl.toxml(table1, 'example.file6.xml', rows='row/col', style='attribute')
        >>> print(open('example.file6.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <row>
         <col foo="a" bar="1" />
         <col foo="b" bar="2" />
        </row>
        >>> etl.toxml(table1, 'example.file6.xml', rows='row/col', style='name')
        >>> print(open('example.file6.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <row>
         <col><foo>a</foo><bar>1</bar></col>
         <col><foo>b</foo><bar>2</bar></col>
        </row>

    The `toxml()` function is just a wrapper over :func:`petl.io.text.totext`.
    For advanced cases use a template with `totext()` for generating xml files.

    .. versionadded:: 1.7.0

    .. _specification: https://www.w3.org/TR/xml/
    .. _references: https://www.w3schools.com/xml/xml_syntax.asp

    """
    # with no layout given at all, fall back to the html-table-like layout
    if not (root or head or rows):
        root, head, rows = 'table', 'thead/tr/th', 'tbody/tr/td'

    # peek at the start of the table to learn the field names without
    # consuming the rows that still have to be written
    peeked, remaining = iterpeek(table, 2)
    names = fieldnames(peeked)

    # everything before the rows, the per-row template, everything after
    opening = _build_xml_header(style, names, root, head, rows, prologue,
                                encoding)
    row_template = _build_cols(style, names, rows, True)
    closing = _build_xml_footer(style, epilogue, rows, root)

    totext(remaining, source=target, encoding=encoding, errors='strict',
           template=row_template, prologue=opening, epilogue=closing)


def _build_xml_header(style, props, root, head, rows, prologue, encoding): tab = _build_nesting(root, False, None) if root else '' nested = -1 if style in ('attribute', 'name') else -2 if head: th1 = _build_nesting(head, False, nested) col = _build_cols(style, props, head, False) th2 = _build_nesting(head, True, nested) thd = '{0}\n{1}{2}'.format(th1, col, th2) else: thd = '' tbd = _build_nesting(rows, False, nested) if prologue and prologue.startswith('<?xml'): thb = '{0}{1}{2}\n'.format(tab, thd, tbd) return prologue + thb enc = encoding.upper() if encoding else 'UTF-8' xml = '<?xml version="1.0" encoding="%s"?>' % enc pre = prologue + '\n' if prologue and not root else '' pos = '\n' + prologue if prologue and root else '' res = '{0}\n{1}{2}{3}{4}{5}\n'.format(xml, pre, tab, thd, tbd, pos) return res def _build_xml_footer(style, epilogue, rows, root): nested = -1 if style in ('attribute', 'name') else -2 tbd = _build_nesting(rows, True, nested) tab = _build_nesting(root, True, 0) pre = epilogue + '\n' if epilogue and root else '' pos = '\n' + epilogue if epilogue and not root else '' return pre + tbd + tab + pos def _build_nesting(path, closing, index): if not path: return '' fmt = '</%s>' if closing else '<%s>' if '/' not in path: return fmt % path parts = path.split('/') elements = parts[0:index] if index else parts if closing: elements.reverse() tags = [fmt % e for e in elements] return ''.join(tags) def _build_cols(style, props, path, is_value): is_header = not is_value if style == 'tag' or is_header: return _build_cols_inline(props, path, is_value, True) if style == 'name': return _build_cols_inline(props, path, is_value, False) if style == 'attribute': return _build_cols_attribs(props, path) return style # custom def _build_cols_inline(props, path, is_value, use_tag): parts = path.split('/') if use_tag: if len(parts) < 2: raise ValueError("Tag not in format 'row/col': %s" % path) col = parts[-1] row = parts[-2:-1][0] else: col = '{0}' row = parts[-1] fld = 
'{{{0}}}' if is_value else '{0}' fmt = '<{0}>{1}</{0}>'.format(col, fld) cols = [fmt.format(e) for e in props] tags = ''.join(cols) res = ' <{0}>{1}</{0}>\n'.format(row, tags) return res def _build_cols_attribs(props, path): parts = path.split('/') row = parts[-1] fmt = '{0}="{{{0}}}"' cols = [fmt.format(e) for e in props] atts = ' '.join(cols) res = ' <{0} {1} />\n'.format(row, atts) return res