Source code for petl.io.json

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division

# standard library dependencies
import io
import json
from json.encoder import JSONEncoder

from petl.compat import PY2
from petl.io.sources import read_source_from_arg, write_source_from_arg
# internal dependencies
from petl.util.base import data, Table, dicts as _dicts, iterpeek


[docs]def fromjson(source, *args, **kwargs): """ Extract data from a JSON file. The file must contain a JSON array as the top level object, and each member of the array will be treated as a row of data. E.g.:: >>> import petl as etl >>> data = ''' ... [{"foo": "a", "bar": 1}, ... {"foo": "b", "bar": 2}, ... {"foo": "c", "bar": 2}] ... ''' >>> with open('example.file1.json', 'w') as f: ... f.write(data) ... 74 >>> table1 = etl.fromjson('example.file1.json', header=['foo', 'bar']) >>> table1 +-----+-----+ | foo | bar | +=====+=====+ | 'a' | 1 | +-----+-----+ | 'b' | 2 | +-----+-----+ | 'c' | 2 | +-----+-----+ Setting argument `lines` to `True` will enable to infer the document as a JSON lines document. For more details about JSON lines please visit https://jsonlines.org/. >>> import petl as etl >>> data_with_jlines = '''{"name": "Gilbert", "wins": [["straight", "7S"], ["one pair", "10H"]]} ... {"name": "Alexa", "wins": [["two pair", "4S"], ["two pair", "9S"]]} ... {"name": "May", "wins": []} ... {"name": "Deloise", "wins": [["three of a kind", "5S"]]}''' ... >>> with open('example.file2.json', 'w') as f: ... f.write(data_with_jlines) ... 223 >>> table2 = etl.fromjson('example.file2.json', lines=True) >>> table2 +-----------+-------------------------------------------+ | name | wins | +===========+===========================================+ | 'Gilbert' | [['straight', '7S'], ['one pair', '10H']] | +-----------+-------------------------------------------+ | 'Alexa' | [['two pair', '4S'], ['two pair', '9S']] | +-----------+-------------------------------------------+ | 'May' | [] | +-----------+-------------------------------------------+ | 'Deloise' | [['three of a kind', '5S']] | +-----------+-------------------------------------------+ If your JSON file does not fit this structure, you will need to parse it via :func:`json.load` and select the array to treat as the data, see also :func:`petl.io.json.fromdicts`. .. versionchanged:: 1.1.0 If no `header` is specified, fields will be discovered by sampling keys from the first `sample` objects in `source`. The header will be constructed from keys in the order discovered. Note that this ordering may not be stable, and therefore it may be advisable to specify an explicit `header` or to use another function like :func:`petl.transform.headers.sortheader` on the resulting table to guarantee stability. """ source = read_source_from_arg(source) return JsonView(source, *args, **kwargs)
class JsonView(Table): def __init__(self, source, *args, **kwargs): self.source = source self.missing = kwargs.pop('missing', None) self.header = kwargs.pop('header', None) self.sample = kwargs.pop('sample', 1000) self.lines = kwargs.pop('lines', False) self.args = args self.kwargs = kwargs def __iter__(self): with self.source.open('rb') as f: if not PY2: # wrap buffer for text IO f = io.TextIOWrapper(f, encoding='utf-8', newline='', write_through=True) try: if self.lines: for row in iterjlines(f, self.header, self.missing): yield row else: dicts = json.load(f, *self.args, **self.kwargs) for row in iterdicts(dicts, self.header, self.sample, self.missing): yield row finally: if not PY2: f.detach()
[docs]def fromdicts(dicts, header=None, sample=1000, missing=None): """ View a sequence of Python :class:`dict` as a table. E.g.:: >>> import petl as etl >>> dicts = [{"foo": "a", "bar": 1}, ... {"foo": "b", "bar": 2}, ... {"foo": "c", "bar": 2}] >>> table1 = etl.fromdicts(dicts, header=['foo', 'bar']) >>> table1 +-----+-----+ | foo | bar | +=====+=====+ | 'a' | 1 | +-----+-----+ | 'b' | 2 | +-----+-----+ | 'c' | 2 | +-----+-----+ If `header` is not specified, `sample` items from `dicts` will be inspected to discovery dictionary keys. Note that the order in which dictionary keys are discovered may not be stable, See also :func:`petl.io.json.fromjson`. .. versionchanged:: 1.1.0 If no `header` is specified, fields will be discovered by sampling keys from the first `sample` dictionaries in `dicts`. The header will be constructed from keys in the order discovered. Note that this ordering may not be stable, and therefore it may be advisable to specify an explicit `header` or to use another function like :func:`petl.transform.headers.sortheader` on the resulting table to guarantee stability. """ return DictsView(dicts, header=header, sample=sample, missing=missing)
class DictsView(Table): def __init__(self, dicts, header=None, sample=1000, missing=None): self.dicts = dicts self.header = header self.sample = sample self.missing = missing def __iter__(self): return iterdicts(self.dicts, self.header, self.sample, self.missing) def iterjlines(f, header, missing): it = iter(f) if header is None: header = list() peek, it = iterpeek(it, 1) json_obj = json.loads(peek) if hasattr(json_obj, 'keys'): header += [k for k in json_obj.keys() if k not in header] yield tuple(header) for o in it: json_obj = json.loads(o) yield tuple(json_obj[f] if f in json_obj else missing for f in header) def iterdicts(dicts, header, sample, missing): it = iter(dicts) # determine header row if header is None: # discover fields header = list() peek, it = iterpeek(it, sample) for o in peek: if hasattr(o, 'keys'): header += [k for k in o.keys() if k not in header] yield tuple(header) # generate data rows for o in it: yield tuple(o[f] if f in o else missing for f in header)
[docs]def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs): """ Write a table in JSON format, with rows output as JSON objects. E.g.:: >>> import petl as etl >>> table1 = [['foo', 'bar'], ... ['a', 1], ... ['b', 2], ... ['c', 2]] >>> etl.tojson(table1, 'example.file3.json', sort_keys=True) >>> # check what it did ... print(open('example.file3.json').read()) [{"bar": 1, "foo": "a"}, {"bar": 2, "foo": "b"}, {"bar": 2, "foo": "c"}] Note that this is currently not streaming, all data is loaded into memory before being written to the file. """ obj = list(_dicts(table)) _writejson(source, obj, prefix, suffix, *args, **kwargs)
Table.tojson = tojson
[docs]def tojsonarrays(table, source=None, prefix=None, suffix=None, output_header=False, *args, **kwargs): """ Write a table in JSON format, with rows output as JSON arrays. E.g.:: >>> import petl as etl >>> table1 = [['foo', 'bar'], ... ['a', 1], ... ['b', 2], ... ['c', 2]] >>> etl.tojsonarrays(table1, 'example.file4.json') >>> # check what it did ... print(open('example.file4.json').read()) [["a", 1], ["b", 2], ["c", 2]] Note that this is currently not streaming, all data is loaded into memory before being written to the file. """ if output_header: obj = list(table) else: obj = list(data(table)) _writejson(source, obj, prefix, suffix, *args, **kwargs)
Table.tojsonarrays = tojsonarrays def _writejson(source, obj, prefix, suffix, *args, **kwargs): encoder = JSONEncoder(*args, **kwargs) source = write_source_from_arg(source) with source.open('wb') as f: if PY2: # write directly to buffer _writeobj(encoder, obj, f, prefix, suffix) else: # wrap buffer for text IO f = io.TextIOWrapper(f, encoding='utf-8', newline='', write_through=True) try: _writeobj(encoder, obj, f, prefix, suffix) finally: f.detach() def _writeobj(encoder, obj, f, prefix, suffix): if prefix is not None: f.write(prefix) for chunk in encoder.iterencode(obj): f.write(chunk) if suffix is not None: f.write(suffix)