# Source code for petl.io.json

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division

# standard library dependencies
import io
import json
import inspect
from json.encoder import JSONEncoder
from os import unlink
from tempfile import NamedTemporaryFile

from petl.compat import PY2
from petl.compat import pickle
from petl.io.sources import read_source_from_arg, write_source_from_arg
# internal dependencies
from petl.util.base import data, Table, dicts as _dicts, iterpeek


def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.file1.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.file1.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | 1   |
        +-----+-----+
        | 'b' | 2   |
        +-----+-----+
        | 'c' | 2   |
        +-----+-----+

    Setting argument `lines` to `True` will enable to infer the document as a
    JSON lines document. For more details about JSON lines please visit
    https://jsonlines.org/.

        >>> import petl as etl
        >>> data_with_jlines = '''{"name": "Gilbert", "wins": [["straight", "7S"], ["one pair", "10H"]]}
        ... {"name": "Alexa", "wins": [["two pair", "4S"], ["two pair", "9S"]]}
        ... {"name": "May", "wins": []}
        ... {"name": "Deloise", "wins": [["three of a kind", "5S"]]}'''
        ...
        >>> with open('example.file2.json', 'w') as f:
        ...     f.write(data_with_jlines)
        ...
        223
        >>> table2 = etl.fromjson('example.file2.json', lines=True)
        >>> table2
        +-----------+-------------------------------------------+
        | name      | wins                                      |
        +===========+===========================================+
        | 'Gilbert' | [['straight', '7S'], ['one pair', '10H']] |
        +-----------+-------------------------------------------+
        | 'Alexa'   | [['two pair', '4S'], ['two pair', '9S']]  |
        +-----------+-------------------------------------------+
        | 'May'     | []                                        |
        +-----------+-------------------------------------------+
        | 'Deloise' | [['three of a kind', '5S']]               |
        +-----------+-------------------------------------------+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0

        If no `header` is specified, fields will be discovered by sampling
        keys from the first `sample` objects in `source`. The header will be
        constructed from keys in the order discovered. Note that this
        ordering may not be stable, and therefore it may be advisable to
        specify an explicit `header` or to use another function like
        :func:`petl.transform.headers.sortheader` on the resulting table to
        guarantee stability.

    """

    # resolve strings/None/etc. into a petl source object with .open()
    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
class JsonView(Table):
    """Table view over a JSON document provided by a petl source object."""

    def __init__(self, source, *args, **kwargs):
        self.source = source
        # petl-specific options are popped; everything else is forwarded
        # verbatim to json.load()
        self.missing = kwargs.pop('missing', None)
        self.header = kwargs.pop('header', None)
        self.sample = kwargs.pop('sample', 1000)
        self.lines = kwargs.pop('lines', False)
        self.args = args
        self.kwargs = kwargs

    def __iter__(self):
        with self.source.open('rb') as buf:
            if PY2:
                stream = buf
            else:
                # wrap the binary buffer for text IO
                stream = io.TextIOWrapper(buf, encoding='utf-8', newline='',
                                          write_through=True)
            try:
                if self.lines:
                    # JSON lines: one document per line
                    for row in iterjlines(stream, self.header, self.missing):
                        yield row
                else:
                    # whole file is a single JSON array of objects
                    dicts = json.load(stream, *self.args, **self.kwargs)
                    for row in iterdicts(dicts, self.header, self.sample,
                                         self.missing):
                        yield row
            finally:
                if not PY2:
                    # detach the wrapper so closing it does not close the
                    # underlying buffer owned by the source
                    stream.detach()
def fromdicts(dicts, header=None, sample=1000, missing=None):
    """
    View a sequence of Python :class:`dict` as a table. E.g.::

        >>> import petl as etl
        >>> dicts = [{"foo": "a", "bar": 1},
        ...          {"foo": "b", "bar": 2},
        ...          {"foo": "c", "bar": 2}]
        >>> table1 = etl.fromdicts(dicts, header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | 1   |
        +-----+-----+
        | 'b' | 2   |
        +-----+-----+
        | 'c' | 2   |
        +-----+-----+

    Argument `dicts` can also be a generator, the output of generator is
    iterated and cached using a temporary file to support further transforms
    and multiple passes of the table:

        >>> import petl as etl
        >>> dicts = ({"foo": chr(ord("a")+i), "bar":i+1} for i in range(3))
        >>> table1 = etl.fromdicts(dicts, header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | 1   |
        +-----+-----+
        | 'b' | 2   |
        +-----+-----+
        | 'c' | 3   |
        +-----+-----+

    If `header` is not specified, `sample` items from `dicts` will be
    inspected to discovery dictionary keys. Note that the order in which
    dictionary keys are discovered may not be stable,

    See also :func:`petl.io.json.fromjson`.

    .. versionchanged:: 1.1.0

        If no `header` is specified, fields will be discovered by sampling
        keys from the first `sample` dictionaries in `dicts`. The header
        will be constructed from keys in the order discovered. Note that
        this ordering may not be stable, and therefore it may be advisable
        to specify an explicit `header` or to use another function like
        :func:`petl.transform.headers.sortheader` on the resulting table to
        guarantee stability.

    .. versionchanged:: 1.7.5

        Full support of generators passed as `dicts` has been added,
        leveraging `itertools.tee`.

    .. versionchanged:: 1.7.11

        Generator support has been modified to use temporary file cache
        instead of `itertools.tee` due to high memory usage.

    """

    # a generator can only be consumed once, so it needs the caching view;
    # anything else is assumed to be re-iterable
    if inspect.isgenerator(dicts):
        return DictsGeneratorView(dicts, header=header, sample=sample,
                                  missing=missing)
    return DictsView(dicts, header=header, sample=sample, missing=missing)
class DictsView(Table):
    """Table view over a re-iterable sequence of dictionaries.

    Each dict supplies one data row; the header is either given explicitly
    or discovered by sampling keys from the first `sample` dicts.
    """

    def __init__(self, dicts, header=None, sample=1000, missing=None):
        self.dicts = dicts
        self._header = header
        self.sample = sample
        self.missing = missing

    def __iter__(self):
        return iterdicts(self.dicts, self._header, self.sample, self.missing)


class DictsGeneratorView(DictsView):
    """Table view over a generator of dictionaries.

    Rows drawn from the generator are pickled into a temporary file so the
    table can be iterated more than once even though the underlying
    generator can only be consumed once.
    """

    def __init__(self, dicts, header=None, sample=1000, missing=None):
        super(DictsGeneratorView, self).__init__(dicts, header, sample,
                                                 missing)
        # temp file caching rows already drawn from the generator
        self._filecache = None
        # byte offset of the end of the cached data in the cache file
        self._cached = 0

    def __iter__(self):
        if not self._header:
            self._determine_header()
        yield self._header
        if not self._filecache:
            # unbuffered binary temp file; delete=False because __del__
            # unlinks it explicitly
            if PY2:
                self._filecache = NamedTemporaryFile(delete=False, mode='wb+',
                                                     bufsize=0)
            else:
                self._filecache = NamedTemporaryFile(delete=False, mode='wb+',
                                                     buffering=0)

        position = 0
        it = iter(self.dicts)
        while True:
            if position < self._cached:
                # row was already cached by a previous pass - replay it
                # from the file cache instead of touching the generator
                self._filecache.seek(position)
                row = pickle.load(self._filecache)
                position = self._filecache.tell()
                yield row
                continue
            try:
                o = next(it)
            except StopIteration:
                break
            row = tuple(o.get(f, self.missing) for f in self._header)
            # append the fresh row to the cache before yielding it
            self._filecache.seek(self._cached)
            pickle.dump(row, self._filecache, protocol=-1)
            self._cached = position = self._filecache.tell()
            yield row

    def _determine_header(self):
        # sample up to self.sample items to discover the union of keys in
        # order of first appearance; the iterator returned by iterpeek
        # presumably still yields the sampled items - self.dicts is rebound
        # to it so they are not lost (TODO confirm iterpeek contract)
        it = iter(self.dicts)
        header = list()
        peek, it = iterpeek(it, self.sample)
        self.dicts = it
        if isinstance(peek, dict):
            # normalise: peek may be a single item rather than a list
            peek = [peek]
        for o in peek:
            if hasattr(o, 'keys'):
                header += [k for k in o.keys() if k not in header]
        self._header = tuple(header)
        return it

    def __del__(self):
        # best-effort cleanup of the temp cache file on garbage collection
        if self._filecache:
            self._filecache.close()
            unlink(self._filecache.name)


def iterjlines(f, header, missing):
    """Yield a header row then data rows from a JSON lines file object."""
    it = iter(f)

    if header is None:
        # discover fields from the first line only
        header = list()
        peek, it = iterpeek(it, 1)
        json_obj = json.loads(peek)
        if hasattr(json_obj, 'keys'):
            header += [k for k in json_obj.keys() if k not in header]
    yield tuple(header)

    for o in it:
        json_obj = json.loads(o)
        # missing keys are filled with the `missing` value
        yield tuple(json_obj[f] if f in json_obj else missing for f in header)


def iterdicts(dicts, header, sample, missing):
    """Yield a header row then data rows from an iterable of dicts."""
    it = iter(dicts)

    # determine header row
    if header is None:
        # discover fields by sampling keys in order of first appearance
        header = list()
        peek, it = iterpeek(it, sample)
        for o in peek:
            if hasattr(o, 'keys'):
                header += [k for k in o.keys() if k not in header]
    yield tuple(header)

    # generate data rows
    for o in it:
        yield tuple(o.get(f, missing) for f in header)
def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs):
    """
    Write a table in JSON format, with rows output as JSON objects. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> etl.tojson(table1, 'example.file3.json', sort_keys=True)
        >>> # check what it did
        ... print(open('example.file3.json').read())
        [{"bar": 1, "foo": "a"}, {"bar": 2, "foo": "b"}, {"bar": 2, "foo": "c"}]

    Setting argument `lines` to `True` will enable to infer the writing
    format as a JSON lines. For more details about JSON lines please visit
    https://jsonlines.org/.

        >>> import petl as etl
        >>> table1 = [['name', 'wins'],
        ...           ['Gilbert', [['straight', '7S'], ['one pair', '10H']]],
        ...           ['Alexa', [['two pair', '4S'], ['two pair', '9S']]],
        ...           ['May', []],
        ...           ['Deloise', [['three of a kind', '5S']]]]
        >>> etl.tojson(table1, 'example.file3.jsonl', lines=True, sort_keys=True)
        >>> # check what it did
        ... print(open('example.file3.jsonl').read())
        {"name": "Gilbert", "wins": [["straight", "7S"], ["one pair", "10H"]]}
        {"name": "Alexa", "wins": [["two pair", "4S"], ["two pair", "9S"]]}
        {"name": "May", "wins": []}
        {"name": "Deloise", "wins": [["three of a kind", "5S"]]}

    Note that this is currently not streaming, all data is loaded into memory
    before being written to the file.

    """

    # materialise all rows as dicts up front (not streaming)
    obj = list(_dicts(table))
    _writejson(source, obj, prefix, suffix, *args, **kwargs)
# expose tojson() as a method on Table so it can be called fluently
Table.tojson = tojson
def tojsonarrays(table, source=None, prefix=None, suffix=None,
                 output_header=False, *args, **kwargs):
    """
    Write a table in JSON format, with rows output as JSON arrays. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> etl.tojsonarrays(table1, 'example.file4.json')
        >>> # check what it did
        ... print(open('example.file4.json').read())
        [["a", 1], ["b", 2], ["c", 2]]

    Note that this is currently not streaming, all data is loaded into memory
    before being written to the file.

    """

    # materialise all rows up front (not streaming); keep the header row
    # only when explicitly requested
    rows = list(table if output_header else data(table))
    _writejson(source, rows, prefix, suffix, *args, **kwargs)
# expose tojsonarrays() as a method on Table so it can be called fluently
Table.tojsonarrays = tojsonarrays


def _writejson(source, obj, prefix, suffix, *args, **kwargs):
    """Encode `obj` as JSON and write it to `source`.

    Extra positional/keyword arguments are forwarded to
    :class:`json.encoder.JSONEncoder`; the petl-specific `lines` option is
    popped first.
    """
    lines = kwargs.pop('lines', False)
    encoder = JSONEncoder(*args, **kwargs)
    target = write_source_from_arg(source)
    with target.open('wb') as raw:
        if PY2:
            # write directly to buffer
            _writeobj(encoder, obj, raw, prefix, suffix, lines=lines)
        else:
            # wrap buffer for text IO
            txt = io.TextIOWrapper(raw, encoding='utf-8', newline='',
                                   write_through=True)
            try:
                _writeobj(encoder, obj, txt, prefix, suffix, lines=lines)
            finally:
                # detach so the underlying buffer stays open for the source
                txt.detach()


def _writeobj(encoder, obj, f, prefix, suffix, lines=False):
    """Stream encoded JSON chunks to the file-like object `f`.

    With `lines=True` each element of `obj` becomes one newline-terminated
    JSON document; otherwise `obj` is encoded as a single document.
    Optional `prefix`/`suffix` strings are written around the output.
    """
    write = f.write
    if prefix is not None:
        write(prefix)
    if lines:
        for rec in obj:
            for piece in encoder.iterencode(rec):
                write(piece)
            write('\n')
    else:
        for piece in encoder.iterencode(obj):
            write(piece)
    if suffix is not None:
        write(suffix)