# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
# standard library dependencies
import io
import json
from json.encoder import JSONEncoder
from petl.compat import PY2
from petl.io.sources import read_source_from_arg, write_source_from_arg
# internal dependencies
from petl.util.base import data, Table, dicts as _dicts, iterpeek
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' |   2 |
        +-----+-----+

    Setting argument `lines` to `True` will enable to
    infer the document as a JSON lines document. For more details about JSON
    lines please visit https://jsonlines.org/.

        >>> import petl as etl
        >>> data_with_jlines = '''{"name": "Gilbert", "wins": [["straight", "7S"], ["one pair", "10H"]]}
        ... {"name": "Alexa", "wins": [["two pair", "4S"], ["two pair", "9S"]]}
        ... {"name": "May", "wins": []}
        ... {"name": "Deloise", "wins": [["three of a kind", "5S"]]}'''
        ...
        >>> with open('example2.json', 'w') as f:
        ...     f.write(data_with_jlines)
        ...
        223
        >>> table2 = etl.fromjson('example2.json', lines=True)
        >>> table2
        +-----------+-------------------------------------------+
        | name      | wins                                      |
        +===========+===========================================+
        | 'Gilbert' | [['straight', '7S'], ['one pair', '10H']] |
        +-----------+-------------------------------------------+
        | 'Alexa'   | [['two pair', '4S'], ['two pair', '9S']]  |
        +-----------+-------------------------------------------+
        | 'May'     | []                                        |
        +-----------+-------------------------------------------+
        | 'Deloise' | [['three of a kind', '5S']]               |
        +-----------+-------------------------------------------+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see also
    :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0

        If no `header` is specified, fields will be discovered by sampling
        keys from the first `sample` objects in `source`. The header will be
        constructed from keys in the order discovered. Note that this
        ordering may not be stable, and therefore it may be advisable to
        specify an explicit `header` or to use another function like
        :func:`petl.transform.headers.sortheader` on the resulting table to
        guarantee stability.

    """
    source = read_source_from_arg(source)
    return JsonView(source, *args, **kwargs)
class JsonView(Table):
    """Lazy table view over a JSON document held in a petl read source.

    The document is either a top-level JSON array of objects, or (when
    ``lines=True``) a JSON lines stream with one object per line.
    """

    def __init__(self, source, *args, **kwargs):
        self.source = source
        # view-specific options are stripped from kwargs; whatever remains
        # is forwarded verbatim to json.load()
        self.missing = kwargs.pop('missing', None)
        self.header = kwargs.pop('header', None)
        self.sample = kwargs.pop('sample', 1000)
        self.lines = kwargs.pop('lines', False)
        self.args = args
        self.kwargs = kwargs

    def __iter__(self):
        with self.source.open('rb') as stream:
            if not PY2:
                # wrap the binary buffer so json sees UTF-8 text
                stream = io.TextIOWrapper(stream, encoding='utf-8',
                                          newline='', write_through=True)
            try:
                if self.lines:
                    rows = iterjlines(stream, self.header, self.missing)
                else:
                    obj = json.load(stream, *self.args, **self.kwargs)
                    rows = iterdicts(obj, self.header, self.sample,
                                     self.missing)
                for row in rows:
                    yield row
            finally:
                if not PY2:
                    # detach the wrapper so the underlying buffer is left
                    # for the context manager to close
                    stream.detach()
def fromdicts(dicts, header=None, sample=1000, missing=None):
    """
    View a sequence of Python :class:`dict` as a table. E.g.::

        >>> import petl as etl
        >>> dicts = [{"foo": "a", "bar": 1},
        ...          {"foo": "b", "bar": 2},
        ...          {"foo": "c", "bar": 2}]
        >>> table1 = etl.fromdicts(dicts, header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' |   1 |
        +-----+-----+
        | 'b' |   2 |
        +-----+-----+
        | 'c' |   2 |
        +-----+-----+

    If `header` is not specified, `sample` items from `dicts` will be
    inspected to discovery dictionary keys. Note that the order in which
    dictionary keys are discovered may not be stable,

    See also :func:`petl.io.json.fromjson`.

    .. versionchanged:: 1.1.0

        If no `header` is specified, fields will be discovered by sampling
        keys from the first `sample` dictionaries in `dicts`. The header
        will be constructed from keys in the order discovered. Note that
        this ordering may not be stable, and therefore it may be advisable
        to specify an explicit `header` or to use another function like
        :func:`petl.transform.headers.sortheader` on the resulting table to
        guarantee stability.

    """
    return DictsView(dicts, header=header, sample=sample, missing=missing)
class DictsView(Table):
    """Lazy table view over an in-memory sequence of dictionaries."""

    def __init__(self, dicts, header=None, sample=1000, missing=None):
        # stash everything; all work is deferred to iteration time
        self.dicts, self.header = dicts, header
        self.sample, self.missing = sample, missing

    def __iter__(self):
        return iterdicts(self.dicts, self.header, self.sample, self.missing)
def iterjlines(f, header, missing):
    """Yield a header row, then one data row per line of a JSON lines
    stream `f`. Fields absent from a line are filled with `missing`."""
    lines = iter(f)
    if header is None:
        # discover field names by peeking at the first line only
        header = list()
        first, lines = iterpeek(lines, 1)
        obj = json.loads(first)
        if hasattr(obj, 'keys'):
            header += [k for k in obj.keys() if k not in header]
    yield tuple(header)
    for line in lines:
        obj = json.loads(line)
        yield tuple(obj[fld] if fld in obj else missing for fld in header)
def iterdicts(dicts, header, sample, missing):
    """Yield a header row, then one data row per dictionary in `dicts`.
    Keys absent from a dictionary are filled with `missing`."""
    source = iter(dicts)
    if header is None:
        # sample the leading items and collect keys in first-seen order
        found = []
        sampled, source = iterpeek(source, sample)
        for item in sampled:
            if hasattr(item, 'keys'):
                for key in item.keys():
                    if key not in found:
                        found.append(key)
        header = found
    yield tuple(header)
    # emit data rows, padding absent fields with the missing value
    for item in source:
        yield tuple(item[name] if name in item else missing
                    for name in header)
def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs):
    """
    Write a table in JSON format, with rows output as JSON objects. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> etl.tojson(table1, 'example.json', sort_keys=True)
        >>> # check what it did
        ... print(open('example.json').read())
        [{"bar": 1, "foo": "a"}, {"bar": 2, "foo": "b"}, {"bar": 2, "foo": "c"}]

    Note that this is currently not streaming, all data is loaded into memory
    before being written to the file.

    """
    # materialise the whole table as a list of dicts (not streaming)
    obj = list(_dicts(table))
    _writejson(source, obj, prefix, suffix, *args, **kwargs)


Table.tojson = tojson
def tojsonarrays(table, source=None, prefix=None, suffix=None,
                 output_header=False, *args, **kwargs):
    """
    Write a table in JSON format, with rows output as JSON arrays. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> etl.tojsonarrays(table1, 'example.json')
        >>> # check what it did
        ... print(open('example.json').read())
        [["a", 1], ["b", 2], ["c", 2]]

    Note that this is currently not streaming, all data is loaded into memory
    before being written to the file.

    """
    # materialise the rows (not streaming); optionally keep the header row
    if output_header:
        obj = list(table)
    else:
        obj = list(data(table))
    _writejson(source, obj, prefix, suffix, *args, **kwargs)


Table.tojsonarrays = tojsonarrays
def _writejson(source, obj, prefix, suffix, *args, **kwargs):
    """Encode `obj` as JSON and write it to the given petl write source,
    bracketed by optional `prefix`/`suffix` strings. Extra arguments are
    passed through to :class:`json.encoder.JSONEncoder`."""
    encoder = JSONEncoder(*args, **kwargs)
    target = write_source_from_arg(source)
    with target.open('wb') as f:
        if PY2:
            # Python 2: write str chunks straight to the binary buffer
            _writeobj(encoder, obj, f, prefix, suffix)
        else:
            # Python 3: wrap the binary buffer for text output
            wrapper = io.TextIOWrapper(f, encoding='utf-8', newline='',
                                       write_through=True)
            try:
                _writeobj(encoder, obj, wrapper, prefix, suffix)
            finally:
                # detach so the context manager closes the raw buffer once
                wrapper.detach()
def _writeobj(encoder, obj, f, prefix, suffix):
if prefix is not None:
f.write(prefix)
for chunk in encoder.iterencode(obj):
f.write(chunk)
if suffix is not None:
f.write(suffix)