Source code for petl.util.statistics

from __future__ import absolute_import, print_function, division


from collections import namedtuple


from petl.util.base import values, Table


[docs]def limits(table, field): """ Find minimum and maximum values under the given field. E.g.:: >>> import petl as etl >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]] >>> minv, maxv = etl.limits(table, 'bar') >>> minv 1 >>> maxv 3 The `field` argument can be a field name or index (starting from zero). """ vals = iter(values(table, field)) try: minv = maxv = next(vals) except StopIteration: return None, None else: for v in vals: if v < minv: minv = v if v > maxv: maxv = v return minv, maxv
Table.limits = limits _stats = namedtuple('stats', ('count', 'errors', 'sum', 'min', 'max', 'mean', 'pvariance', 'pstdev'))
[docs]def stats(table, field): """ Calculate basic descriptive statistics on a given field. E.g.:: >>> import petl as etl >>> table = [['foo', 'bar', 'baz'], ... ['A', 1, 2], ... ['B', '2', '3.4'], ... [u'B', u'3', u'7.8', True], ... ['D', 'xyz', 9.0], ... ['E', None]] >>> etl.stats(table, 'bar') stats(count=3, errors=2, sum=6.0, min=1.0, max=3.0, mean=2.0, pvariance=0.6666666666666666, pstdev=0.816496580927726) The `field` argument can be a field name or index (starting from zero). """ _min = None _max = None _sum = 0 _mean = 0 _var = 0 _count = 0 _errors = 0 for v in values(table, field): try: v = float(v) except (ValueError, TypeError): _errors += 1 else: _count += 1 if _min is None or v < _min: _min = v if _max is None or v > _max: _max = v _sum += v _mean, _var = onlinestats(v, _count, mean=_mean, variance=_var) _std = _var**.5 return _stats(_count, _errors, _sum, _min, _max, _mean, _var, _std)
Table.stats = stats def onlinestats(xi, n, mean=0, variance=0): # function to calculate online mean and variance meanprv = mean varianceprv = variance mean = (((n - 1)*meanprv) + xi)/n variance = (((n - 1)*varianceprv) + ((xi - meanprv)*(xi - mean)))/n return mean, variance