Source code for petl.util.timing

from __future__ import absolute_import, print_function, division


import sys
import time


from petl.util.base import Table
from petl.util.statistics import onlinestats


def progress(table, batchsize=1000, prefix="", out=sys.stderr):
    """
    Report progress on rows passing through. E.g.::

        >>> import petl as etl
        >>> table = etl.dummytable(100000)
        >>> table.progress(10000).tocsv('example.csv')
        10000 rows in 0.13s (78363 row/s); batch in 0.13s (78363 row/s)
        20000 rows in 0.22s (91679 row/s); batch in 0.09s (110448 row/s)
        30000 rows in 0.31s (96573 row/s); batch in 0.09s (108114 row/s)
        40000 rows in 0.40s (99535 row/s); batch in 0.09s (109625 row/s)
        50000 rows in 0.49s (101396 row/s); batch in 0.09s (109591 row/s)
        60000 rows in 0.59s (102245 row/s); batch in 0.09s (106709 row/s)
        70000 rows in 0.68s (103221 row/s); batch in 0.09s (109498 row/s)
        80000 rows in 0.77s (103810 row/s); batch in 0.09s (108126 row/s)
        90000 rows in 0.90s (99465 row/s); batch in 0.13s (74516 row/s)
        100000 rows in 1.02s (98409 row/s); batch in 0.11s (89821 row/s)
        100000 rows in 1.02s (98402 row/s); batches in 0.10 +/- 0.02s [0.09-0.13] (100481 +/- 13340 rows/s [74516-110448])

    See also :func:`petl.util.timing.clock`.

    """
    return ProgressView(table, batchsize, prefix, out)
Table.progress = progress


class ProgressView(Table):

    def __init__(self, inner, batchsize, prefix, out):
        self.inner = inner
        self.batchsize = batchsize
        self.prefix = prefix
        self.out = out

    def __iter__(self):
        start = time.time()
        batchstart = start
        batchn = 0
        batchtimemin, batchtimemax = None, None
        batchtimemean, batchtimevar = 0, 0
        batchratemean, batchratevar = 0, 0
        for n, r in enumerate(self.inner):
            if n % self.batchsize == 0 and n > 0:
                batchn += 1
                batchend = time.time()
                batchtime = batchend - batchstart
                if batchtimemin is None or batchtime < batchtimemin:
                    batchtimemin = batchtime
                if batchtimemax is None or batchtime > batchtimemax:
                    batchtimemax = batchtime
                elapsedtime = batchend - start
                try:
                    rate = int(n / elapsedtime)
                except ZeroDivisionError:
                    rate = 0
                try:
                    batchrate = int(self.batchsize / batchtime)
                except ZeroDivisionError:
                    batchrate = 0
                v = (n, elapsedtime, rate, batchtime, batchrate)
                message = self.prefix + \
                    '%s rows in %.2fs (%s row/s); ' \
                    'batch in %.2fs (%s row/s)' % v
                print(message, file=self.out)
                if hasattr(self.out, 'flush'):
                    self.out.flush()
                batchstart = batchend
                batchtimemean, batchtimevar = \
                    onlinestats(batchtime, batchn, mean=batchtimemean,
                                variance=batchtimevar)
                batchratemean, batchratevar = \
                    onlinestats(batchrate, batchn, mean=batchratemean,
                                variance=batchratevar)
            yield r

        # compute total elapsed time and rate
        end = time.time()
        elapsedtime = end - start
        try:
            rate = int(n / elapsedtime)
        except ZeroDivisionError:
            rate = 0

        # construct the final message
        if batchn > 1:
            if batchtimemin is None:
                batchtimemin = 0
            if batchtimemax is None:
                batchtimemax = 0
            try:
                batchratemin = int(self.batchsize / batchtimemax)
            except ZeroDivisionError:
                batchratemin = 0
            try:
                batchratemax = int(self.batchsize / batchtimemin)
            except ZeroDivisionError:
                batchratemax = 0
            v = (n, elapsedtime, rate,
                 batchtimemean, batchtimevar**.5, batchtimemin, batchtimemax,
                 int(batchratemean), int(batchratevar**.5),
                 int(batchratemin), int(batchratemax))
            message = self.prefix + '%s rows in %.2fs (%s row/s); batches in ' \
                                    '%.2f +/- %.2fs [%.2f-%.2f] ' \
                                    '(%s +/- %s rows/s [%s-%s])' % v
        else:
            v = (n, elapsedtime, rate)
            message = self.prefix + '%s rows in %.2fs (%s row/s)' % v
        print(message, file=self.out)
        if hasattr(self.out, 'flush'):
            self.out.flush()
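# The batch statistics above are accumulated incrementally via
# petl.util.statistics.onlinestats rather than by storing every batch time.
# The sketch below is purely illustrative of that kind of single-pass
# (Welford-style) mean/variance update, consistent with how onlinestats is
# called here; it is not the actual petl implementation. It assumes `n` is
# the 1-based index of the new observation and tracks the population variance.

def _onlinestats_sketch(xi, n, mean=0, variance=0):
    # fold the n-th observation xi into a running mean and variance
    delta = xi - mean
    mean = mean + delta / n
    variance = ((n - 1) * variance + delta * (xi - mean)) / n
    return mean, variance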
def clock(table):
    """
    Time how long is spent retrieving rows from the wrapped container. Enables
    diagnosis of which steps in a pipeline are taking the most time. E.g.::

        >>> import petl as etl
        >>> t1 = etl.dummytable(100000)
        >>> c1 = etl.clock(t1)
        >>> t2 = etl.convert(c1, 'foo', lambda v: v**2)
        >>> c2 = etl.clock(t2)
        >>> p = etl.progress(c2, 10000)
        >>> etl.tocsv(p, 'example.csv')
        10000 rows in 0.23s (44036 row/s); batch in 0.23s (44036 row/s)
        20000 rows in 0.38s (52167 row/s); batch in 0.16s (63979 row/s)
        30000 rows in 0.54s (55749 row/s); batch in 0.15s (64624 row/s)
        40000 rows in 0.69s (57765 row/s); batch in 0.15s (64793 row/s)
        50000 rows in 0.85s (59031 row/s); batch in 0.15s (64707 row/s)
        60000 rows in 1.00s (59927 row/s); batch in 0.15s (64847 row/s)
        70000 rows in 1.16s (60483 row/s); batch in 0.16s (64051 row/s)
        80000 rows in 1.31s (61008 row/s); batch in 0.15s (64953 row/s)
        90000 rows in 1.47s (61356 row/s); batch in 0.16s (64285 row/s)
        100000 rows in 1.62s (61703 row/s); batch in 0.15s (65012 row/s)
        100000 rows in 1.62s (61700 row/s); batches in 0.16 +/- 0.02s [0.15-0.23] (62528 +/- 6173 rows/s [44036-65012])
        >>> # time consumed retrieving rows from t1
        ... c1.time
        0.7243089999999492
        >>> # time consumed retrieving rows from t2
        ... c2.time
        1.1704209999999766
        >>> # actual time consumed by the convert step
        ... c2.time - c1.time
        0.4461120000000274

    See also :func:`petl.util.timing.progress`.

    """
    return ClockView(table)
Table.clock = clock


class ClockView(Table):

    def __init__(self, wrapped):
        self.wrapped = wrapped

    def __iter__(self):
        self.time = 0
        it = iter(self.wrapped)
        while True:
            # use a high-resolution counter (time.clock() was removed in
            # Python 3.8)
            before = time.perf_counter()
            try:
                row = next(it)
            except StopIteration:
                # stop cleanly when the wrapped iterator is exhausted (PEP 479)
                return
            after = time.perf_counter()
            self.time += (after - before)
            yield row
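# A minimal usage sketch combining the two helpers above, assuming petl is
# installed; etl.dummytable, etl.convert and etl.nrows are standard petl
# functions, and the variable names are illustrative. Progress messages are
# routed to an in-memory buffer via the `out` parameter, and the clock's
# accumulated `.time` is only meaningful once the table has been fully
# consumed.

if __name__ == '__main__':
    import io
    import petl as etl

    buf = io.StringIO()
    source = etl.clock(etl.dummytable(50000))
    derived = etl.convert(source, 'foo', lambda v: v ** 2)
    # consume the pipeline, reporting progress every 10000 rows to `buf`
    etl.nrows(etl.progress(derived, 10000, prefix='example: ', out=buf))
    print(buf.getvalue())
    print('time spent reading the source table: %.3fs' % source.time)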