Skip to content

Commit 984091a

Browse files
committed
Add epoch_unit to analyzer
Also store the offset and scale directly rather than the datetime-based epoch. This is slightly more efficient: the offset and scale are used directly in conversions, whereas the epoch and unit (from which the offset and scale are derived) are only needed for display purposes, which is essentially a one-off during structa's runtime.
1 parent b641325 commit 984091a

2 files changed

Lines changed: 43 additions & 40 deletions

File tree

structa/analyzer.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,12 @@ class Analyzer:
161161
:type epoch: datetime.datetime or None
162162
:param epoch:
163163
The epoch to use when converting numbers to datetime values. Defaults
164-
to the UNIX epoch (1st January, 1970)
164+
to the UNIX epoch (1st January, 1970).
165+
166+
:type epoch_unit: datetime.timedelta or None
167+
:param epoch_unit:
168+
The unit of time used in numeric representations of datetime values.
169+
Defaults to 1 second.
165170
166171
:type progress: object or None
167172
:param progress:
@@ -173,8 +178,11 @@ def __init__(self, *, bad_threshold=Fraction(2, 100),
173178
empty_threshold=Fraction(98, 100),
174179
null_threshold=Fraction(98, 100), field_threshold=20,
175180
merge_threshold=Fraction(50, 100), max_numeric_len=30,
176-
strip_whitespace=False, min_timestamp=None,
177-
max_timestamp=None, epoch=None, progress=None):
181+
strip_whitespace=False,
182+
min_timestamp=None, max_timestamp=None,
183+
epoch=datetime.utcfromtimestamp(0),
184+
epoch_unit=timedelta(seconds=1),
185+
progress=None):
178186
self.bad_threshold = bad_threshold
179187
self.empty_threshold = empty_threshold
180188
self.null_threshold = null_threshold
@@ -183,15 +191,19 @@ def __init__(self, *, bad_threshold=Fraction(2, 100),
183191
self.max_numeric_len = max_numeric_len
184192
self.strip_whitespace = strip_whitespace
185193
unix_epoch = datetime.utcfromtimestamp(0)
186-
self.epoch = unix_epoch if epoch is None else epoch
187-
offset = (unix_epoch - self.epoch).total_seconds() / 86400
194+
self.timestamp_offset = (epoch - unix_epoch).total_seconds()
195+
self.timestamp_scale = epoch_unit.total_seconds()
188196
now = datetime.now()
189197
if min_timestamp is None:
190198
min_timestamp = now - relativedelta(years=20)
191199
if max_timestamp is None:
192200
max_timestamp = now + relativedelta(years=10)
193-
self.min_timestamp = min_timestamp.timestamp() + offset
194-
self.max_timestamp = max_timestamp.timestamp() + offset
201+
self.min_timestamp = (
202+
min_timestamp.timestamp() - self.timestamp_offset
203+
) / self.timestamp_scale
204+
self.max_timestamp = (
205+
max_timestamp.timestamp() - self.timestamp_offset
206+
) / self.timestamp_scale
195207
self._progress = progress
196208

197209
@property
@@ -740,7 +752,8 @@ def _match_possible_datetime(self, pattern):
740752
isinstance(pattern, (Int, Float)) and
741753
in_range(pattern.values.min) and
742754
in_range(pattern.values.max)):
743-
return DateTime.from_numbers(pattern, epoch=self.epoch)
755+
return DateTime.from_numbers(pattern, offset=self.timestamp_offset,
756+
scale=self.timestamp_scale)
744757
elif (
745758
isinstance(pattern, StrRepr) and (
746759
(
@@ -751,6 +764,7 @@ def _match_possible_datetime(self, pattern):
751764
) and
752765
in_range(pattern.content.values.min) and
753766
in_range(pattern.content.values.max)):
754-
return DateTime.from_numbers(pattern, epoch=self.epoch)
767+
return DateTime.from_numbers(pattern, offset=self.timestamp_offset,
768+
scale=self.timestamp_scale)
755769
else:
756770
return pattern

structa/types.py

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,21 @@
77
import math
88
from copy import copy
99
from numbers import Real
10+
from datetime import datetime
1011
from textwrap import indent, shorten
11-
from datetime import datetime, timedelta
1212
from functools import partial, total_ordering
1313
from collections.abc import Mapping
1414
from operator import attrgetter
1515

1616
from .collections import Counter, FrozenCounter
1717
from .conversions import try_conversion, parse_bool
18-
from .format import format_int, format_repr, format_sample
1918
from .xml import ElementFactory, xml, merge_siblings
19+
from .format import (
20+
format_int,
21+
format_repr,
22+
format_sample,
23+
format_timestamp_numrepr,
24+
)
2025

2126

2227
tag = ElementFactory()
@@ -905,14 +910,15 @@ def from_strings(cls, iterable, pattern, bad_threshold=0):
905910
pattern=pattern)
906911

907912
@classmethod
908-
def from_numbers(cls, pattern, epoch=datetime.utcfromtimestamp(0),
909-
unit=timedelta(seconds=1)):
913+
def from_numbers(cls, pattern, offset=0, scale=1):
910914
"""
911915
Class method for constructing an instance wrapped in a :class:`NumRepr`
912916
to indicate a numeric representation of a set of timestamps (e.g. day
913-
offset from the UNIX epoch; a different *epoch* may be specified as
914-
a :class:`~datetime.datetime`, and a different *unit* as a
915-
:class:`~datetime.timedelta`, which defaults to 1 second).
917+
offset from the UNIX epoch). A different epoch may be specified as a
918+
numeric *offset*, and a different unit *scale* as a number of
919+
seconds. The default offset and scale are 0 and 1, which is equivalent
920+
to a seconds offset from the UNIX epoch (i.e. a traditional UNIX
921+
timestamp).
916922
917923
Constructed with a *sample* of numbers, a *pattern* (which can be a
918924
:class:`StrRepr` instance if the numbers are themselves represented as
@@ -925,9 +931,6 @@ def from_numbers(cls, pattern, epoch=datetime.utcfromtimestamp(0),
925931
else:
926932
num_pattern = pattern
927933
dt_counter = Counter()
928-
unix_epoch = datetime.utcfromtimestamp(0)
929-
offset = (epoch - unix_epoch).total_seconds()
930-
scale = unit.total_seconds()
931934
for value, count in num_pattern.values.sample.items():
932935
dt_value = datetime.utcfromtimestamp((value * scale) + offset)
933936
dt_counter[dt_value] = count
@@ -1208,27 +1211,13 @@ class NumRepr(Repr):
12081211

12091212
def __str__(self):
12101213
type_, scale, offset = self.pattern
1211-
delta = timedelta(seconds=scale)
1212-
unit = ', '.join(
1213-
'{count}{prop}'.format(
1214-
count='{value}*'.format(value=value) if value != 1 else '',
1215-
prop=prop)
1216-
for prop in ('days', 'seconds', 'microseconds')
1217-
for value in (getattr(delta, prop),)
1218-
if value
1219-
)
1220-
if not offset % 86400:
1221-
epoch = datetime.utcfromtimestamp(offset).date().isoformat()
1222-
else:
1223-
epoch = datetime.utcfromtimestamp(offset).isoformat()
1224-
if type_ is Int:
1225-
template = 'int {unit} after {epoch} of {self.content}'
1226-
elif type_ is Float:
1227-
template = 'float {unit} after {epoch} of {self.content}'
1228-
else:
1229-
assert False, 'str(num-repr) of {self.content!r}'.format(
1230-
self=self, unit=unit, epoch=epoch)
1231-
return template.format(self=self, unit=unit, epoch=epoch)
1214+
try:
1215+
type_name = {Int: 'int', Float: 'float'}[type_]
1216+
except KeyError:
1217+
assert False, 'str(num-repr) of {self.content!r}'.format(self=self)
1218+
return '{type_name} {numrepr} of {self.content}'.format(
1219+
self=self, type_name=type_name,
1220+
numrepr=format_timestamp_numrepr(offset, scale))
12321221

12331222
def __xml__(self):
12341223
type_, scale, offset = self.pattern

0 commit comments

Comments
 (0)