Skip to content

Commit b07afa5

Browse files
committed
Merge branch 'var-date-time'
Fixes #11
2 parents ea07e9f + 351f989 commit b07afa5

15 files changed

Lines changed: 319 additions & 85 deletions

docs/images/types.png

-22 Bytes
Loading

docs/manual.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Synopsis
2020
[--hide-pattern] [--show-pattern]
2121
[--hide-range] [--show-range {hidden,limits,median,quartiles,graph}]
2222
[--hide-samples] [--show-samples]
23-
[--min-timestamp WHEN] [--max-timestamp WHEN]
23+
[--min-timestamp WHEN] [--max-timestamp WHEN] [--epoch WHEN]
2424
[--max-numeric-len LEN] [--sample-bytes SIZE]
2525
[--strip-whitespace] [--no-strip-whitespace]
2626
[--csv-format FIELD[QUOTE]] [--yaml-safe] [--no-yaml-safe]
@@ -145,6 +145,14 @@ Optional Arguments
145145
absolute timestamp (in ISO-8601 format) or a duration to be added to the
146146
current timestamp
147147

148+
.. option:: --epoch WHEN
149+
150+
The epoch from which datetimes are measured. Can be specified as an
151+
absolute timestamp (in ISO-8601 format: YYYY-mm-ddTHH:MM:SS), or one of the
152+
special strings, "unix" (which is equivalent to 1970-01-01) or
153+
"excel" (which is roughly equivalent to 1900-01-01, with some adjustments).
154+
The default is "unix".
155+
148156
.. option:: --max-numeric-len LEN
149157

150158
The maximum number of characters that a number, integer or floating-point,

docs/tutorial_basic.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -433,8 +433,9 @@ Date Handling
433433
The other important switches are those used in the detection of dates encoded
434434
as numbers: :option:`structa --min-timestamp` and :option:`structa
435435
--max-timestamp`. When dates are encoded as (potentially fractional)
436-
day-offsets from the UNIX epoch (the 1st January, 1970), how does structa
437-
determine that it's looking at a set of dates rather than a set of numbers?
436+
day-offsets from some :option:`structa --epoch` (which defaults to the UNIX
437+
epoch, i.e. the 1st January, 1970), how does structa determine that it's
438+
looking at a set of dates rather than a set of numbers?
438439

439440
In a typical set of (arbitrary) numbers, it's quite normal to find "0" or "1"
440441
commonly represented, or for the set of numbers to span over a large range

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ classifiers =
2525
Programming Language :: Python :: 3.8
2626
Programming Language :: Python :: 3.9
2727
Programming Language :: Python :: 3.10
28+
Programming Language :: Python :: Implementation :: PyPy
2829

2930
[options]
3031
packages = find:

structa/analyzer.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,16 @@ class Analyzer:
158158
The maximum timestamp to use when determining whether floating point
159159
values potentially represent epoch-based datetime values.
160160
161+
:type epoch: datetime.datetime or None
162+
:param epoch:
163+
The epoch to use when converting numbers to datetime values. Defaults
164+
to the UNIX epoch (1st January, 1970).
165+
166+
:type epoch_unit: datetime.timedelta or None
167+
:param epoch_unit:
168+
The unit of time used in numeric representations of datetime values.
169+
Defaults to 1 second.
170+
161171
:type progress: object or None
162172
:param progress:
163173
If specified, must be an object with ``update`` and ``reset`` methods
@@ -168,22 +178,32 @@ def __init__(self, *, bad_threshold=Fraction(2, 100),
168178
empty_threshold=Fraction(98, 100),
169179
null_threshold=Fraction(98, 100), field_threshold=20,
170180
merge_threshold=Fraction(50, 100), max_numeric_len=30,
171-
strip_whitespace=False, min_timestamp=None,
172-
max_timestamp=None, progress=None):
181+
strip_whitespace=False,
182+
min_timestamp=None, max_timestamp=None,
183+
epoch=datetime.utcfromtimestamp(0),
184+
epoch_unit=timedelta(seconds=1),
185+
progress=None):
173186
self.bad_threshold = bad_threshold
174187
self.empty_threshold = empty_threshold
175188
self.null_threshold = null_threshold
176189
self.field_threshold = field_threshold
177190
self.merge_threshold = merge_threshold
178191
self.max_numeric_len = max_numeric_len
179192
self.strip_whitespace = strip_whitespace
193+
unix_epoch = datetime.utcfromtimestamp(0)
194+
self.timestamp_offset = (epoch - unix_epoch).total_seconds()
195+
self.timestamp_scale = epoch_unit.total_seconds()
180196
now = datetime.now()
181197
if min_timestamp is None:
182198
min_timestamp = now - relativedelta(years=20)
183199
if max_timestamp is None:
184200
max_timestamp = now + relativedelta(years=10)
185-
self.min_timestamp = min_timestamp.timestamp()
186-
self.max_timestamp = max_timestamp.timestamp()
201+
self.min_timestamp = (
202+
min_timestamp.timestamp() - self.timestamp_offset
203+
) / self.timestamp_scale
204+
self.max_timestamp = (
205+
max_timestamp.timestamp() - self.timestamp_offset
206+
) / self.timestamp_scale
187207
self._progress = progress
188208

189209
@property
@@ -732,7 +752,8 @@ def _match_possible_datetime(self, pattern):
732752
isinstance(pattern, (Int, Float)) and
733753
in_range(pattern.values.min) and
734754
in_range(pattern.values.max)):
735-
return DateTime.from_numbers(pattern)
755+
return DateTime.from_numbers(pattern, offset=self.timestamp_offset,
756+
scale=self.timestamp_scale)
736757
elif (
737758
isinstance(pattern, StrRepr) and (
738759
(
@@ -743,6 +764,7 @@ def _match_possible_datetime(self, pattern):
743764
) and
744765
in_range(pattern.content.values.min) and
745766
in_range(pattern.content.values.max)):
746-
return DateTime.from_numbers(pattern)
767+
return DateTime.from_numbers(pattern, offset=self.timestamp_offset,
768+
scale=self.timestamp_scale)
747769
else:
748770
return pattern

structa/conversions.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# SPDX-License-Identifier: GPL-2.0-or-later
66

77
import re
8+
from datetime import timedelta
89

910
from dateutil.parser import parse
1011
from dateutil.relativedelta import relativedelta
@@ -84,18 +85,22 @@ def parse_bool(s, false='0', true='1'):
8485
}
8586

8687

87-
def parse_duration(s):
88+
def parse_duration(s, delta_type=relativedelta):
8889
"""
89-
Convert the string *s* to a :class:`~dateutil.relativedelta.relativedelta`.
90-
The string must consist of white-space and/or comma separated values which
91-
are a number followed by a suffix indicating duration. For example:
90+
Convert the string *s* to a :class:`~dateutil.relativedelta.relativedelta`
91+
(by default) or a :class:`~datetime.timedelta` if requested by
92+
*delta_type*. The string must consist of white-space and/or comma separated
93+
values which are a number followed by a suffix indicating duration. For
94+
example:
9295
9396
>>> parse_duration('1s')
9497
relativedelta(seconds=+1)
9598
>>> parse_duration('5 minutes, 30 seconds')
9699
relativedelta(minutes=+5, seconds=+30)
97100
>>> parse_duration('1 year')
98101
relativedelta(years=+1)
102+
>>> parse_duration('1 week, 1 day', delta_type=datetime.timedelta)
103+
datetime.timedelta(days=8)
99104
100105
Note that some suffixes like "m" can be ambiguous; using common
101106
abbreviations should avoid ambiguity:
@@ -112,6 +117,9 @@ def parse_duration(s):
112117
* *Microseconds*: microseconds, microsecond, microsec, micros, micro,
113118
useconds, usecond, usecs, usec, us, µseconds, µsecond, µsecs, µsec, µs
114119
120+
* *Milliseconds*: milliseconds, millisecond, millisec, millis, milli,
121+
mseconds, msecond, msecs, msec, ms
122+
115123
* *Seconds*: seconds, second, secs, sec, s
116124
117125
* *Minutes*: minutes, minute, mins, min, mi
@@ -122,35 +130,50 @@ def parse_duration(s):
122130
123131
* *Weeks*: weeks, week, wks, wk, w
124132
125-
* *Months*: months, month, mons, mon, mths, mth, m
133+
* *Months*: months, month, mons, mon, mths, mth, m (relativedelta only)
126134
127-
* *Years*: years, year, yrs, yr, y
135+
* *Years*: years, year, yrs, yr, y (relativedelta only)
128136
129137
If conversion fails, :exc:`ValueError` is raised.
130138
"""
131-
spans = {span: 0 for span in _SPANS}
139+
assert delta_type in (relativedelta, timedelta)
140+
spans = {}
132141
t = s
133142
for span, regex in _SPANS.items():
143+
if delta_type is timedelta and span in ('months', 'years'):
144+
continue
134145
m = regex.search(t)
135146
if m:
136-
spans[span] += int(m.group('num'))
147+
spans[span] = spans.get(span, 0) + int(m.group('num'))
137148
t = (t[:m.start(0)] + t[m.end(0):]).strip(' \t\n,')
138149
if not t:
139150
break
140151
if t:
141152
raise ValueError('invalid duration {}'.format(s))
142-
spans['microseconds'] += spans.pop('milliseconds') * 1000
143-
return relativedelta(**spans)
153+
if delta_type is relativedelta:
154+
spans['microseconds'] = (
155+
spans.get('microseconds', 0) +
156+
(spans.pop('milliseconds', 0) * 1000))
157+
return delta_type(**spans)
158+
159+
160+
def parse_timestamp(s):
161+
"""
162+
Convert the string *s* to a :class:`~datetime.datetime`. A
163+
:exc:`ValueError` is raised if *s* is not a valid datetime representation.
164+
"""
165+
return parse(s)
144166

145167

146-
def parse_duration_or_timestamp(s):
168+
def parse_duration_or_timestamp(s, delta_type=relativedelta):
147169
"""
148170
Convert the string *s* to a :class:`~datetime.datetime` or a
149-
:class:`~dateutil.relativedelta.relativedelta`. Duration conversion is
150-
attempted to and, if this fails, date-time conversion is attempted. A
151-
:exc:`ValueError` is raised if both conversions fail.
171+
:class:`~dateutil.relativedelta.relativedelta` (or
172+
:class:`~datetime.timedelta` if *delta_type* so specifies). Duration
173+
conversion is attempted first and, if this fails, date-time conversion is
174+
attempted. A :exc:`ValueError` is raised if both conversions fail.
152175
"""
153176
try:
154-
return parse_duration(s)
177+
return parse_duration(s, delta_type=delta_type)
155178
except ValueError:
156-
return parse(s)
179+
return parse_timestamp(s)

structa/format.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from math import log
88
from itertools import tee
9-
from datetime import datetime
9+
from datetime import datetime, timedelta
1010

1111

1212
def pairwise(iterable):
@@ -153,3 +153,29 @@ def format_sample(value):
153153
}[type(value)]()
154154
except KeyError:
155155
raise ValueError('invalid type for value {!r}'.format(value))
156+
157+
158+
def format_timestamp_numrepr(offset, scale):
159+
delta = timedelta(seconds=scale)
160+
simple = {
161+
timedelta(**{name: 1}): name
162+
for name in (
163+
'microseconds',
164+
'milliseconds',
165+
'seconds',
166+
'minutes',
167+
'hours',
168+
'days',
169+
'weeks',
170+
)
171+
}
172+
if offset % 86400:
173+
epoch = datetime.utcfromtimestamp(offset).isoformat()
174+
else:
175+
epoch = datetime.utcfromtimestamp(offset).date().isoformat()
176+
try:
177+
return '{unit} since {epoch}'.format(unit=simple[delta], epoch=epoch)
178+
except KeyError:
179+
return 'seconds since {epoch} {op} {scale:g}'.format(
180+
epoch=epoch, op=('*', '/')[scale >= 1],
181+
scale=scale if scale >= 1 else (1 / scale))

structa/types.py

Lines changed: 49 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,13 @@
1515

1616
from .collections import Counter, FrozenCounter
1717
from .conversions import try_conversion, parse_bool
18-
from .format import format_int, format_repr, format_sample
1918
from .xml import ElementFactory, xml, merge_siblings
19+
from .format import (
20+
format_int,
21+
format_repr,
22+
format_sample,
23+
format_timestamp_numrepr,
24+
)
2025

2126

2227
tag = ElementFactory()
@@ -905,11 +910,15 @@ def from_strings(cls, iterable, pattern, bad_threshold=0):
905910
pattern=pattern)
906911

907912
@classmethod
908-
def from_numbers(cls, pattern):
913+
def from_numbers(cls, pattern, offset=0, scale=1):
909914
"""
910915
Class method for constructing an instance wrapped in a :class:`NumRepr`
911916
to indicate a numeric representation of a set of timestamps (e.g. day
912-
offset from the UNIX epoch).
917+
offset from the UNIX epoch). A different epoch may be specified as a
918+
numeric *offset*, and a different epoch *scale* as a number of
919+
seconds. The default offset and scale are 0 and 1, which is equivalent
920+
to a seconds offset from the UNIX epoch (i.e. a traditional UNIX
921+
timestamp).
913922
914923
Constructed with a *sample* of numbers, a *pattern* (which can be a
915924
:class:`StrRepr` instance if the numbers are themselves represented as
@@ -923,8 +932,10 @@ def from_numbers(cls, pattern):
923932
num_pattern = pattern
924933
dt_counter = Counter()
925934
for value, count in num_pattern.values.sample.items():
926-
dt_counter[datetime.fromtimestamp(value)] = count
927-
result = NumRepr(cls(dt_counter), pattern=num_pattern.__class__)
935+
dt_value = datetime.utcfromtimestamp((value * scale) + offset)
936+
dt_counter[dt_value] = count
937+
result = NumRepr(cls(dt_counter), pattern=(
938+
num_pattern.__class__, scale, offset))
928939
if isinstance(pattern, StrRepr):
929940
return pattern.with_content(result)
930941
else:
@@ -1174,12 +1185,12 @@ def validate(self, value):
11741185
value = parse_bool(value, false, true)
11751186
elif isinstance(self.content, Int) or (
11761187
isinstance(self.content, NumRepr) and
1177-
self.content.pattern is Int
1188+
self.content.pattern[0] is Int
11781189
):
11791190
value = int(value, base=self.int_bases[self.pattern])
11801191
elif isinstance(self.content, Float) or (
11811192
isinstance(self.content, NumRepr) and
1182-
self.content.pattern is Float
1193+
self.content.pattern[0] is Float
11831194
):
11841195
assert self.pattern == 'f'
11851196
value = float(value)
@@ -1199,36 +1210,52 @@ class NumRepr(Repr):
11991210
__slots__ = ()
12001211

12011212
def __str__(self):
1202-
if self.pattern is Int:
1203-
template = 'int of {self.content}'
1204-
elif self.pattern is Float:
1205-
template = 'float of {self.content}'
1206-
else:
1213+
type_, scale, offset = self.pattern
1214+
try:
1215+
type_name = {Int: 'int', Float: 'float'}[type_]
1216+
except KeyError:
12071217
assert False, 'str(num-repr) of {self.content!r}'.format(self=self)
1208-
return template.format(self=self)
1218+
return '{type_name} {numrepr} of {self.content}'.format(
1219+
self=self, type_name=type_name,
1220+
numrepr=format_timestamp_numrepr(offset, scale))
12091221

12101222
def __xml__(self):
1211-
if self.pattern is Int:
1212-
return tag.intof(xml(self.content))
1213-
elif self.pattern is Float:
1214-
return tag.floatof(xml(self.content))
1223+
type_, scale, offset = self.pattern
1224+
if type_ is Int:
1225+
return tag.intof(xml(self.content), scale=scale, offset=offset)
1226+
elif type_ is Float:
1227+
return tag.floatof(xml(self.content), scale=scale, offset=offset)
12151228
else:
12161229
assert False, 'xml(num-repr) of {self.content!r}'.format(self=self)
12171230

12181231
def __add__(self, other):
12191232
if self == other:
1220-
if self.pattern is Float or other.pattern is Float:
1221-
pattern = Float
1233+
self_type, self_scale, self_offset = self.pattern
1234+
other_type, other_scale, other_offset = other.pattern
1235+
if self_type is Float or other_type is Float:
1236+
add_type = Float
12221237
else:
1223-
pattern = Int
1224-
return NumRepr(self.content + other.content, pattern)
1238+
add_type = Int
1239+
return NumRepr(
1240+
self.content + other.content,
1241+
(add_type, self_scale, self_offset))
12251242
return NotImplemented
12261243

1244+
def __eq__(self, other):
1245+
if not isinstance(other, NumRepr):
1246+
return NotImplemented
1247+
if super().__eq__(other) is not True:
1248+
return False
1249+
self_type, self_scale, self_offset = self.pattern
1250+
other_type, other_scale, other_offset = other.pattern
1251+
return (self_scale == other_scale) and (self_offset == other_offset)
1252+
12271253
def validate(self, value):
12281254
if not isinstance(value, Real):
12291255
raise TypeError('{value!r} is not a number'.format(value=value))
12301256
if isinstance(self.content, DateTime):
1231-
value = datetime.fromtimestamp(value)
1257+
type_, scale, offset = self.pattern
1258+
value = datetime.utcfromtimestamp((value * scale) + offset)
12321259
else:
12331260
assert False, (
12341261
'validating num-repr of {self.content!r}'.format(self=self))

0 commit comments

Comments
 (0)