Skip to content

Commit 511cbaa

Browse files
committed
WIP
1 parent ff2763d commit 511cbaa

9 files changed

Lines changed: 95 additions & 30 deletions

File tree

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ classifiers =
2525
Programming Language :: Python :: 3.8
2626
Programming Language :: Python :: 3.9
2727
Programming Language :: Python :: 3.10
28+
Programming Language :: Python :: Implementation :: PyPy
2829

2930
[options]
3031
packages = find:

structa/analyzer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,8 @@ class Analyzer:
158158
The maximum timestamp to use when determining whether floating point
159159
values potentially represent epoch-based datetime values.
160160
161-
:param datetime.datetime epoch:
161+
:type epoch: datetime.datetime or None
162+
:param epoch:
162163
The epoch to use when converting numbers to datetime values. Defaults
163164
to the UNIX epoch (1st January, 1970)
164165
@@ -666,8 +667,7 @@ def _match_fixed_len_str(self, items, *, bad_threshold=0):
666667
for pattern in FIXED_DATETIME_PATTERNS:
667668
try:
668669
return DateTime.from_strings(items, pattern,
669-
bad_threshold=bad_threshold,
670-
epoch=self.epoch)
670+
bad_threshold=bad_threshold)
671671
except ValueError:
672672
pass
673673
pattern = []

structa/conversions.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,14 @@ def parse_duration(s):
149149
raise ValueError('invalid duration {}'.format(s))
150150

151151

152+
def parse_timestamp(s):
153+
"""
154+
Convert the string *s* to a :class:`~datetime.datetime`. A
155+
:exc:`ValueError` is raised if *s* is not a valid datetime representation.
156+
"""
157+
return parse(s)
158+
159+
152160
def parse_duration_or_timestamp(s):
153161
"""
154162
Convert the string *s* to a :class:`~datetime.datetime` or a
@@ -159,4 +167,4 @@ def parse_duration_or_timestamp(s):
159167
try:
160168
return parse_duration(s)
161169
except ValueError:
162-
return parse(s)
170+
return parse_timestamp(s)

structa/types.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -909,7 +909,7 @@ def from_numbers(cls, pattern, epoch=None):
909909
"""
910910
Class method for constructing an instance wrapped in a :class:`NumRepr`
911911
to indicate a numeric representation of a set of timestamps (e.g. day
912-
offset from the UNIX epoch; a differen *epoch* may be specified as
912+
offset from the UNIX epoch; a different *epoch* may be specified as
913913
a :class:`~datetime.datetime`).
914914
915915
Constructed with a *sample* of numbers, a *pattern* (which can be a
@@ -929,7 +929,7 @@ def from_numbers(cls, pattern, epoch=None):
929929
unix_epoch = datetime.utcfromtimestamp(0)
930930
offset = (unix_epoch - epoch).total_seconds() / 86400
931931
for value, count in num_pattern.values.sample.items():
932-
dt_counter[datetime.fromtimestamp(value - offset)] = count
932+
dt_counter[datetime.utcfromtimestamp(value - offset)] = count
933933
result = NumRepr(cls(dt_counter), pattern=num_pattern.__class__)
934934
if isinstance(pattern, StrRepr):
935935
return pattern.with_content(result)
@@ -1234,7 +1234,7 @@ def validate(self, value):
12341234
if not isinstance(value, Real):
12351235
raise TypeError('{value!r} is not a number'.format(value=value))
12361236
if isinstance(self.content, DateTime):
1237-
value = datetime.fromtimestamp(value)
1237+
value = datetime.utcfromtimestamp(value)
12381238
else:
12391239
assert False, (
12401240
'validating num-repr of {self.content!r}'.format(self=self))

structa/ui/cli.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from ..analyzer import Analyzer
1919
from ..errors import ValidationWarning
20-
from ..conversions import parse_duration_or_timestamp
20+
from ..conversions import parse_duration_or_timestamp, parse_timestamp
2121
from ..types import sources_list, SourcesList
2222
from ..source import Source
2323
from ..xml import xml, get_transform
@@ -171,6 +171,11 @@ def get_config(args):
171171
"point fields represent UNIX timestamps (default: %(default)s). Can "
172172
"be specified as an absolute timestamp (in ISO-8601 format) or a "
173173
"duration to be added to the current timestamp")
174+
parser.add_argument(
175+
'--epoch', type=epoch, metavar='WHEN', default='unix',
176+
help="The epoch from which datetimes are measured. Can be specified "
177+
"as an absolute timestamp (in ISO-8601 format), or one of the special "
178+
'strings, "unix" or "excel" (default: %(default)s)')
174179
parser.add_argument(
175180
'--max-numeric-len', type=int, metavar='LEN', default=30,
176181
help="The maximum number of characters that a number, integer or "
@@ -347,6 +352,22 @@ def max_timestamp(s, now=_start):
347352
else:
348353
return now + t
349354

355+
def epoch(s):
    """
    Convert the command line string *s* to the :class:`~datetime.datetime`
    used as the epoch from which numeric timestamps are measured.

    *s* may be one of the special names "unix" or "excel" (matched without
    regard to case or surrounding whitespace). Any other value is handed,
    unmodified, to :func:`parse_timestamp` and whatever that returns (or
    raises, for an unparseable string) is passed on to the caller.
    """
    named_epochs = {
        # The Excel epoch is defined as 1900-01-01, but that is date "1"
        # in Excel, rather than "0". Furthermore, for compatibility with
        # good ol' 1-2-3, 1900 was treated (incorrectly) as a leap-year,
        # leading to a +1 offset for all dates after 1900-02-28. Rather
        # than emulate all that nonsense, we just use 1899-12-30 as the
        # epoch, which is good enough for all detection purposes (which is
        # all structa cares about anyway)
        'excel': datetime(1899, 12, 30),
        # Spelled as a literal instead of datetime.utcfromtimestamp(0),
        # which is deprecated from Python 3.12; the naive value is the same
        'unix': datetime(1970, 1, 1),
    }
    try:
        return named_epochs[s.strip().lower()]
    except KeyError:
        # Not a special name; treat it as an absolute timestamp
        return parse_timestamp(s)
370+
350371
def num(s):
351372
if s.endswith('%'):
352373
return Fraction(num(s[:-1]), 100)

tests/test_analyzer.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ def test_analyze_datetimes():
351351
start = (now - dt.timedelta(days=50)).timestamp()
352352
finish = (now + dt.timedelta(days=50)).timestamp()
353353
data = [
354-
dt.datetime.fromtimestamp(n).replace(microsecond=0)
354+
dt.datetime.utcfromtimestamp(n).replace(microsecond=0)
355355
for n in frange(start, finish, step=86400.0)
356356
]
357357
assert Analyzer(bad_threshold=0).analyze(data) == List(
@@ -364,7 +364,7 @@ def test_analyze_datetime_str():
364364
start = (now - dt.timedelta(days=50)).timestamp()
365365
finish = (now + dt.timedelta(days=50)).timestamp()
366366
dates = [
367-
dt.datetime.fromtimestamp(n).replace(microsecond=0)
367+
dt.datetime.utcfromtimestamp(n).replace(microsecond=0)
368368
for n in frange(start, finish, step=86400.0)
369369
]
370370
data = [date.strftime('%Y-%m-%d %H:%M:%S') for date in dates]
@@ -400,7 +400,7 @@ def test_analyze_datetime_float():
400400
assert Analyzer(bad_threshold=0).analyze(data) == List(
401401
sample=[data],
402402
content=[NumRepr(
403-
DateTime(Counter(dt.datetime.fromtimestamp(n) for n in data)),
403+
DateTime(Counter(dt.datetime.utcfromtimestamp(n) for n in data)),
404404
pattern=float)]
405405
)
406406

@@ -414,7 +414,7 @@ def test_analyze_datetime_float_str():
414414
assert Analyzer(bad_threshold=0).analyze(data) == List(
415415
sample=[data],
416416
content=[StrRepr(NumRepr(
417-
DateTime(Counter(dt.datetime.fromtimestamp(float(n)) for n in data)),
417+
DateTime(Counter(dt.datetime.utcfromtimestamp(float(n)) for n in data)),
418418
pattern=float), pattern='f')]
419419
)
420420

@@ -428,8 +428,8 @@ def test_analyze_datetime_bad_range():
428428
data = {randtime() for n in range(99)} | {start}
429429
data = list(data)
430430
assert Analyzer(bad_threshold=0,
431-
min_timestamp=dt.datetime.fromtimestamp(now),
432-
max_timestamp=dt.datetime.fromtimestamp(finish)
431+
min_timestamp=dt.datetime.utcfromtimestamp(now),
432+
max_timestamp=dt.datetime.utcfromtimestamp(finish)
433433
).analyze(data) == List(
434434
sample=[data],
435435
content=[Float(Counter(data))])
@@ -452,7 +452,7 @@ def test_analyze_strs_with_blanks():
452452
randtime = lambda: random.random() * (finish - start) + start
453453
# Make 10% of the data blank
454454
dates = [
455-
dt.datetime.fromtimestamp(randtime()).replace(microsecond=0)
455+
dt.datetime.utcfromtimestamp(randtime()).replace(microsecond=0)
456456
for n in range(90)
457457
]
458458
data = [
@@ -472,7 +472,7 @@ def test_analyze_too_many_blanks():
472472
randtime = lambda: random.random() * (finish - start) + start
473473
# Make 50% of the data blank
474474
data = [
475-
dt.datetime.fromtimestamp(randtime()).strftime('%Y-%m-%d %H:%M:%S')
475+
dt.datetime.utcfromtimestamp(randtime()).strftime('%Y-%m-%d %H:%M:%S')
476476
for n in range(50)
477477
] + ['' for n in range(50)]
478478
random.shuffle(data)
@@ -488,7 +488,7 @@ def test_analyze_unique_list_with_bad_data():
488488
randtime = lambda: random.random() * (finish - start) + start
489489
# Make 0.1% of the data invalid (oh noes! A MySQL dump!)
490490
dates = {
491-
dt.datetime.fromtimestamp(randtime()).replace(microsecond=0)
491+
dt.datetime.utcfromtimestamp(randtime()).replace(microsecond=0)
492492
for n in range(999)
493493
}
494494
data = {
@@ -508,7 +508,7 @@ def test_analyze_non_unique_list_with_bad_data():
508508
finish = now.timestamp()
509509
randtime = lambda: random.random() * (finish - start) + start
510510
dates = [
511-
dt.datetime.fromtimestamp(randtime()).replace(microsecond=0)
511+
dt.datetime.utcfromtimestamp(randtime()).replace(microsecond=0)
512512
for n in range(100)
513513
]
514514
dates = dates * 10

tests/test_cli.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ def test_max_timestamp():
3535
assert cli.max_timestamp('10 years') == cli._start + relativedelta(years=10)
3636

3737

38+
def test_epoch():
39+
assert cli.epoch('unix') == dt.datetime(1970, 1, 1)
40+
assert cli.epoch('excel') == dt.datetime(1899, 12, 30)
41+
assert cli.epoch('2015-03-31 00:00:00') == dt.datetime(2015, 3, 31)
42+
43+
3844
def test_num():
3945
assert cli.num('1') == 1
4046
assert cli.num('1/2') == Fraction(1, 2)

tests/test_conversions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,18 @@ def test_parse_duration():
4747
parse_duration('foo')
4848

4949

50+
def test_parse_timestamp():
51+
assert parse_timestamp('1970-01-01T00:00:00') == dt.datetime(1970, 1, 1, 0, 0, 0)
52+
assert parse_timestamp('2000-01-01 00:00:00') == dt.datetime(2000, 1, 1, 0, 0, 0)
53+
with pytest.raises(ValueError):
54+
parse_timestamp('foo')
55+
56+
5057
def test_parse_duration_or_timestamp():
5158
assert parse_duration_or_timestamp('') == relativedelta(seconds=0)
5259
assert parse_duration_or_timestamp('1 hour 30 minutes') == relativedelta(hours=1, minutes=30)
5360
assert parse_duration_or_timestamp('-1yr') == relativedelta(years=-1)
5461
assert parse_duration_or_timestamp('01:30:00') == dt.datetime.today().replace(
5562
hour=1, minute=30, second=0, microsecond=0)
63+
assert parse_duration_or_timestamp('1970-01-01T00:00:00') == dt.datetime(1970, 1, 1, 0, 0, 0)
5664
assert parse_duration_or_timestamp('2000-01-01 00:00:00') == dt.datetime(2000, 1, 1, 0, 0, 0)

tests/test_types.py

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -801,10 +801,10 @@ def test_datetime_strrepr():
801801
@pytest.mark.skipif(sys.maxsize <= 2**32, reason="requires 64-bit arch")
802802
def test_datetime_numrepr():
803803
data = {
804-
dt.datetime.fromtimestamp(0),
805-
dt.datetime.fromtimestamp(1),
806-
dt.datetime.fromtimestamp(86400),
807-
dt.datetime.fromtimestamp(100000),
804+
dt.datetime.utcfromtimestamp(0),
805+
dt.datetime.utcfromtimestamp(1),
806+
dt.datetime.utcfromtimestamp(86400),
807+
dt.datetime.utcfromtimestamp(100000),
808808
}
809809
numbers = Int(Counter(d.timestamp() for d in data))
810810
pattern = DateTime.from_numbers(numbers)
@@ -818,13 +818,34 @@ def test_datetime_numrepr():
818818
pattern.validate(2000000000000)
819819

820820

821+
@pytest.mark.skipif(sys.maxsize <= 2**32, reason="requires 64-bit arch")
822+
def test_datetime_numrepr_epoch():
823+
excel_epoch = dt.datetime(1899, 12, 30)
824+
offset = (dt.datetime.utcfromtimestamp(0) - excel_epoch).total_seconds() // 86400
825+
data = {
826+
dt.datetime(1943, 7, 20),
827+
dt.datetime(1970, 1, 1),
828+
dt.datetime(1976, 1, 1),
829+
}
830+
numbers = Int(Counter(d.timestamp() + offset for d in data))
831+
pattern = DateTime.from_numbers(numbers, epoch=excel_epoch)
832+
assert pattern == NumRepr(DateTime(Counter(data)), pattern=Int)
833+
pattern.validate(1000)
834+
with pytest.raises(TypeError):
835+
pattern.validate('1000')
836+
with pytest.raises(ValueError):
837+
pattern.validate(1200000)
838+
with pytest.raises(ValueError):
839+
pattern.validate(2000000000000)
840+
841+
821842
@pytest.mark.skipif(sys.maxsize <= 2**32, reason="requires 64-bit arch")
822843
def test_datetime_strrepr_numrepr():
823844
data = {
824-
dt.datetime.fromtimestamp(0),
825-
dt.datetime.fromtimestamp(1),
826-
dt.datetime.fromtimestamp(86400),
827-
dt.datetime.fromtimestamp(100000),
845+
dt.datetime.utcfromtimestamp(0),
846+
dt.datetime.utcfromtimestamp(1),
847+
dt.datetime.utcfromtimestamp(86400),
848+
dt.datetime.utcfromtimestamp(100000),
828849
}
829850
numbers = StrRepr(Int(Counter(d.timestamp() for d in data)), pattern='d')
830851
pattern = DateTime.from_numbers(numbers)
@@ -903,10 +924,10 @@ def test_strrepr_add():
903924

904925
def test_numrepr_add():
905926
data = {
906-
dt.datetime.fromtimestamp(0),
907-
dt.datetime.fromtimestamp(1),
908-
dt.datetime.fromtimestamp(86400),
909-
dt.datetime.fromtimestamp(100000),
927+
dt.datetime.utcfromtimestamp(0),
928+
dt.datetime.utcfromtimestamp(1),
929+
dt.datetime.utcfromtimestamp(86400),
930+
dt.datetime.utcfromtimestamp(100000),
910931
}
911932
numbers = Int(Counter(d.timestamp() for d in data))
912933
int_pattern = DateTime.from_numbers(numbers)

0 commit comments

Comments
 (0)