WIP

waveform80 · waveform80 · commit 511cbaa233e9 · 2022-08-08T15:27:45.000+01:00
diff --git a/setup.cfg b/setup.cfg
@@ -25,6 +25,7 @@ classifiers =
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
+    Programming Language :: Python :: Implementation :: PyPy
 
 [options]
 packages = find:
diff --git a/structa/analyzer.py b/structa/analyzer.py
@@ -158,7 +158,8 @@ class Analyzer:
         The maximum timestamp to use when determining whether floating point
         values potentially represent epoch-based datetime values.
 
-    :param datetime.datetime epoch:
+    :type epoch: datetime.datetime or None
+    :param epoch:
         The epoch to use when converting numbers to datetime values. Defaults
         to the UNIX epoch (1st January, 1970)
 
@@ -666,8 +667,7 @@ def _match_fixed_len_str(self, items, *, bad_threshold=0):
         for pattern in FIXED_DATETIME_PATTERNS:
             try:
                 return DateTime.from_strings(items, pattern,
-                                             bad_threshold=bad_threshold,
-                                             epoch=self.epoch)
+                                             bad_threshold=bad_threshold)
             except ValueError:
                 pass
         pattern = []
diff --git a/structa/conversions.py b/structa/conversions.py
@@ -149,6 +149,14 @@ def parse_duration(s):
             raise ValueError('invalid duration {}'.format(s))
 
 
+def parse_timestamp(s):
+    """
+    Convert the string *s* to a :class:`~datetime.datetime`. A
+    :exc:`ValueError` is raised if *s* is not a valid datetime representation.
+    """
+    return parse(s)  # NOTE(review): presumably dateutil.parser.parse; confirm
+
+
 def parse_duration_or_timestamp(s):
     """
     Convert the string *s* to a :class:`~datetime.datetime` or a
@@ -159,4 +167,4 @@ def parse_duration_or_timestamp(s):
     try:
         return parse_duration(s)
     except ValueError:
-        return parse(s)
+        return parse_timestamp(s)
diff --git a/structa/types.py b/structa/types.py
@@ -909,7 +909,7 @@ def from_numbers(cls, pattern, epoch=None):
         """
         Class method for constructing an instance wrapped in a :class:`NumRepr`
         to indicate a numeric representation of a set of timestamps (e.g. day
-        offset from the UNIX epoch; a differen *epoch* may be specified as
+        offset from the UNIX epoch; a different *epoch* may be specified as
         a :class:`~datetime.datetime`).
 
         Constructed with an *sample* of number, a *pattern* (which can be a
@@ -929,7 +929,7 @@ def from_numbers(cls, pattern, epoch=None):
             unix_epoch = datetime.utcfromtimestamp(0)
             offset = (unix_epoch - epoch).total_seconds() / 86400
         for value, count in num_pattern.values.sample.items():
-            dt_counter[datetime.fromtimestamp(value - offset)] = count
+            dt_counter[datetime.utcfromtimestamp(value - offset)] = count
         result = NumRepr(cls(dt_counter), pattern=num_pattern.__class__)
         if isinstance(pattern, StrRepr):
             return pattern.with_content(result)
@@ -1234,7 +1234,7 @@ def validate(self, value):
         if not isinstance(value, Real):
             raise TypeError('{value!r} is not a number'.format(value=value))
         if isinstance(self.content, DateTime):
-            value = datetime.fromtimestamp(value)
+            value = datetime.utcfromtimestamp(value)
         else:
             assert False, (
                 'validating num-repr of {self.content!r}'.format(self=self))
diff --git a/structa/ui/cli.py b/structa/ui/cli.py
@@ -17,7 +17,7 @@
 
 from ..analyzer import Analyzer
 from ..errors import ValidationWarning
-from ..conversions import parse_duration_or_timestamp
+from ..conversions import parse_duration_or_timestamp, parse_timestamp
 from ..types import sources_list, SourcesList
 from ..source import Source
 from ..xml import xml, get_transform
@@ -171,6 +171,11 @@ def get_config(args):
         "point fields represent UNIX timestamps (default: %(default)s). Can "
         "be specified as an absolute timestamp (in ISO-8601 format) or a "
         "duration to be added to the current timestamp")
+    parser.add_argument(
+        '--epoch', type=epoch, metavar='WHEN', default='unix',
+        help="The epoch from which datetimes are measured. Can be specified "
+        "as an absolute timestamp (in ISO-8601 format), or one of the special "
+        'strings, "unix" or "excel" (default: %(default)s)')
     parser.add_argument(
         '--max-numeric-len', type=int, metavar='LEN', default=30,
         help="The maximum number of characters that a number, integer or "
@@ -347,6 +352,42 @@ def max_timestamp(s, now=_start):
     else:
         return now + t
 
+def epoch(s):
+    """
+    Convert the ``--epoch`` command line value *s* to a naive
+    :class:`~datetime.datetime`.
+
+    *s* may be one of the special strings "unix" or "excel"; anything else is
+    passed to ``parse_timestamp``. If the parsed value is timezone-aware it is
+    converted to UTC and its tzinfo is dropped, so that the result can be
+    subtracted from other naive datetimes (mixing naive and aware values in
+    arithmetic raises :exc:`TypeError`).
+    """
+    try:
+        return {
+            # The Excel epoch is defined as 1900-01-01, but that is date "1"
+            # in Excel, rather than "0". Furthermore, for compat. with good
+            # ol' 1-2-3, 1900 was treated (incorrectly) as a leap-year leading
+            # to a +1 offset for all dates after 1900-02-28. Rather than
+            # emulate all that nonsense, we just use 1899-12-30 as the epoch
+            # which is good enough for all detection purposes (which is all
+            # structa cares about anyway)
+            'excel': datetime(1899, 12, 30),
+            # Identical to datetime.utcfromtimestamp(0), without relying on a
+            # call that is deprecated from Python 3.12 onwards
+            'unix': datetime(1970, 1, 1),
+        }[s]
+    except KeyError:
+        pass
+    # Parse outside the handler so a ValueError from a bad timestamp is not
+    # reported as occurring "during handling" of the KeyError above
+    result = parse_timestamp(s)
+    offset = result.utcoffset()
+    if offset is not None:
+        # Aware result (e.g. *s* carried a UTC offset): normalise to naive UTC
+        result = result.replace(tzinfo=None) - offset
+    return result
+
 def num(s):
     if s.endswith('%'):
         return Fraction(num(s[:-1]), 100)
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
@@ -351,7 +351,7 @@ def test_analyze_datetimes():
     start = (now - dt.timedelta(days=50)).timestamp()
     finish = (now + dt.timedelta(days=50)).timestamp()
     data = [
-        dt.datetime.fromtimestamp(n).replace(microsecond=0)
+        dt.datetime.utcfromtimestamp(n).replace(microsecond=0)
         for n in frange(start, finish, step=86400.0)
     ]
     assert Analyzer(bad_threshold=0).analyze(data) == List(
@@ -364,7 +364,7 @@ def test_analyze_datetime_str():
     start = (now - dt.timedelta(days=50)).timestamp()
     finish = (now + dt.timedelta(days=50)).timestamp()
     dates = [
-        dt.datetime.fromtimestamp(n).replace(microsecond=0)
+        dt.datetime.utcfromtimestamp(n).replace(microsecond=0)
         for n in frange(start, finish, step=86400.0)
     ]
     data = [date.strftime('%Y-%m-%d %H:%M:%S') for date in dates]
@@ -400,7 +400,7 @@ def test_analyze_datetime_float():
     assert Analyzer(bad_threshold=0).analyze(data) == List(
         sample=[data],
         content=[NumRepr(
-            DateTime(Counter(dt.datetime.fromtimestamp(n) for n in data)),
+            DateTime(Counter(dt.datetime.utcfromtimestamp(n) for n in data)),
             pattern=float)]
     )
 
@@ -414,7 +414,7 @@ def test_analyze_datetime_float_str():
     assert Analyzer(bad_threshold=0).analyze(data) == List(
         sample=[data],
         content=[StrRepr(NumRepr(
-            DateTime(Counter(dt.datetime.fromtimestamp(float(n)) for n in data)),
+            DateTime(Counter(dt.datetime.utcfromtimestamp(float(n)) for n in data)),
             pattern=float), pattern='f')]
     )
 
@@ -428,8 +428,8 @@ def test_analyze_datetime_bad_range():
     data = {randtime() for n in range(99)} | {start}
     data = list(data)
     assert Analyzer(bad_threshold=0,
-                    min_timestamp=dt.datetime.fromtimestamp(now),
-                    max_timestamp=dt.datetime.fromtimestamp(finish)
+                    min_timestamp=dt.datetime.utcfromtimestamp(now),
+                    max_timestamp=dt.datetime.utcfromtimestamp(finish)
                     ).analyze(data) == List(
         sample=[data],
         content=[Float(Counter(data))])
@@ -452,7 +452,7 @@ def test_analyze_strs_with_blanks():
     randtime = lambda: random.random() * (finish - start) + start
     # Make 10% of the data blank
     dates = [
-        dt.datetime.fromtimestamp(randtime()).replace(microsecond=0)
+        dt.datetime.utcfromtimestamp(randtime()).replace(microsecond=0)
         for n in range(90)
     ]
     data = [
@@ -472,7 +472,7 @@ def test_analyze_too_many_blanks():
     randtime = lambda: random.random() * (finish - start) + start
     # Make 50% of the data blank
     data = [
-        dt.datetime.fromtimestamp(randtime()).strftime('%Y-%m-%d %H:%M:%S')
+        dt.datetime.utcfromtimestamp(randtime()).strftime('%Y-%m-%d %H:%M:%S')
         for n in range(50)
     ] + ['' for n in range(50)]
     random.shuffle(data)
@@ -488,7 +488,7 @@ def test_analyze_unique_list_with_bad_data():
     randtime = lambda: random.random() * (finish - start) + start
     # Make 0.1% of the data invalid (oh noes! A MySQL dump!)
     dates = {
-        dt.datetime.fromtimestamp(randtime()).replace(microsecond=0)
+        dt.datetime.utcfromtimestamp(randtime()).replace(microsecond=0)
         for n in range(999)
     }
     data = {
@@ -508,7 +508,7 @@ def test_analyze_non_unique_list_with_bad_data():
     finish = now.timestamp()
     randtime = lambda: random.random() * (finish - start) + start
     dates = [
-        dt.datetime.fromtimestamp(randtime()).replace(microsecond=0)
+        dt.datetime.utcfromtimestamp(randtime()).replace(microsecond=0)
         for n in range(100)
     ]
     dates = dates * 10
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -35,6 +35,12 @@ def test_max_timestamp():
     assert cli.max_timestamp('10 years') == cli._start + relativedelta(years=10)
 
 
+def test_epoch():
+    assert cli.epoch('unix') == dt.datetime(1970, 1, 1)  # special name
+    assert cli.epoch('excel') == dt.datetime(1899, 12, 30)  # special name
+    assert cli.epoch('2015-03-31 00:00:00') == dt.datetime(2015, 3, 31)  # explicit timestamp
+
+
 def test_num():
     assert cli.num('1') == 1
     assert cli.num('1/2') == Fraction(1, 2)
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -47,10 +47,18 @@ def test_parse_duration():
         parse_duration('foo')
 
 
+def test_parse_timestamp():
+    assert parse_timestamp('1970-01-01T00:00:00') == dt.datetime(1970, 1, 1, 0, 0, 0)  # "T" separator
+    assert parse_timestamp('2000-01-01 00:00:00') == dt.datetime(2000, 1, 1, 0, 0, 0)  # space separator
+    with pytest.raises(ValueError):
+        parse_timestamp('foo')  # not a timestamp at all
+
+
 def test_parse_duration_or_timestamp():
     assert parse_duration_or_timestamp('') == relativedelta(seconds=0)
     assert parse_duration_or_timestamp('1 hour 30 minutes') == relativedelta(hours=1, minutes=30)
     assert parse_duration_or_timestamp('-1yr') == relativedelta(years=-1)
     assert parse_duration_or_timestamp('01:30:00') == dt.datetime.today().replace(
         hour=1, minute=30, second=0, microsecond=0)
+    assert parse_duration_or_timestamp('1970-01-01T00:00:00') == dt.datetime(1970, 1, 1, 0, 0, 0)
     assert parse_duration_or_timestamp('2000-01-01 00:00:00') == dt.datetime(2000, 1, 1, 0, 0, 0)
diff --git a/tests/test_types.py b/tests/test_types.py
@@ -801,10 +801,10 @@ def test_datetime_strrepr():
 @pytest.mark.skipif(sys.maxsize <= 2**32, reason="requires 64-bit arch")
 def test_datetime_numrepr():
     data = {
-        dt.datetime.fromtimestamp(0),
-        dt.datetime.fromtimestamp(1),
-        dt.datetime.fromtimestamp(86400),
-        dt.datetime.fromtimestamp(100000),
+        dt.datetime.utcfromtimestamp(0),
+        dt.datetime.utcfromtimestamp(1),
+        dt.datetime.utcfromtimestamp(86400),
+        dt.datetime.utcfromtimestamp(100000),
     }
     numbers = Int(Counter(d.timestamp() for d in data))
     pattern = DateTime.from_numbers(numbers)
@@ -818,13 +818,34 @@ def test_datetime_numrepr():
         pattern.validate(2000000000000)
 
 
+@pytest.mark.skipif(sys.maxsize <= 2**32, reason="requires 64-bit arch")
+def test_datetime_numrepr_epoch():
+    excel_epoch = dt.datetime(1899, 12, 30)
+    offset = (dt.datetime.utcfromtimestamp(0) - excel_epoch).total_seconds() // 86400
+    data = {
+        dt.datetime(1943, 7, 20), dt.datetime(1970, 1, 1), dt.datetime(1976, 1, 1),
+    }
+    # from_numbers decodes with utcfromtimestamp, so encode the naive sample as
+    # UTC too; a bare d.timestamp() assumes local time and fails outside UTC
+    numbers = Int(Counter(d.replace(tzinfo=dt.timezone.utc).timestamp() + offset for d in data))
+    pattern = DateTime.from_numbers(numbers, epoch=excel_epoch)
+    assert pattern == NumRepr(DateTime(Counter(data)), pattern=Int)
+    pattern.validate(1000)
+    with pytest.raises(TypeError):
+        pattern.validate('1000')
+    with pytest.raises(ValueError):
+        pattern.validate(1200000)  # NOTE(review): 1970-01-14 is inside the sample range -- confirm
+    with pytest.raises(ValueError):
+        pattern.validate(2000000000000)
+
+
 @pytest.mark.skipif(sys.maxsize <= 2**32, reason="requires 64-bit arch")
 def test_datetime_strrepr_numrepr():
     data = {
-        dt.datetime.fromtimestamp(0),
-        dt.datetime.fromtimestamp(1),
-        dt.datetime.fromtimestamp(86400),
-        dt.datetime.fromtimestamp(100000),
+        dt.datetime.utcfromtimestamp(0),
+        dt.datetime.utcfromtimestamp(1),
+        dt.datetime.utcfromtimestamp(86400),
+        dt.datetime.utcfromtimestamp(100000),
     }
     numbers = StrRepr(Int(Counter(d.timestamp() for d in data)), pattern='d')
     pattern = DateTime.from_numbers(numbers)
@@ -903,10 +924,10 @@ def test_strrepr_add():
 
 def test_numrepr_add():
     data = {
-        dt.datetime.fromtimestamp(0),
-        dt.datetime.fromtimestamp(1),
-        dt.datetime.fromtimestamp(86400),
-        dt.datetime.fromtimestamp(100000),
+        dt.datetime.utcfromtimestamp(0),
+        dt.datetime.utcfromtimestamp(1),
+        dt.datetime.utcfromtimestamp(86400),
+        dt.datetime.utcfromtimestamp(100000),
     }
     numbers = Int(Counter(d.timestamp() for d in data))
     int_pattern = DateTime.from_numbers(numbers)