Merge branch 'null-support'

waveform80 · waveform80 · commit 68db73c5c7e1 · 2022-08-08T15:23:47.000+01:00
diff --git a/docs/manual.rst b/docs/manual.rst
@@ -15,7 +15,7 @@ Synopsis
 
     structa [-h] [--version] [-f {auto,csv,json,yaml}] [-e ENCODING]
             [--encoding-strict] [--no-encoding-strict]
-            [-F INT] [-M NUM] [-B NUM] [-E NUM] [--str-limit NUM]
+            [-F INT] [-M NUM] [-B NUM] [-E NUM] [-N NUM] [--str-limit NUM]
             [--hide-count] [--show-count] [--hide-lengths] [--show-lengths]
             [--hide-pattern] [--show-pattern]
             [--hide-range] [--show-range {hidden,limits,median,quartiles,graph}]
@@ -91,6 +91,11 @@ Optional Arguments
     the pattern from being reported; the proportion of "empty" data permitted
     in a field (default: 99%)
 
+.. option:: -N NUM, --null-threshold NUM
+
+    The proportion of values permitted to be null without preventing type
+    analysis (default: 99%)
+
 .. option:: --str-limit NUM
 
     The length beyond which only the lengths of strs will be reported; below
diff --git a/docs/tutorial_basic.rst b/docs/tutorial_basic.rst
@@ -154,14 +154,15 @@ bad threshold mechanism only applies to bad data *within* a homogenous type
 (typically bad string representations of numeric or boolean types).
 
 
-Missing Data (``--empty-threshold``)
-====================================
+Missing Data (``--empty-threshold`` and ``--null-threshold``)
+=============================================================
 
-Another type of "bad" data commonly encountered is empty strings which are
-typically used to represent *missing* data, and (predictably) structa has
-another knob that can be twiddled for this: :option:`structa
---empty-threshold`. The following script generates a list of strings of
-integers in which most of the strings (~70%) are blank:
+Another type of "bad" data commonly encountered is empty strings and nulls
+which are typically used to represent *missing* data, and (predictably) structa
+has more knobs that can be twiddled for this: :option:`structa
+--empty-threshold` and :option:`structa --null-threshold`. The following script
+generates a list of strings of integers in which most of the strings (~70%) are
+blank:
 
 .. literalinclude:: examples/mostly-blank.py
    :caption: mostly-blank.py
@@ -174,11 +175,13 @@ normal:
     $ python3 mostly-blank.py | structa
     [ str of int range=0..100 pattern="d" ]
 
-This is because the default for :option:`structa --empty-threshold` is 99% or
-0.99. If the proportion of blank strings in a field exceeds the empty
-threshold, the field will simply be marked as a string without any further
-processing. Hence, when we re-run this script with the setting turned down to
-50%, the output changes:
+This is because the default for both :option:`structa --empty-threshold` and
+:option:`structa --null-threshold` is 99% or 0.99.
+
+If the proportion of blank strings in a field exceeds the empty threshold, the
+field will simply be marked as a string without any further processing. Hence,
+when we re-run this script with the setting turned down to 50%, the output
+changes:
 
 .. code-block:: console
 
@@ -191,6 +194,11 @@ processing. Hence, when we re-run this script with the setting turned down to
     "100" value, but because it's now considered a string (not a string of
     integers), "100" sorts before "99" alphabetically.
 
+Likewise, if the proportion of null values in a field exceeds the null
+threshold, the field will simply be marked as "value" (an arbitrary mix of
+types), because structa assumes there aren't enough values to accurately
+represent the type of the field.
+
 It is also worth noting that, by default, structa strips whitespace from
 strings prior to analysis. This is probably not necessary for the vast majority
 of modern datasets, but it's a reasonably safe default, and can be controlled
diff --git a/structa/analyzer.py b/structa/analyzer.py
@@ -128,6 +128,11 @@ class Analyzer:
         falling within this threshold will be discounted by the analysis. Valid
         values are between 0 and 1.
 
+    :param numbers.Rational null_threshold:
+        The proportion of values within a field (across repetitive structures)
+        which can be None (null in JSON parlance) without affecting the type
+        match. Valid values are between 0 and 1.
+
     :param int field_threshold:
         The minimum number of fields in a mapping before it will be treated as
         a "table" (a mapping of keys to records) rather than a record (a
@@ -160,12 +165,14 @@ class Analyzer:
         for further details.
     """
     def __init__(self, *, bad_threshold=Fraction(2, 100),
-                 empty_threshold=Fraction(98, 100), field_threshold=20,
+                 empty_threshold=Fraction(98, 100),
+                 null_threshold=Fraction(98, 100), field_threshold=20,
                  merge_threshold=Fraction(50, 100), max_numeric_len=30,
                  strip_whitespace=False, min_timestamp=None,
                  max_timestamp=None, progress=None):
         self.bad_threshold = bad_threshold
         self.empty_threshold = empty_threshold
+        self.null_threshold = null_threshold
         self.field_threshold = field_threshold
         self.merge_threshold = merge_threshold
         self.max_numeric_len = max_numeric_len
@@ -585,6 +592,13 @@ def _match(self, items, path, *, threshold=None, parent_card=None):
                         # dict but there're more than the field threshold
                         return Tuple(items)
 
+                # Ignore null (None) up to the null threshold; in the case
+                # the threshold is exceeded we assume there's not enough data
+                # to even determine the type
+                if None in sample:
+                    if sample[None] / len(items) > self.null_threshold:
+                        return Value(items)
+                    del sample[None]
                 # The following ordering is important; note that bool's domain
                 # is a subset of int's
                 if all(isinstance(value, bool) for value in sample):
diff --git a/structa/ui/cli.py b/structa/ui/cli.py
@@ -108,6 +108,10 @@ def get_config(args):
         help="The proportion of string values permitted to be empty without "
         "preventing the pattern from being reported; the proportion of "
         '"empty" data permitted in a field (default: %(default)s)')
+    parser.add_argument(
+        '-N', '--null-threshold', type=num, metavar='NUM', default='99%',
+        help="The proportion of values permitted to be null without "
+        "preventing type analysis (default: %(default)s)")
     parser.add_argument(
         '--str-limit', type=num, metavar='NUM', default=20,
         help="The length beyond which only the lengths of strs will be "
@@ -300,6 +304,7 @@ def from_config(cls, config, progress):
         return cls(
             bad_threshold=config.bad_threshold,
             empty_threshold=config.empty_threshold,
+            null_threshold=config.null_threshold,
             field_threshold=config.field_threshold,
             merge_threshold=config.merge_threshold,
             max_numeric_len=config.max_numeric_len,
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
@@ -190,6 +190,24 @@ def test_analyze_dict_bad_data():
         ])
 
 
+def test_analyze_too_many_nulls():
+    data = {str(i): i if i < 50 else None for i in range(100)}
+    assert Analyzer().analyze(data) == Dict(
+        sample=[data], content=[
+            DictField(
+                StrRepr(Int(Counter(range(100))), pattern='d'),
+                Int(Counter(range(50)))
+            )
+        ])
+    assert Analyzer(null_threshold=0).analyze(data) == Dict(
+        sample=[data], content=[
+            DictField(
+                StrRepr(Int(Counter(range(100))), pattern='d'),
+                Value(Counter(range(5)))
+            )
+        ])
+
+
 def test_analyze_dict_of_dicts():
     data = {n: {'foo': n, 'bar': n} for n in range(99)}
     assert Analyzer().analyze(data) == Dict(