Skip to content

Commit 68db73c

Browse files
committed
Merge branch 'null-support'
2 parents 3e4c168 + 959af68 commit 68db73c

5 files changed

Lines changed: 64 additions & 14 deletions

File tree

docs/manual.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Synopsis
1515
1616
structa [-h] [--version] [-f {auto,csv,json,yaml}] [-e ENCODING]
1717
[--encoding-strict] [--no-encoding-strict]
18-
[-F INT] [-M NUM] [-B NUM] [-E NUM] [--str-limit NUM]
18+
[-F INT] [-M NUM] [-B NUM] [-E NUM] [-N NUM] [--str-limit NUM]
1919
[--hide-count] [--show-count] [--hide-lengths] [--show-lengths]
2020
[--hide-pattern] [--show-pattern]
2121
[--hide-range] [--show-range {hidden,limits,median,quartiles,graph}]
@@ -91,6 +91,11 @@ Optional Arguments
9191
the pattern from being reported; the proportion of "empty" data permitted
9292
in a field (default: 99%)
9393

94+
.. option:: -N NUM, --null-threshold NUM
95+
96+
The proportion of values permitted to be null without preventing type
97+
analysis (default: 99%)
98+
9499
.. option:: --str-limit NUM
95100

96101
The length beyond which only the lengths of strs will be reported; below

docs/tutorial_basic.rst

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,15 @@ bad threshold mechanism only applies to bad data *within* a homogenous type
154154
(typically bad string representations of numeric or boolean types).
155155

156156

157-
Missing Data (``--empty-threshold``)
158-
====================================
157+
Missing Data (``--empty-threshold`` and ``--null-threshold``)
158+
=============================================================
159159

160-
Another type of "bad" data commonly encountered is empty strings which are
161-
typically used to represent *missing* data, and (predictably) structa has
162-
another knob that can be twiddled for this: :option:`structa
163-
--empty-threshold`. The following script generates a list of strings of
164-
integers in which most of the strings (~70%) are blank:
160+
Another type of "bad" data commonly encountered is empty strings and nulls
161+
which are typically used to represent *missing* data, and (predictably) structa
162+
has more knobs that can be twiddled for this: :option:`structa
163+
--empty-threshold` and :option:`structa --null-threshold`. The following script
164+
generates a list of strings of integers in which most of the strings (~70%) are
165+
blank:
165166

166167
.. literalinclude:: examples/mostly-blank.py
167168
:caption: mostly-blank.py
@@ -174,11 +175,13 @@ normal:
174175
$ python3 mostly-blank.py | structa
175176
[ str of int range=0..100 pattern="d" ]
176177
177-
This is because the default for :option:`structa --empty-threshold` is 99% or
178-
0.99. If the proportion of blank strings in a field exceeds the empty
179-
threshold, the field will simply be marked as a string without any further
180-
processing. Hence, when we re-run this script with the setting turned down to
181-
50%, the output changes:
178+
This is because the default for both :option:`structa --empty-threshold` and
179+
:option:`structa --null-threshold` is 99% or 0.99.
180+
181+
If the proportion of blank strings in a field exceeds the empty threshold, the
182+
field will simply be marked as a string without any further processing. Hence,
183+
when we re-run this script with the setting turned down to 50%, the output
184+
changes:
182185

183186
.. code-block:: console
184187
@@ -191,6 +194,11 @@ processing. Hence, when we re-run this script with the setting turned down to
191194
"100" value, but because it's now considered a string (not a string of
192195
integers), "100" sorts before "99" alphabetically.
193196

197+
Likewise, if the proportion of null values in a field exceeds the null
198+
threshold, the field will simply be marked as "value" (an arbitrary mix of
199+
types), because structa assumes there aren't enough values to accurately
200+
represent the type of the field.
201+
194202
It is also worth noting that, by default, structa strips whitespace from
195203
strings prior to analysis. This is probably not necessary for the vast majority
196204
of modern datasets, but it's a reasonably safe default, and can be controlled

structa/analyzer.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,11 @@ class Analyzer:
128128
falling within this threshold will be discounted by the analysis. Valid
129129
values are between 0 and 1.
130130
131+
:param numbers.Rational null_threshold:
132+
The proportion of values within a field (across repetitive structures)
133+
which can be None (null in JSON parlance) without affecting the type
134+
match. Valid values are between 0 and 1.
135+
131136
:param int field_threshold:
132137
The minimum number of fields in a mapping before it will be treated as
133138
a "table" (a mapping of keys to records) rather than a record (a
@@ -160,12 +165,14 @@ class Analyzer:
160165
for further details.
161166
"""
162167
def __init__(self, *, bad_threshold=Fraction(2, 100),
163-
empty_threshold=Fraction(98, 100), field_threshold=20,
168+
empty_threshold=Fraction(98, 100),
169+
null_threshold=Fraction(98, 100), field_threshold=20,
164170
merge_threshold=Fraction(50, 100), max_numeric_len=30,
165171
strip_whitespace=False, min_timestamp=None,
166172
max_timestamp=None, progress=None):
167173
self.bad_threshold = bad_threshold
168174
self.empty_threshold = empty_threshold
175+
self.null_threshold = null_threshold
169176
self.field_threshold = field_threshold
170177
self.merge_threshold = merge_threshold
171178
self.max_numeric_len = max_numeric_len
@@ -585,6 +592,13 @@ def _match(self, items, path, *, threshold=None, parent_card=None):
585592
# dict but there're more than the field threshold
586593
return Tuple(items)
587594

595+
# Ignore null (None) up to the null threshold; in the case
596+
# the threshold is exceeded we assume there's not enough data
597+
# to even determine the type
598+
if None in sample:
599+
if sample[None] / len(items) > self.null_threshold:
600+
return Value(items)
601+
del sample[None]
588602
# The following ordering is important; note that bool's domain
589603
# is a subset of int's
590604
if all(isinstance(value, bool) for value in sample):

structa/ui/cli.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ def get_config(args):
108108
help="The proportion of string values permitted to be empty without "
109109
"preventing the pattern from being reported; the proportion of "
110110
'"empty" data permitted in a field (default: %(default)s)')
111+
parser.add_argument(
112+
'-N', '--null-threshold', type=num, metavar='NUM', default='99%',
113+
help="The proportion of values permitted to be null without "
114+
"preventing type analysis (default: %(default)s)")
111115
parser.add_argument(
112116
'--str-limit', type=num, metavar='NUM', default=20,
113117
help="The length beyond which only the lengths of strs will be "
@@ -300,6 +304,7 @@ def from_config(cls, config, progress):
300304
return cls(
301305
bad_threshold=config.bad_threshold,
302306
empty_threshold=config.empty_threshold,
307+
null_threshold=config.null_threshold,
303308
field_threshold=config.field_threshold,
304309
merge_threshold=config.merge_threshold,
305310
max_numeric_len=config.max_numeric_len,

tests/test_analyzer.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,24 @@ def test_analyze_dict_bad_data():
190190
])
191191

192192

193+
def test_analyze_too_many_nulls():
194+
data = {str(i): i if i < 50 else None for i in range(100)}
195+
assert Analyzer().analyze(data) == Dict(
196+
sample=[data], content=[
197+
DictField(
198+
StrRepr(Int(Counter(range(100))), pattern='d'),
199+
Int(Counter(range(50)))
200+
)
201+
])
202+
assert Analyzer(null_threshold=0).analyze(data) == Dict(
203+
sample=[data], content=[
204+
DictField(
205+
StrRepr(Int(Counter(range(100))), pattern='d'),
206+
Value(Counter(range(5)))
207+
)
208+
])
209+
210+
193211
def test_analyze_dict_of_dicts():
194212
data = {n: {'foo': n, 'bar': n} for n in range(99)}
195213
assert Analyzer().analyze(data) == Dict(

0 commit comments

Comments
 (0)