Skip to content

Commit cd37169

Browse files
authored
Merge pull request #4698 from Liam-DeVoe/more-constant-strings
Add more constant strings
2 parents 4939dab + 72f0006 commit cd37169

8 files changed

Lines changed: 55 additions & 3 deletions

File tree

hypothesis-python/RELEASE.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
RELEASE_TYPE: patch
2+
3+
This patch fixes our |st.from_regex| type annotations so that ``from_regex(..., alphabet=None)`` is accepted.
4+
5+
This patch also adds Unicode line breaks and Thai combining vowels to our list of constant strings to upweight at runtime.

hypothesis-python/docs/prolog.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
.. |st.ip_addresses| replace:: :func:`~hypothesis.strategies.ip_addresses`
105105
.. |st.register_type_strategy| replace:: :func:`~hypothesis.strategies.register_type_strategy`
106106
.. |st.just| replace:: :func:`~hypothesis.strategies.just`
107+
.. |st.from_regex| replace:: :func:`~hypothesis.strategies.from_regex`
107108
.. |st.domains| replace:: :func:`~hypothesis.provisional.domains`
108109
.. |st.urls| replace:: :func:`~hypothesis.provisional.urls`
109110
.. |register_random| replace:: :func:`~hypothesis.register_random`

hypothesis-python/src/hypothesis/internal/charmap.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,12 @@ def query(
310310
min_codepoint = 0
311311
if max_codepoint is None:
312312
max_codepoint = sys.maxunicode
313+
314+
if min_codepoint > max_codepoint:
315+
raise InvalidArgument(
316+
f"min_codepoint={min_codepoint} is greater than max_codepoint={max_codepoint}"
317+
)
318+
313319
catkey = _category_key(categories)
314320
character_intervals = IntervalSet.from_string("".join(include_characters))
315321
exclude_intervals = IntervalSet.from_string("".join(exclude_characters))
@@ -326,12 +332,14 @@ def query(
326332
return limited_category_index_cache[qkey]
327333
except KeyError:
328334
pass
329-
base = _query_for_key(catkey)
335+
330336
result = []
331-
for u, v in base:
337+
for u, v in _query_for_key(catkey):
332338
if v >= min_codepoint and u <= max_codepoint:
333339
result.append((max(u, min_codepoint), min(v, max_codepoint)))
340+
334341
result = (IntervalSet(result) | character_intervals) - exclude_intervals
335342
if context is None or not context.data.provider.avoid_realization:
336343
limited_category_index_cache[qkey] = result
344+
337345
return result

hypothesis-python/src/hypothesis/internal/conjecture/providers.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@
230230
# Ogham text, which contains the only character in the Space Separators
231231
# unicode category (Zs) that isn't visually blank:  . # noqa: RUF003
232232
"᚛ᚄᚓᚐᚋᚒᚄ ᚑᚄᚂᚑᚏᚅ᚜",
233+
# Thai consonant + spacing vowel combinations, which have unusual visual combining behavior
234+
"กา",
235+
"ก ำกำ",
233236
# readable variations on text (bold/italic/script)
234237
"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
235238
"𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌",
@@ -251,6 +254,33 @@
251254
"मनीष منش",
252255
"पन्ह पन्ह त्र र्च कृकृ ड्ड न्हृे إلا بسم الله",
253256
"lorem لا بسم الله ipsum 你好1234你好",
257+
# Unicode characters causing unconditional line breaks, as defined by UAX #14:
258+
# https://www.unicode.org/reports/tr14/.
259+
#
260+
# We've seen multiple bugs caused by assuming `str.splitlines` is equivalent to
261+
# splitting over "\n", while it actually splits over all line breaks!
262+
#
263+
# We intersperse the line breaks with normal characters to increase the likelihood
264+
# of triggering such a bug.
265+
(
266+
"a"
267+
"\u000a" # line feed (class: LF)
268+
"b"
269+
"\u000d" # carriage return (class: CR)
270+
"c"
271+
"\u0085" # next line (class: NL)
272+
"d"
273+
"\u000b" # line tabulation (class: BK)
274+
"e"
275+
"\u000c" # form feed (class: BK)
276+
"f"
277+
"\u2028" # line separator (class: BK)
278+
"g"
279+
"\u2029" # paragraph separator (class: BK)
280+
"h"
281+
"\u000d\u000a" # CR+LF
282+
"i"
283+
),
254284
}
255285

256286

hypothesis-python/src/hypothesis/strategies/_internal/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ def from_regex(
877877
regex: str | Pattern[str],
878878
*,
879879
fullmatch: bool = False,
880-
alphabet: str | SearchStrategy[str] = characters(codec="utf-8"),
880+
alphabet: str | SearchStrategy[str] | None = characters(codec="utf-8"),
881881
) -> SearchStrategy[str]: # pragma: no cover
882882
...
883883

hypothesis-python/tests/cover/test_charmap.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pytest
1919

2020
from hypothesis import given, strategies as st
21+
from hypothesis.errors import InvalidArgument
2122
from hypothesis.internal import charmap as cm
2223
from hypothesis.internal.intervalsets import IntervalSet
2324

@@ -191,6 +192,11 @@ def test_regenerate_broken_charmap_file():
191192
cm.charmap()
192193

193194

195+
def test_query_rejects_min_codepoint_greater_than_max():
196+
with pytest.raises(InvalidArgument):
197+
cm.query(min_codepoint=1, max_codepoint=0)
198+
199+
194200
def test_exclude_characters_are_included_in_key():
195201
assert cm.query().intervals != cm.query(exclude_characters="0").intervals
196202

hypothesis-python/tests/cover/test_intervalset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424

2525
@given(intervals(max_codepoint=200))
26+
@settings(deadline=None)
2627
def test_intervals_are_equivalent_to_their_lists(intervals):
2728
ls = list(intervals)
2829
assert len(ls) == len(intervals)

whole_repo_tests/types/revealed_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
"one_of(integers(), text(), none(), binary(), builds(list), builds(dict))",
6363
"Any",
6464
),
65+
("from_regex(r'.', alphabet=None)", "str"),
6566
]
6667

6768

0 commit comments

Comments
 (0)