Skip to content

Commit 83a7cdf

Browse files
BCSharp and slozier
authored and committed
Surrogateescape errors (#713)
* Implement codecs.surrogateescape_errors
* Fix PythonSurrogateEscapeEncoding
* Fix PythonSurrogateEscapeEncoding, encoding side
1 parent 5930166 commit 83a7cdf

4 files changed

Lines changed: 181 additions & 60 deletions

File tree

Src/IronPython/Runtime/Operations/StringOps.cs

Lines changed: 91 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1973,6 +1973,11 @@ internal static Dictionary<string, object> MakeErrorHandlersDict() {
19731973
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(BackslashReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
19741974
typeof(StringOps));
19751975

1976+
d["surrogateescape"] = BuiltinFunction.MakeFunction(
1977+
"surrogateescape_errors",
1978+
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(SurrogateEscapeErrors), BindingFlags.Static | BindingFlags.NonPublic)),
1979+
typeof(StringOps));
1980+
19761981
return d;
19771982
}
19781983
}
@@ -2683,7 +2688,7 @@ private static object ReplaceErrors(object unicodeError) {
26832688
}
26842689
}
26852690

2686-
private static object BackslashReplaceErrors(object unicodeError) {
2691+
private static object XmlCharRefReplaceErrors(object unicodeError) {
26872692
switch (unicodeError) {
26882693
case PythonExceptions._UnicodeDecodeError ude:
26892694
throw PythonOps.TypeError("don't know how to handle UnicodeDecodeError in error callback");
@@ -2692,22 +2697,35 @@ private static object BackslashReplaceErrors(object unicodeError) {
26922697
if (uee.@object is string text && uee.start is int start && uee.end is int end) {
26932698
start = Math.Max(0, Math.Min(start, text.Length - 1));
26942699
end = Math.Max(start, Math.Min(end, text.Length));
2695-
return PythonTuple.MakeTuple(RawUnicodeEscapeEncode(text, start, end - start, escapeAscii: true), end);
2700+
var sb = new StringBuilder(10 * (end - start));
2701+
int i = start;
2702+
while (i < end) {
2703+
sb.Append("&#");
2704+
char ch = text[i++];
2705+
if (char.IsHighSurrogate(ch) && i < end && char.IsLowSurrogate(text[i])) {
2706+
sb.Append(char.ConvertToUtf32(ch, text[i++]));
2707+
} else {
2708+
sb.Append((uint)ch);
2709+
}
2710+
sb.Append(';');
2711+
}
2712+
return PythonTuple.MakeTuple(sb.ToString(), end);
26962713
}
26972714
goto default;
26982715

26992716
case DecoderFallbackException dfe:
27002717
throw PythonOps.TypeError("don't know how to handle DecoderFallbackException in error callback");
27012718

27022719
case EncoderFallbackException efe:
2703-
string chars = (efe.CharUnknownHigh != '\0') ? new string(new[] { efe.CharUnknownHigh, efe.CharUnknownLow }) : new string(efe.CharUnknown, 1);
2704-
return PythonTuple.MakeTuple(RawUnicodeEscapeEncode(chars, 0, chars.Length, escapeAscii: true), efe.Index + chars.Length);
2720+
string chars = (efe.CharUnknownHigh != '\0') ? $"&#{char.ConvertToUtf32(efe.CharUnknownHigh, efe.CharUnknownLow)}" : $"&#{(int)efe.CharUnknown};";
2721+
return PythonTuple.MakeTuple(chars, efe.Index + (efe.CharUnknownHigh != '\0' ? 2 : 1));
27052722

27062723
default:
27072724
throw PythonOps.TypeError("codec must pass exception instance");
27082725
}
27092726
}
2710-
private static object XmlCharRefReplaceErrors(object unicodeError) {
2727+
2728+
private static object BackslashReplaceErrors(object unicodeError) {
27112729
switch (unicodeError) {
27122730
case PythonExceptions._UnicodeDecodeError ude:
27132731
throw PythonOps.TypeError("don't know how to handle UnicodeDecodeError in error callback");
@@ -2716,32 +2734,85 @@ private static object XmlCharRefReplaceErrors(object unicodeError) {
27162734
if (uee.@object is string text && uee.start is int start && uee.end is int end) {
27172735
start = Math.Max(0, Math.Min(start, text.Length - 1));
27182736
end = Math.Max(start, Math.Min(end, text.Length));
2719-
var sb = new StringBuilder(10 * (end - start));
2720-
int i = start;
2721-
while (i < end) {
2722-
sb.Append("&#");
2723-
char ch = text[i++];
2724-
if (char.IsHighSurrogate(ch) && i < end && char.IsLowSurrogate(text[i])) {
2725-
sb.Append(char.ConvertToUtf32(ch, text[i++]));
2726-
} else {
2727-
sb.Append((uint)ch);
2728-
}
2729-
sb.Append(';');
2730-
}
2731-
return PythonTuple.MakeTuple(sb.ToString(), end);
2737+
return PythonTuple.MakeTuple(RawUnicodeEscapeEncode(text, start, end - start, escapeAscii: true), end);
27322738
}
27332739
goto default;
27342740

27352741
case DecoderFallbackException dfe:
27362742
throw PythonOps.TypeError("don't know how to handle DecoderFallbackException in error callback");
27372743

27382744
case EncoderFallbackException efe:
2739-
string chars = (efe.CharUnknownHigh != '\0') ? $"&#{char.ConvertToUtf32(efe.CharUnknownHigh, efe.CharUnknownLow)}" : $"&#{(int)efe.CharUnknown};";
2740-
return PythonTuple.MakeTuple(chars, efe.Index + (efe.CharUnknownHigh != '\0' ? 2 : 1));
2745+
string chars = (efe.CharUnknownHigh != '\0') ? new string(new[] { efe.CharUnknownHigh, efe.CharUnknownLow }) : new string(efe.CharUnknown, 1);
2746+
return PythonTuple.MakeTuple(RawUnicodeEscapeEncode(chars, 0, chars.Length, escapeAscii: true), efe.Index + chars.Length);
2747+
2748+
default:
2749+
throw PythonOps.TypeError("codec must pass exception instance");
2750+
}
2751+
}
2752+
2753+
/// <summary>
/// Error-handler callback registered under the name "surrogateescape" (PEP 383).
/// When decoding, each undecodable byte in 0x80..0xFF is mapped to the lone
/// surrogate U+DC80..U+DCFF; when encoding, those surrogates are mapped back
/// to the original bytes.
/// </summary>
/// <param name="unicodeError">
/// A Python UnicodeDecodeError/UnicodeEncodeError instance, or a CLR
/// <see cref="DecoderFallbackException"/>/<see cref="EncoderFallbackException"/>.
/// </param>
/// <returns>
/// A tuple of (replacement, position): the replacement is a string when
/// decoding and a Bytes object when encoding.
/// </returns>
/// <remarks>
/// If the offending data cannot be escaped, the error that was passed in is
/// re-raised. Any other argument type results in a TypeError.
/// </remarks>
private static object SurrogateEscapeErrors(object unicodeError) {
    switch (unicodeError) {
        case PythonExceptions._UnicodeDecodeError ude:
            if (ude.@object is IList<byte> bytes && ude.start is int bstart && ude.end is int bend) {
                // Clamp the reported range to the bounds of the input.
                bstart = Math.Max(0, Math.Min(bstart, bytes.Count - 1));
                bend = Math.Max(bstart, Math.Min(bend, bytes.Count));
                string res = surrogateEscapeDecode(bytes, bstart, bend);
                if (res == null) throw ude.GetClrException();
                // Only the escaped prefix was consumed; resume right after it.
                return PythonTuple.MakeTuple(res, bstart + res.Length);
            }
            goto default;

        case PythonExceptions._UnicodeEncodeError uee:
            if (uee.@object is string text && uee.start is int tstart && uee.end is int tend) {
                // Clamp the reported range to the bounds of the input.
                tstart = Math.Max(0, Math.Min(tstart, text.Length - 1));
                tend = Math.Max(tstart, Math.Min(tend, text.Length));
                Bytes res = surrogateEscapeEncode(text, tstart, tend);
                if (res == null) throw uee.GetClrException();
                return PythonTuple.MakeTuple(res, tend);
            }
            goto default;

        case DecoderFallbackException dfe: {
            if (dfe.BytesUnknown == null) throw dfe;
            string res = surrogateEscapeDecode(dfe.BytesUnknown, 0, dfe.BytesUnknown.Length);
            if (res == null) throw dfe;
            // NOTE(review): the position is relative to BytesUnknown and does not add
            // dfe.Index, unlike the encoder case below - confirm this is what callers expect.
            return PythonTuple.MakeTuple(res, res.Length);
        }

        case EncoderFallbackException efe: {
            string chars = new string(efe.CharUnknown, 1);
            Bytes res = surrogateEscapeEncode(chars, 0, chars.Length);
            // A character that is not an escaped byte cannot be handled here; re-raise
            // instead of returning a tuple that holds null.
            if (res == null) throw efe;
            return PythonTuple.MakeTuple(res, efe.Index + chars.Length);
        }

        default:
            throw PythonOps.TypeError("codec must pass exception instance");
    }

    // Escapes the leading run of bytes >= 0x80 in [start, end) as lone surrogates
    // (0xDC00 | byte), stopping at the first byte below 0x80.
    // Returns null when no byte could be escaped, so that the caller re-raises
    // rather than reporting success without making progress.
    static string surrogateEscapeDecode(IList<byte> bytes, int start, int end) {
        var sb = new StringBuilder(end - start);
        for (int i = start; i < end; i++) {
            byte b = bytes[i];
            if (b < 0x80) break;
            sb.Append((char)(b | 0xDC00));
        }
        return sb.Length > 0 ? sb.ToString() : null;
    }

    // Converts every character in [start, end) back to the byte it escapes.
    // Returns null if any character lies outside U+DC80..U+DCFF: other low
    // surrogates (U+DC00..U+DC7F, U+DD00..U+DFFF) do not represent escaped bytes.
    static Bytes surrogateEscapeEncode(string text, int start, int end) {
        var lst = new List<byte>(end - start);
        for (int i = start; i < end; i++) {
            char c = text[i];
            if (c < 0xDC80 || c > 0xDCFF) return null;
            lst.Add((byte)(c & 0xFF));
        }
        return new Bytes(lst);
    }
}
27462817
#endif
27472818

Src/IronPython/Runtime/PythonEncoding.cs

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -546,8 +546,16 @@ public override byte[] GetFallbackBytes(char charUnknown, int index) {
546546
index
547547
);
548548
}
549+
byte b = (byte)(charUnknown & 0xff);
550+
if (b < 128) {
551+
throw PythonOps.UnicodeEncodeError(
552+
"'surrogateescape' error handler: bytes below 128 cannot be smuggled (PEP 383)",
553+
charUnknown,
554+
index
555+
);
556+
}
549557

550-
return new[] { (byte)(charUnknown & 0xff) };
558+
return new[] { b };
551559
}
552560

553561
}
@@ -570,18 +578,13 @@ public override char[] GetFallbackChars(byte[] bytesUnknown, int index) {
570578
char[] fallbackChars = new char[charNum];
571579

572580
for (int i = 0; i < charNum; i++) {
573-
if (this.EncodingCharWidth == 1) {
574-
// test for value below 128
575-
if (bytesUnknown[i] < 128u) {
576-
throw new DecoderFallbackException(
577-
$"values below 128 cannot be smuggled (PEP 383)",
578-
bytesUnknown,
579-
index
580-
);
581-
}
581+
if (bytesUnknown[i] < 128) {
582+
throw new DecoderFallbackException(
583+
"'surrogateescape' error handler: bytes below 128 cannot be smuggled (PEP 383)",
584+
bytesUnknown,
585+
index
586+
);
582587
}
583-
// no test for "else" case because all supported wide char encodings (UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE)
584-
// will never fall back for values under 128
585588

586589
fallbackChars[i] = (char)(bytesUnknown[i] | LoneSurrogateBase);
587590
}

Src/IronPythonTest/EncodingTest.cs

Lines changed: 15 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -29,10 +29,6 @@ public void SetUp() {
2929
[Test] public void Test256WithPythonAscii() => TestRoundTrip(PythonAsciiEncoding.Instance, _bytes);
3030
[Test] public void Test256WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes);
3131
[Test] public void Test256WithDefault() => TestRoundTrip(Encoding.Default, _bytes);
32-
[Test] public void Test256WithUnicode() => TestRoundTrip(Encoding.Unicode, _bytes);
33-
[Test] public void Test256WithBigEndianUnicode() => TestRoundTrip(Encoding.BigEndianUnicode, _bytes);
34-
[Test] public void Test256WithUtf32() => TestRoundTrip(Encoding.UTF32, _bytes);
35-
[Test] public void Test256WithUtf32BE() => TestRoundTrip(new UTF32Encoding(bigEndian: true, byteOrderMark: false), _bytes);
3632
}
3733

3834
// Test decoding/encoding a valid UTF-8 sequence
@@ -42,18 +38,14 @@ public class Utf8Test {
4238

4339
[SetUp]
4440
public void SetUp() {
45-
// 12 bytes, rounded to multiply of 4 for the sake of UTF-32 test
41+
// 12 bytes of: Питон!!
4642
_bytes = "\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd!!".AsBytes();
4743
}
4844

4945
[Test] public void TestValidUtf8WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes);
5046
[Test] public void TestValidUtf8WithPythonAscii() => TestRoundTrip(PythonAsciiEncoding.Instance, _bytes);
5147
[Test] public void TestValidUtf8WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes);
5248
[Test] public void TestValidUtf8WithDefault() => TestRoundTrip(Encoding.Default, _bytes);
53-
[Test] public void TestValidUtf8WithUnicode() => TestRoundTrip(Encoding.Unicode, _bytes);
54-
[Test] public void TestValidUtf8WithBigEndianUnicode() => TestRoundTrip(Encoding.BigEndianUnicode, _bytes);
55-
[Test] public void TestValidUtf8WithUtf32() => TestRoundTrip(Encoding.UTF32, _bytes);
56-
[Test] public void TestValidUtf8WithUtf32BE() => TestRoundTrip(new UTF32Encoding(bigEndian: true, byteOrderMark: false), _bytes);
5749
}
5850

5951
// Test decoding/encoding an invalid UTF-8 sequence
@@ -66,20 +58,16 @@ public void SetUp() {
6658
// 12 bytes: two valid UTF-8 2-byte chars, one non-decodable byte,
6759
// one UTF-8 2-byte char with a non-decodable byte inserted in between the UTF-8 bytes
6860
// and final valid UTF-8 2-byte char
69-
_bytes = "\xd0\x9f\xd0\xb8\x80\xd1\x20\x82\xd0\xbe\xd0\xbd".AsBytes();
61+
_bytes = "\xd0\x9f\xd0\xb8\x80\xd1\xff\x82\xd0\xbe\xd0\xbd".AsBytes();
7062
}
7163

7264
[Test] public void TestBrokenUtf8WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes);
7365
[Test] public void TestBrokenUtf8WithPythonAscii() => TestRoundTrip(PythonAsciiEncoding.Instance, _bytes);
7466
[Test] public void TestBrokenUtf8WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes);
7567
[Test] public void TestBrokenUtf8WithDefault() => TestRoundTrip(Encoding.Default, _bytes);
76-
[Test] public void TestBrokenUtf8WithUnicode() => TestRoundTrip(Encoding.Unicode, _bytes);
77-
[Test] public void TestBrokenUtf8WithBigEndianUnicode() => TestRoundTrip(Encoding.BigEndianUnicode, _bytes);
78-
[Test] public void TestBrokenUtf8WithUtf32() => TestRoundTrip(Encoding.UTF32, _bytes);
79-
[Test] public void TestBrokenUtf8WithUtf32BE() => TestRoundTrip(new UTF32Encoding(bigEndian: true, byteOrderMark: false), _bytes);
8068
}
8169

82-
// Note: UTF-7 is not round-trip safe in general
70+
// Note: UTF-7, UTF-16, and UTF-32 are not round-trip safe in general
8371
private static void TestRoundTrip(Encoding enc, byte[] bytes) {
8472

8573
Encoding penc = new PythonSurrogateEscapeEncoding(enc);
@@ -344,33 +332,34 @@ public void TestEndiannessWithtUtf16BE() {
344332
public void TestEndiannessWithtUtf32LE() {
345333
Encoding penc = new PythonSurrogateEscapeEncoding(new UTF32Encoding(bigEndian: false, byteOrderMark: false));
346334
Assert.AreEqual("\u000a", penc.GetChars(_bytes1));
347-
Assert.AreEqual("\udc00\udc00\udc00\udc0a", penc.GetChars(_bytes2));
335+
Assert.Throws<DecoderFallbackException>(() => penc.GetChars(_bytes2));
348336
}
349337

350338
[Test]
351339
public void TestEndiannessWithtUtf32BE() {
352340
Encoding penc = new PythonSurrogateEscapeEncoding(new UTF32Encoding(bigEndian: true, byteOrderMark: false));
353-
Assert.AreEqual("\udc0a\udc00\udc00\udc00", penc.GetChars(_bytes1));
341+
Assert.Throws<DecoderFallbackException>(() => penc.GetChars(_bytes1));
354342
Assert.AreEqual("\u000a", penc.GetChars(_bytes2));
355343
}
356344
}
357345

358-
public class IncompleteSequenceTests {
346+
public class AsciiByteTests {
359347

360348
private char[] _chars;
361349

362350
[SetUp]
363351
public void SetUp() {
364-
// one surrogate escape is not enough for wide-char encodings
352+
// surrogate escape carrying byte < 128 is not allowed
365353
_chars = "+++\udc41++".ToCharArray();
366354
}
367355

368-
[Test] public void TestIncompleteSequenceWithtUtf16LE() => TestIncompleteSequence(Encoding.Unicode, 2);
369-
[Test] public void TestIncompleteSequenceWithtUtf16BE() => TestIncompleteSequence(Encoding.BigEndianUnicode, 2);
370-
[Test] public void TestIncompleteSequenceWithtUtf32LE() => TestIncompleteSequence(new UTF32Encoding(bigEndian: false, byteOrderMark: false), 4);
371-
[Test] public void TestIncompleteSequenceWithtUtf32BE() => TestIncompleteSequence(new UTF32Encoding(bigEndian: true, byteOrderMark: false), 4);
356+
[Test] public void TestAsciiByteWithtUtf8() => TestAsciiByte(Encoding.UTF8, 1);
357+
[Test] public void TestAsciiByteWithtUtf16LE() => TestAsciiByte(Encoding.Unicode, 2);
358+
[Test] public void TestAsciiByteWithtUtf16BE() => TestAsciiByte(Encoding.BigEndianUnicode, 2);
359+
[Test] public void TestAsciiByteWithtUtf32LE() => TestAsciiByte(new UTF32Encoding(bigEndian: false, byteOrderMark: false), 4);
360+
[Test] public void TestAsciiByteWithtUtf32BE() => TestAsciiByte(new UTF32Encoding(bigEndian: true, byteOrderMark: false), 4);
372361

373-
public void TestIncompleteSequence(Encoding codec, int charWidth) {
362+
public void TestAsciiByte(Encoding codec, int charWidth) {
374363
Encoding penc = new PythonSurrogateEscapeEncoding(codec);
375364

376365
Assert.That(() => penc.GetBytes(_chars),
@@ -390,11 +379,9 @@ public void TestIncompleteSequence(Encoding codec, int charWidth) {
390379

391380
enc.Reset();
392381

393-
Assert.That(enc.GetByteCount(_chars, 0, 4, flush: false), Is.EqualTo(3 * charWidth));
394-
Assert.That(() => enc.GetBytes(_chars, 0, 4, bytes, 0, flush: false), Throws.Nothing);
395-
Assert.That(() => enc.GetByteCount(_chars, 4, 1, flush: false),
382+
Assert.That(() => enc.GetBytes(_chars, 0, 5, bytes, 3 * charWidth, flush: false),
396383
Throws.TypeOf<EncoderFallbackException>()
397-
.With.Property("Index").EqualTo(-1) // last char from previous increment
384+
.With.Property("Index").EqualTo(3)
398385
.And.Property("CharUnknown").EqualTo(_chars[3]));
399386
}
400387
}

0 commit comments

Comments
 (0)