Skip to content

Commit 5930166

Browse files
BCSharp authored and slozier committed
Implement codecs.*replace_errors (#712)
1 parent 3df9890 commit 5930166

2 files changed

Lines changed: 113 additions & 16 deletions

File tree

Src/IronPython/Runtime/Operations/StringOps.cs

Lines changed: 110 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1589,7 +1589,8 @@ private static string ReprEncode(string s, int start, int count, bool isUniEscap
15891589
// lazily create the StringBuilder only if necessary.
15901590
StringBuilder b = null;
15911591
int i = start;
1592-
while (i < count) {
1592+
int end = start + count;
1593+
while (i < end) {
15931594
char ch = s[i++];
15941595
switch (ch) {
15951596
case '\\': StringBuilderInit(ref b, s, start, i - 1); b.Append("\\\\"); break;
@@ -1620,28 +1621,32 @@ private static string ReprEncode(string s, int start, int count, bool isUniEscap
16201621
}
16211622
}
16221623

1623-
return b?.ToString() ?? s;
1624+
return b?.ToString() ?? s.Substring(start, count);
16241625
}
16251626

1626-
/// <summary>
/// Escapes the UTF-16 code units of <paramref name="s"/> in the range
/// [<paramref name="start"/>, <paramref name="start"/> + <paramref name="count"/>).
/// A well-formed surrogate pair becomes <c>\UXXXXXXXX</c>, any other code unit above
/// 0xFF becomes <c>\uXXXX</c>, and, when <paramref name="escapeAscii"/> is true, every
/// remaining code unit becomes <c>\xXX</c>.
/// </summary>
/// <param name="s">The source string.</param>
/// <param name="start">Index of the first code unit to process.</param>
/// <param name="count">Number of code units to process.</param>
/// <param name="escapeAscii">
/// When true, code units up to 0xFF are escaped as well instead of being copied verbatim.
/// </param>
/// <returns>
/// The escaped text, or <c>s.Substring(start, count)</c> when nothing had to be escaped.
/// </returns>
private static string RawUnicodeEscapeEncode(string s, int start, int count, bool escapeAscii = false) {
    // in the common case we don't need to encode anything, so we
    // lazily create the StringBuilder only if necessary.
    StringBuilder b = null;
    int i = start;
    int end = start + count;
    while (i < end) {
        char ch = s[i++];
        // The low-surrogate lookahead must be bounded by the absolute end index, not by
        // the length of the range; otherwise pairs are split whenever start > 0.
        if ((ch & 0xFC00) == 0xD800 && i < end && (s[i] & 0xFC00) == 0xDC00) {
            // NOTE(review): StringBuilderInit is defined elsewhere; presumably it creates
            // the builder on first use and seeds it with s[start .. i - 1) - confirm.
            StringBuilderInit(ref b, s, start, i - 1);
            b.AppendFormat("\\U{0:x8}", char.ConvertToUtf32(ch, s[i++]));
        } else if (ch > 0xFF) {
            StringBuilderInit(ref b, s, start, i - 1);
            b.AppendFormat("\\u{0:x4}", (int)ch);
        } else if (escapeAscii) {
            StringBuilderInit(ref b, s, start, i - 1);
            b.AppendFormat("\\x{0:x2}", (int)ch);
        } else {
            // Nothing to escape; only copy if an earlier character forced a builder.
            b?.Append(ch);
        }
    }

    return b?.ToString() ?? s.Substring(start, count);
}
16461651

16471652
private static void StringBuilderInit(ref StringBuilder sb, string s, int start, int end) {
@@ -1953,12 +1958,20 @@ internal static Dictionary<string, object> MakeErrorHandlersDict() {
19531958
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(IgnoreErrors), BindingFlags.Static | BindingFlags.NonPublic)),
19541959
typeof(StringOps));
19551960

1956-
// TODO: Implement remaining error handlers
1957-
d["replace"] = null;
1961+
d["replace"] = BuiltinFunction.MakeFunction(
1962+
"replace_errors",
1963+
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(ReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
1964+
typeof(StringOps));
19581965

1959-
d["xmlcharrefreplace"] = null;
1966+
d["xmlcharrefreplace"] = BuiltinFunction.MakeFunction(
1967+
"xmlcharrefreplace_errors",
1968+
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(XmlCharRefReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
1969+
typeof(StringOps));
19601970

1961-
d["backslashreplace"] = null;
1971+
d["backslashreplace"] = BuiltinFunction.MakeFunction(
1972+
"backslashreplace_errors",
1973+
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(BackslashReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
1974+
typeof(StringOps));
19621975

19631976
return d;
19641977
}
@@ -2638,10 +2651,96 @@ private static object IgnoreErrors(object unicodeError) {
26382651
case PythonExceptions._UnicodeEncodeError uee:
26392652
return PythonTuple.MakeTuple(string.Empty, uee.end);
26402653
case DecoderFallbackException dfe:
2641-
return PythonTuple.MakeTuple(string.Empty, dfe.Index + dfe.BytesUnknown.Length);
2654+
return PythonTuple.MakeTuple(string.Empty, dfe.Index + dfe.BytesUnknown?.Length ?? 0);
26422655
case EncoderFallbackException efe:
26432656
return PythonTuple.MakeTuple(string.Empty, efe.Index + (efe.CharUnknownHigh != '\0' ? 2 : 1));
2644-
default: throw PythonOps.TypeError("codec must pass exception instance");
2657+
default:
2658+
throw PythonOps.TypeError("codec must pass exception instance");
2659+
}
2660+
}
2661+
2662+
/// <summary>
/// Implements the "replace" codec error handler: substitutes a replacement marker for
/// the data that could not be converted and reports where processing should resume.
/// </summary>
/// <param name="unicodeError">
/// A Python UnicodeDecodeError/UnicodeEncodeError instance or a .NET
/// <see cref="DecoderFallbackException"/>/<see cref="EncoderFallbackException"/>.
/// </param>
/// <returns>
/// A tuple of (replacement string, resume position). Decoding errors yield U+FFFD;
/// encoding errors yield question marks.
/// </returns>
/// <remarks>
/// Any other argument, or a UnicodeEncodeError whose object/start/end are not a
/// string/int/int, results in the Python TypeError "codec must pass exception instance".
/// </remarks>
private static object ReplaceErrors(object unicodeError) {
    switch (unicodeError) {
        case PythonExceptions._UnicodeDecodeError ude:
            return PythonTuple.MakeTuple("\ufffd", ude.end);

        case PythonExceptions._UnicodeEncodeError uee:
            if (uee.@object is string text && uee.start is int start && uee.end is int end) {
                // Clamp the reported range to the bounds of the text.
                start = Math.Max(0, Math.Min(start, text.Length - 1));
                end = Math.Max(start, Math.Min(end, text.Length));
                // One '?' per UTF-16 code unit in the clamped range.
                return PythonTuple.MakeTuple(new string('?', end - start), end);
            }
            goto default;

        case DecoderFallbackException dfe:
            // Parenthesized on purpose: '??' binds looser than '+', so without the
            // parentheses a null BytesUnknown would make the whole position 0.
            return PythonTuple.MakeTuple("\ufffd", dfe.Index + (dfe.BytesUnknown?.Length ?? 0));

        case EncoderFallbackException efe:
            // A surrogate pair occupies two code units; a single char occupies one.
            return PythonTuple.MakeTuple("?", efe.Index + (efe.CharUnknownHigh != '\0' ? 2 : 1));

        default:
            throw PythonOps.TypeError("codec must pass exception instance");
    }
}
2685+
2686+
/// <summary>
/// Implements the "backslashreplace" codec error handler: replaces unencodable
/// characters with backslash escape sequences produced by RawUnicodeEscapeEncode.
/// </summary>
/// <param name="unicodeError">The error object passed in by the codec.</param>
/// <returns>A tuple of (escaped replacement string, resume position).</returns>
/// <remarks>
/// Decoding errors are rejected with a Python TypeError; so is any argument that is
/// not a recognized encoding error.
/// </remarks>
private static object BackslashReplaceErrors(object unicodeError) {
    if (unicodeError is PythonExceptions._UnicodeDecodeError) {
        throw PythonOps.TypeError("don't know how to handle UnicodeDecodeError in error callback");
    }

    if (unicodeError is PythonExceptions._UnicodeEncodeError encodeError) {
        if (encodeError.@object is string source && encodeError.start is int rawStart && encodeError.end is int rawEnd) {
            // Clamp the reported range to the bounds of the source text.
            int first = Math.Max(0, Math.Min(rawStart, source.Length - 1));
            int last = Math.Max(first, Math.Min(rawEnd, source.Length));
            string escaped = RawUnicodeEscapeEncode(source, first, last - first, escapeAscii: true);
            return PythonTuple.MakeTuple(escaped, last);
        }

        // Malformed error object: same outcome as an unrecognized argument.
        throw PythonOps.TypeError("codec must pass exception instance");
    }

    if (unicodeError is DecoderFallbackException) {
        throw PythonOps.TypeError("don't know how to handle DecoderFallbackException in error callback");
    }

    if (unicodeError is EncoderFallbackException fallback) {
        // Rebuild the offending text: either a surrogate pair or a single char.
        string unencodable;
        if (fallback.CharUnknownHigh != '\0') {
            unencodable = new string(new[] { fallback.CharUnknownHigh, fallback.CharUnknownLow });
        } else {
            unencodable = new string(fallback.CharUnknown, 1);
        }

        string replacement = RawUnicodeEscapeEncode(unencodable, 0, unencodable.Length, escapeAscii: true);
        return PythonTuple.MakeTuple(replacement, fallback.Index + unencodable.Length);
    }

    throw PythonOps.TypeError("codec must pass exception instance");
}
2710+
/// <summary>
/// Implements the "xmlcharrefreplace" codec error handler: replaces each unencodable
/// character with a decimal XML character reference of the form <c>&amp;#NNN;</c>.
/// </summary>
/// <param name="unicodeError">The error object passed in by the codec.</param>
/// <returns>A tuple of (character references, resume position).</returns>
/// <remarks>
/// Decoding errors are rejected with a Python TypeError; so is any argument that is
/// not a recognized encoding error.
/// </remarks>
private static object XmlCharRefReplaceErrors(object unicodeError) {
    switch (unicodeError) {
        case PythonExceptions._UnicodeDecodeError ude:
            throw PythonOps.TypeError("don't know how to handle UnicodeDecodeError in error callback");

        case PythonExceptions._UnicodeEncodeError uee:
            if (uee.@object is string text && uee.start is int start && uee.end is int end) {
                // Clamp the reported range to the bounds of the text.
                start = Math.Max(0, Math.Min(start, text.Length - 1));
                end = Math.Max(start, Math.Min(end, text.Length));
                var sb = new StringBuilder(10 * (end - start));
                int i = start;
                while (i < end) {
                    sb.Append("&#");
                    char ch = text[i++];
                    if (char.IsHighSurrogate(ch) && i < end && char.IsLowSurrogate(text[i])) {
                        // A well-formed pair is emitted as a single code point.
                        sb.Append(char.ConvertToUtf32(ch, text[i++]));
                    } else {
                        sb.Append((uint)ch);
                    }
                    sb.Append(';');
                }
                return PythonTuple.MakeTuple(sb.ToString(), end);
            }
            goto default;

        case DecoderFallbackException dfe:
            throw PythonOps.TypeError("don't know how to handle DecoderFallbackException in error callback");

        case EncoderFallbackException efe:
            bool isPair = efe.CharUnknownHigh != '\0';
            int codePoint = isPair ? char.ConvertToUtf32(efe.CharUnknownHigh, efe.CharUnknownLow) : efe.CharUnknown;
            // Both forms need the closing ';' to be a valid character reference.
            return PythonTuple.MakeTuple($"&#{codePoint};", efe.Index + (isPair ? 2 : 1));

        default:
            throw PythonOps.TypeError("codec must pass exception instance");
    }
}
26472746
#endif

Tests/modules/io_related/test_codecs.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ def test_latin_1_encode(self):
460460
def test_error_handlers(self):
461461
ude = UnicodeDecodeError('dummy', b"abcdefgh", 3, 5, "decoding testing purposes")
462462
uee = UnicodeEncodeError('dummy', "abcdefgh", 2, 6, "encoding testing purposes")
463-
unicode_data = "ab\xff\u20ac\U0001f40d\0z"
463+
unicode_data = "ab\xff\u20ac\U0001f40d\0\t\r\nz"
464464
uee_unicode = UnicodeEncodeError('dummy', unicode_data, 2, len(unicode_data), "encoding testing purposes")
465465

466466
strict = codecs.lookup_error('strict')
@@ -482,8 +482,6 @@ def test_error_handlers(self):
482482
self.assertEqual(ignore(uee), ("", 6))
483483
self.assertEqual(ignore(uee_unicode), ("", uee_unicode.end))
484484

485-
return # TODO: Implement remaining error handlers
486-
487485
replace = codecs.lookup_error('replace')
488486
self.assertEqual(replace, codecs.replace_errors)
489487
self.assertEqual(replace(ude), ("�", 5))
@@ -494,13 +492,13 @@ def test_error_handlers(self):
494492
self.assertEqual(backslashreplace, codecs.backslashreplace_errors)
495493
self.assertRaisesRegex(TypeError, "don't know how to handle UnicodeDecodeError in error callback", backslashreplace, ude)
496494
self.assertEqual(backslashreplace(uee), (r"\x63\x64\x65\x66", 6))
497-
self.assertEqual(backslashreplace(uee_unicode), (r"\xff\u20ac\U0001f40d\x00\x7a", uee_unicode.end))
495+
self.assertEqual(backslashreplace(uee_unicode), (r"\xff\u20ac\U0001f40d\x00\x09\x0d\x0a\x7a", uee_unicode.end))
498496

499497
xmlcharrefreplace = codecs.lookup_error('xmlcharrefreplace')
500498
self.assertEqual(xmlcharrefreplace, codecs.xmlcharrefreplace_errors)
501499
self.assertRaisesRegex(TypeError, "don't know how to handle UnicodeDecodeError in error callback", xmlcharrefreplace, ude)
502500
self.assertEqual(xmlcharrefreplace(uee), ("&#99;&#100;&#101;&#102;", 6))
503-
self.assertEqual(xmlcharrefreplace(uee_unicode), ("&#255;&#8364;&#128013;&#0;&#122;", uee_unicode.end))
501+
self.assertEqual(xmlcharrefreplace(uee_unicode), ("&#255;&#8364;&#128013;&#0;&#9;&#13;&#10;&#122;", uee_unicode.end))
504502

505503
#TODO: @skip("multiple_execute")
506504
def test_lookup_error(self):

0 commit comments

Comments
 (0)