Skip to content

Commit 28ede20

Browse files
BCSharp authored and slozier committed
Fix codecs.escape_decode (#690)
* Fix escape_decode
* Error handlers in _codecs.escape_decode
* Cleanup
* Execute string switch only once
* Update after review
1 parent 3dc1fd3 commit 28ede20

5 files changed

Lines changed: 96 additions & 83 deletions

File tree

Src/IronPython.Modules/_codecs.cs

Lines changed: 20 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -172,67 +172,31 @@ public static object encode(CodeContext/*!*/ context, object obj, string encodin
172172
#region Escape Encoding
173173

174174
public static PythonTuple escape_decode(CodeContext/*!*/ context, string data, string errors = "strict")
175-
=> escape_decode(DoEncode(context, "utf-8", Encoding.UTF8, data, "strict").Item1, errors);
175+
=> escape_decode(StringOps.DoEncodeUtf8(context, data), errors);
176176

177177
public static PythonTuple escape_decode([BytesConversion]IList<byte> data, string errors = "strict") {
178-
var res = new StringBuilder();
179-
for (int i = 0; i < data.Count; i++) {
180-
if (data[i] == '\\') {
181-
if (i == data.Count - 1) throw PythonOps.ValueError("\\ at end of string");
182-
183-
switch ((char)data[++i]) {
184-
case 'a': res.Append((char)0x07); break;
185-
case 'b': res.Append((char)0x08); break;
186-
case 't': res.Append('\t'); break;
187-
case 'n': res.Append('\n'); break;
188-
case 'r': res.Append('\r'); break;
189-
case '\\': res.Append('\\'); break;
190-
case 'f': res.Append((char)0x0c); break;
191-
case 'v': res.Append((char)0x0b); break;
192-
case '\n': break;
193-
case 'x':
194-
if (++i < data.Count && CharToInt((char)data[i], out int dig1)
195-
&& ++i < data.Count && CharToInt((char)data[i], out int dig2)) {
196-
res.Append((char)(dig1 * 16 + dig2));
197-
} else {
198-
switch (errors) {
199-
case "strict":
200-
throw PythonOps.ValueError("invalid \\x escape at position {0}", i);
201-
case "replace":
202-
res.Append("?");
203-
i--;
204-
break;
205-
default:
206-
throw PythonOps.ValueError("decoding error; unknown error handling code: " + errors);
207-
}
208-
}
209-
break;
210-
default:
211-
res.Append("\\" + (char)data[i]);
212-
break;
213-
}
214-
} else {
215-
res.Append((char)data[i]);
178+
var res = LiteralParser.ParseBytes(data, 0, data.Count, isRaw: false, normalizeLineEndings: false, getErrorHandler(errors));
179+
180+
return PythonTuple.MakeTuple(Bytes.Make(res.ToArray()), data.Count);
181+
182+
LiteralParser.ParseBytesErrorHandler<byte> getErrorHandler(string errors) {
183+
if (errors == null) return null;
184+
Func<int, IReadOnlyList<byte>> eh = null;
185+
return errorHandler;
186+
187+
IReadOnlyList<byte> errorHandler(IList<byte> data, int start, int end) {
188+
eh ??= errors switch
189+
{
190+
"strict" => idx => throw PythonOps.ValueError("invalid \\x escape at position {0}", idx),
191+
"replace" => idx => _replacementMarker ??= new[] { (byte)'?' },
192+
"ignore" => idx => null,
193+
_ => idx => throw PythonOps.ValueError("decoding error; unknown error handling code: " + errors),
194+
};
195+
return eh(start);
216196
}
217-
218-
}
219-
return PythonTuple.MakeTuple(Bytes.Make(res.ToString().MakeByteArray()), data.Count);
220-
}
221-
222-
private static bool CharToInt(char ch, out int val) {
223-
if (char.IsDigit(ch)) {
224-
val = ch - '0';
225-
return true;
226197
}
227-
ch = char.ToUpper(ch);
228-
if (ch >= 'A' && ch <= 'F') {
229-
val = ch - 'A' + 10;
230-
return true;
231-
}
232-
233-
val = 0;
234-
return false;
235198
}
199+
private static byte[] _replacementMarker;
236200

237201
public static PythonTuple/*!*/ escape_encode([BytesConversion]IList<byte> text, string errors = "strict") {
238202
StringBuilder res = new StringBuilder();

Src/IronPython/Runtime/LiteralParser.cs

Lines changed: 25 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -234,22 +234,24 @@ private static void StringBuilderInit<T>(ref StringBuilder sb, T[] data, int sta
234234
}
235235
}
236236

237-
internal static List<byte> ParseBytes(char[] text, int start, int length, bool isRaw, bool normalizeLineEndings) {
238-
Debug.Assert(text != null);
239-
Debug.Assert(start + length <= text.Length);
237+
internal delegate IReadOnlyList<byte> ParseBytesErrorHandler<T>(IList<T> data, int start, int end);
238+
239+
internal static List<byte> ParseBytes<T>(IList<T> data, int start, int length, bool isRaw, bool normalizeLineEndings, ParseBytesErrorHandler<T> errorHandler = null) where T : IConvertible {
240+
Debug.Assert(data != null);
241+
Debug.Assert(start + length <= data.Count);
240242

241243
List<byte> buf = new List<byte>(length);
242244

243245
int i = start;
244246
int l = start + length;
245247
int val;
246248
while (i < l) {
247-
char ch = text[i++];
249+
char ch = data[i++].ToChar(null);
248250
if (!isRaw && ch == '\\') {
249251
if (i >= l) {
250252
throw PythonOps.ValueError("Trailing \\ in string");
251253
}
252-
ch = text[i++];
254+
ch = data[i++].ToChar(null);
253255
switch (ch) {
254256
case 'a': buf.Add((byte)'\a'); continue;
255257
case 'b': buf.Add((byte)'\b'); continue;
@@ -265,16 +267,24 @@ internal static List<byte> ParseBytes(char[] text, int start, int length, bool i
265267
case '\r':
266268
if (!normalizeLineEndings) {
267269
goto default;
268-
} else if (i < l && text[i] == '\n') {
270+
} else if (i < l && data[i].ToChar(null) == '\n') {
269271
i++;
270272
}
271273
continue;
272274
case 'x': //hex
273-
if (!TryParseInt(text, i, 2, 16, out val, out int _)) {
274-
throw PythonOps.ValueError("invalid \\x escape at position {0}", i - start - 2);
275+
if (!TryParseInt(data, i, 2, 16, out val, out int consumed)) {
276+
int pos = i - start - 2;
277+
if (errorHandler == null) {
278+
throw PythonOps.ValueError("invalid \\x escape at position {0}", pos);
279+
}
280+
var substitute = errorHandler(data, pos, pos + consumed);
281+
if (substitute != null) {
282+
buf.AddRange(substitute);
283+
}
284+
} else {
285+
buf.Add((byte)val);
275286
}
276-
buf.Add((byte)val);
277-
i += 2;
287+
i += consumed;
278288
continue;
279289
case '0':
280290
case '1':
@@ -285,10 +295,10 @@ internal static List<byte> ParseBytes(char[] text, int start, int length, bool i
285295
case '6':
286296
case '7': {
287297
val = ch - '0';
288-
if (i < l && HexValue(text[i], out int onechar) && onechar < 8) {
298+
if (i < l && HexValue(data[i].ToChar(null), out int onechar) && onechar < 8) {
289299
val = val * 8 + onechar;
290300
i++;
291-
if (i < l && HexValue(text[i], out onechar) && onechar < 8) {
301+
if (i < l && HexValue(data[i].ToChar(null), out onechar) && onechar < 8) {
292302
val = val * 8 + onechar;
293303
i++;
294304
}
@@ -304,7 +314,7 @@ internal static List<byte> ParseBytes(char[] text, int start, int length, bool i
304314
}
305315
} else if (ch == '\r' && normalizeLineEndings) {
306316
// normalize line endings
307-
if (i < l && text[i] == '\n') {
317+
if (i < l && data[i].ToChar(null) == '\n') {
308318
i++;
309319
}
310320
buf.Add((byte)'\n');
@@ -389,14 +399,10 @@ private static bool ParseInt(string text, int b, out int ret) {
389399
return true;
390400
}
391401

392-
private static bool TryParseInt<T>(T[] text, int start, int length, int b, out int value, out int consumed) where T : IConvertible {
402+
private static bool TryParseInt<T>(IList<T> text, int start, int length, int b, out int value, out int consumed) where T : IConvertible {
393403
value = 0;
394-
if (start + length > text.Length) {
395-
consumed = 0;
396-
return false;
397-
}
398404
for (int i = start, end = start + length; i < end; i++) {
399-
if (HexValue(text[i].ToChar(null), out int onechar) && onechar < b) {
405+
if (i < text.Count && HexValue(text[i].ToChar(null), out int onechar) && onechar < b) {
400406
value = value * b + onechar;
401407
} else {
402408
consumed = i - start;

Src/IronPython/Runtime/Operations/StringOps.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1799,7 +1799,7 @@ internal static Bytes RawEncode(CodeContext/*!*/ context, string s, string encod
17991799
}
18001800

18011801
internal static Bytes DoEncodeUtf8(CodeContext context, string s)
1802-
=> RawEncode(context, s, "utf-8", "strict");
1802+
=> DoEncode(context, s, "strict", "utf-8", Encoding.UTF8, includePreamble: false);
18031803

18041804
internal static Bytes DoEncode(CodeContext context, string s, string errors, string encoding, Encoding e, bool includePreamble) {
18051805
#if FEATURE_ENCODING

Tests/modules/io_related/test_codecs.py

Lines changed: 48 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,8 @@
2626

2727
class CodecTest(IronPythonTestCase):
2828
def test_escape_decode(self):
29-
#sanity checks
29+
# escape_decode decodes bytes to bytes, but when given a string it encodes it first with UTF-8
30+
self.assertEqual(codecs.escape_decode("abc€ghi🐍xyz"), codecs.escape_decode(b'abc\xe2\x82\xacghi\xf0\x9f\x90\x8dxyz'))
3031

3132
value, length = codecs.escape_decode("ab\a\b\t\n\r\f\vba")
3233
self.assertEqual(value, b'ab\x07\x08\t\n\r\x0c\x0bba')
@@ -36,22 +37,64 @@ def test_escape_decode(self):
3637
self.assertEqual(value, b'\x07')
3738
self.assertEqual(length, 2)
3839

39-
value, length = codecs.escape_decode("ab\a\b\t\n\r\f\vbaab\\a\\b\\t\\n\\r\\f\\vbaab\\\a\\\b\\\t\\\n\\\r\\\f\\\vba")
40-
self.assertEqual(value, b'ab\x07\x08\t\n\r\x0c\x0bbaab\x07\x08\t\n\r\x0c\x0bbaab\\\x07\\\x08\\\t\\\r\\\x0c\\\x0bba')
41-
self.assertEqual(length, 47)
40+
value, length = codecs.escape_decode("ab\a\b\t\n\r\f\v\'\"baab\\a\\b\\t\\n\\r\\f\\v\\'\\\"baab\\\a\\\b\\\t\\\n\\\r\\\f\\\vba")
41+
self.assertEqual(value, b'ab\x07\x08\t\n\r\x0c\x0b\'\"baab\x07\x08\t\n\r\x0c\x0b\'\"baab\\\x07\\\x08\\\t\\\r\\\x0c\\\x0bba')
42+
self.assertEqual(length, 53)
4243

4344
value, length = codecs.escape_decode("\\\a")
4445
self.assertEqual(value, b'\\\x07')
4546
self.assertEqual(length, 2)
4647

47-
self.assertEqual(b"abc", codecs.escape_decode("abc", None)[0])
48+
value, length = codecs.escape_decode("\\07")
49+
self.assertEqual(value, b'\x07')
50+
self.assertEqual(length, 3)
51+
52+
value, length = codecs.escape_decode("\\047")
53+
self.assertEqual(value, b"'")
54+
self.assertEqual(length, 4)
55+
56+
self.assertEquals(codecs.escape_decode(b"ab\nc"), (b"ab\nc", 4))
57+
self.assertEquals(codecs.escape_decode(b"ab\rc"), (b"ab\rc", 4))
58+
self.assertEquals(codecs.escape_decode(b"ab\r\nc"), (b"ab\r\nc", 5))
59+
60+
self.assertEquals(codecs.escape_decode(b"ab\\\nc"), (b"abc", 5))
61+
self.assertEquals(codecs.escape_decode(b"ab\\\rc"), (b"ab\\\rc", 5))
62+
self.assertEquals(codecs.escape_decode(b"ab\\\r\\\nc"), (b"ab\\\rc", 7))
63+
64+
def test_escape_decode_errors(self):
65+
self.assertEqual(codecs.escape_decode("abc", None), (b"abc", 3))
66+
4867
self.assertEqual(b"?", codecs.escape_decode("\\x", 'replace')[0])
4968
self.assertEqual(b"?", codecs.escape_decode("\\x2", 'replace')[0])
5069
self.assertEqual(b"?I", codecs.escape_decode("\\xI", 'replace')[0])
5170
self.assertEqual(b"?II", codecs.escape_decode("\\xII", 'replace')[0])
5271
self.assertEqual(b"?I", codecs.escape_decode("\\x1I", 'replace')[0])
5372
self.assertEqual(b"?I1", codecs.escape_decode("\\xI1", 'replace')[0])
5473

74+
self.assertEqual(b"abc", codecs.escape_decode("abc\\x", 'ignore')[0])
75+
self.assertEqual(b"abc", codecs.escape_decode("abc\\x2", 'ignore')[0])
76+
self.assertEqual(b"abcI", codecs.escape_decode("abc\\xI", 'ignore')[0])
77+
self.assertEqual(b"abcII", codecs.escape_decode("abc\\xII", 'ignore')[0])
78+
self.assertEqual(b"abcI", codecs.escape_decode("abc\\x1I", 'ignore')[0])
79+
self.assertEqual(b"abcI1", codecs.escape_decode("abc\\xI1", 'ignore')[0])
80+
81+
self.assertRaisesRegex(ValueError, r"Trailing \\ in string", codecs.escape_decode, b"\\", None)
82+
self.assertRaisesRegex(ValueError, r"Trailing \\ in string", codecs.escape_decode, b"\\", 'strict')
83+
self.assertRaisesRegex(ValueError, r"Trailing \\ in string", codecs.escape_decode, b"\\", 'replace')
84+
self.assertRaisesRegex(ValueError, r"Trailing \\ in string", codecs.escape_decode, b"\\", 'ignore')
85+
self.assertRaisesRegex(ValueError, r"Trailing \\ in string", codecs.escape_decode, b"\\", 'non-existent')
86+
87+
self.assertRaisesRegex(ValueError, r"invalid \\x escape at position 3", codecs.escape_decode, b"abc\\xii")
88+
self.assertRaisesRegex(ValueError, r"invalid \\x escape at position 3", codecs.escape_decode, b"abc\\x1i")
89+
self.assertRaisesRegex(ValueError, r"invalid \\x escape at position 3", codecs.escape_decode, b"abc\\xii", 'strict')
90+
self.assertRaisesRegex(ValueError, r"invalid \\x escape at position 3", codecs.escape_decode, b"abc\\x1i", 'strict')
91+
self.assertRaisesRegex(ValueError, r"invalid \\x escape at position 3", codecs.escape_decode, b"abc\\xii", None)
92+
self.assertRaisesRegex(ValueError, r"invalid \\x escape at position 3", codecs.escape_decode, b"abc\\x1i", None)
93+
94+
for errors in ['backslashreplace', 'xmlcharrefreplace', 'namereplace', 'surrogateescape', 'surrogatepass', 'non-existent', '']:
95+
self.assertRaisesRegex(ValueError, "decoding error; unknown error handling code: " + errors, codecs.escape_decode, b"abc\\xii", errors)
96+
self.assertRaisesRegex(ValueError, "decoding error; unknown error handling code: " + errors, codecs.escape_decode, b"abc\\x1i", errors)
97+
5598
def test_escape_encode(self):
5699
#sanity checks
57100
value, length = codecs.escape_encode(b"abba")

Tests/test_codecs_stdlib.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,8 +59,8 @@ def load_tests(loader, standard_tests, pattern):
5959
suite.addTest(test.test_codecs.CodecsModuleTest('test_undefined'))
6060
suite.addTest(test.test_codecs.EncodedFileTest('test_basic'))
6161
suite.addTest(test.test_codecs.EscapeDecodeTest('test_empty'))
62-
#suite.addTest(test.test_codecs.EscapeDecodeTest('test_errors')) # unknown error handling code: ignore
63-
#suite.addTest(test.test_codecs.EscapeDecodeTest('test_escape')) # (b'[\\"]', 4) != (b'["]', 4)
62+
suite.addTest(test.test_codecs.EscapeDecodeTest('test_errors'))
63+
suite.addTest(test.test_codecs.EscapeDecodeTest('test_escape'))
6464
suite.addTest(test.test_codecs.EscapeDecodeTest('test_raw'))
6565
suite.addTest(test.test_codecs.ExceptionChainingTest('test_codec_lookup_failure_not_wrapped'))
6666
suite.addTest(test.test_codecs.ExceptionChainingTest('test_init_override_is_not_wrapped'))

0 commit comments

Comments
 (0)